In [1]:
from pytube import YouTube
from pytube import Playlist
from pydub import AudioSegment
from bs4 import BeautifulSoup
import os
from glob import glob
import openpyxl
import numpy as np
import pandas as pd
import json
import librosa
import math
from sklearn.preprocessing import StandardScaler

### 下載播放清單中的歌曲，並列出 Title、Duration、URL 與擷取到的音訊特徵

In [2]:
def _collect_video_urls(youtube_lists_url):
    """Return the video URLs of every playlist in ``youtube_lists_url``, in playlist order."""
    video_urls = []
    for playlist_url in youtube_lists_url:
        playlist = Playlist(playlist_url)
        # Regex override carried over unchanged from the original notebook.
        # NOTE(review): presumably a workaround for pytube's playlist parsing -- confirm it is still needed.
        playlist._video_regex = r"\"url\":\"(/watch\?v=[\w-]*)"
        video_urls += playlist.video_urls
    return video_urls


def _extract_audio_features(wav_file, sr, n_fft, hop_len, n_mels, n_chroma):
    """Load ``wav_file`` with librosa and return a dict of per-track mean features.

    Keys are the seven scalar ``*_mean`` features plus one ``mels_{i}_mean`` per
    mel band returned by librosa and one ``chroma_{i}_mean`` per chroma bin.
    """
    signal, _ = librosa.load(wav_file, sr=sr)

    def frame_mean(values):
        # Collapse a per-frame feature array into a single average value.
        return values.ravel().mean()

    features = {
        # Spectral centroid
        'centroid_mean': frame_mean(
            librosa.feature.spectral_centroid(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_len)),
        # Spectral roll-off
        'roloff_mean': frame_mean(
            librosa.feature.spectral_rolloff(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_len)),
        # Onset strength (stored under the "flux" column name)
        'flux_mean': frame_mean(librosa.onset.onset_strength(y=signal, sr=sr)),
        # Root-mean-square energy
        'rmse_mean': frame_mean(
            librosa.feature.rms(y=signal, frame_length=n_fft, hop_length=hop_len)),
        # Zero-crossing rate
        'zcr_mean': frame_mean(
            librosa.feature.zero_crossing_rate(y=signal, frame_length=n_fft, hop_length=hop_len)),
        # Spectral bandwidth
        'bandwidth_mean': frame_mean(
            librosa.feature.spectral_bandwidth(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_len)),
        # Spectral flatness
        'flatness_mean': frame_mean(
            librosa.feature.spectral_flatness(y=signal, n_fft=n_fft, hop_length=hop_len)),
    }

    # Mel spectrogram converted to dB, averaged per mel band.
    mels = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=n_fft, n_mels=n_mels, hop_length=hop_len))
    for idx, band in enumerate(mels):
        features[f"mels_{idx}_mean"] = frame_mean(band)

    # Chromagram averaged per chroma bin. n_fft / n_chroma are passed explicitly
    # so the number of bins always matches the columns prepared by the caller.
    chroma = librosa.feature.chroma_stft(
        y=signal, sr=sr, n_fft=n_fft, hop_length=hop_len, n_chroma=n_chroma)
    for idx, chroma_bin in enumerate(chroma):
        features[f"chroma_{idx}_mean"] = frame_mean(chroma_bin)

    return features


def DownloadAndExtracFeature(youtube_lists_url, mp4_path, wav_path, csv_path):
    """Download every video of the given playlists, convert it to WAV and extract audio features.

    For each video the highest-resolution stream is downloaded to
    ``{mp4_path}/{song_id}.mp4``, converted to ``{wav_path}/{song_id}.wav`` and
    the downloaded MP4 is then removed. ``song_id`` only advances when a song was
    fully processed, so a failed song's files are overwritten by the next one.
    A failure in any stage is printed and that song is skipped.

    Args:
        youtube_lists_url: iterable of YouTube playlist URLs.
        mp4_path: directory used for the temporary MP4 downloads.
        wav_path: directory where the converted WAV files are written.
        csv_path: accepted for interface compatibility; this function does not
            read it or write to it.

    Returns:
        dict mapping column name -> list with one entry per successfully
        processed song. Columns are ``title``, ``duration``, ``url``, the seven
        ``*_mean`` scalar features, ``mels_{0..19}_mean`` and
        ``chroma_{0..11}_mean``. All lists always have the same length.
    """
    # Analysis parameters
    SR = 22050
    N_MELS = 20
    N_FFT = 2048
    HOP_LEN = 512
    N_CHROMA = 12

    video_urls = _collect_video_urls(youtube_lists_url)

    # One list per output column (title, duration, URL and extracted features).
    data = {
        'title': [],
        'duration': [],
        'url': [],
        'centroid_mean': [],
        'roloff_mean': [],
        'flux_mean': [],
        'rmse_mean': [],
        'zcr_mean': [],
        'bandwidth_mean': [],
        'flatness_mean': [],
    }
    data.update({f'mels_{i}_mean': [] for i in range(N_MELS)})
    data.update({f'chroma_{i}_mean': [] for i in range(N_CHROMA)})

    if not video_urls:
        return data

    # Make sure both working directories exist before writing into them.
    os.makedirs(mp4_path, exist_ok=True)
    os.makedirs(wav_path, exist_ok=True)

    song_id = 0

    for url in video_urls:
        file_name = f"{song_id}.mp4"
        video_file = os.path.join(mp4_path, file_name)
        wav_file = os.path.join(wav_path, f"{song_id}.wav")

        # Stage 1: download the highest-resolution stream.
        try:
            youtube = YouTube(url)
            video_stream = youtube.streams.get_highest_resolution()
            video_stream.download(filename=file_name, output_path=mp4_path)
            title = youtube.title
            print(f"成功下載: {file_name}:{title}")
        except Exception as e:
            print(f"下載失败: {str(e)}")
            continue

        # Stage 2: convert MP4 -> WAV, then drop only the file we downloaded
        # (other files that happen to live in mp4_path are left alone).
        try:
            try:
                mp4_audio = AudioSegment.from_file(video_file, "mp4")
                mp4_audio.export(wav_file, format="wav")
            finally:
                if os.path.exists(video_file):
                    os.remove(video_file)
            print(f"成功轉換: {song_id}.wav")
        except Exception as e:
            print(f"轉換失败: {str(e)}")
            continue

        # Stage 3: build the complete row first; nothing is appended to `data`
        # unless every value is available, so the columns can never get out of sync.
        try:
            row = {'title': title, 'duration': youtube.length, 'url': url}
            row.update(_extract_audio_features(wav_file, SR, N_FFT, HOP_LEN, N_MELS, N_CHROMA))
            if row.keys() != data.keys():
                raise ValueError(
                    f"unexpected feature columns: {sorted(set(row) ^ set(data))}")
        except Exception as e:
            print(f"取得特徵失敗: {str(e)}")
            continue

        for key, column in data.items():
            column.append(row[key])

        print(f"成功取得特徵: {song_id}.wav")
        song_id += 1

    return data

In [6]:
# --- Run configuration -------------------------------------------------------

# Playlists to process in this run.
youtube_lists_url = ["https://youtube.com/playlist?list=PLQdn7YisXz3NBAGjVjgtKlnPLj5TaO975"]

# Larger set of playlists kept for reference (currently disabled):
# youtube_lists_url = ["https://youtube.com/playlist?list=PLHh4zQmRdiTKHuwFPtxwp5sUcY5tV04e3",
#                      "https://youtube.com/playlist?list=PLHeL32XSCxMwKZT60Q2EtQMvq2Ock4MxT",
#                      "https://youtube.com/playlist?list=PL58yl8ZTfuDqHWTj-VaIoBiNDbKxWB-V4",
#                      "https://youtube.com/playlist?list=PLOVz_6E6CsBf1dbv4vwkzPxeShJ8uQgCR",
#                      "https://youtube.com/playlist?list=PLrTJ8GEaDuZZWWng0XnTPdvWRpQHv6U2w",
#                      "https://youtube.com/playlist?list=PLMhqKsyei5dpED9kCu33f3SZmJKX7RBtk",
#                      "https://youtube.com/playlist?list=PL0p51VbAATAd04aLOKapIkgOyCYv9f14w",
#                      "https://youtube.com/playlist?list=PLpPrPUzzk7gaFxIGV3Xqcx21kC25M_MPt",
#                      "https://youtube.com/playlist?list=PLfQvHGiud6RrnCjiBsw5BVbVRPlBnWJWv",
#                      "https://youtube.com/playlist?list=PL4cJJVRBlBo3t8Fasf-6RMICtTl64PJuW",
#                      "https://youtube.com/playlist?list=PLnvkw2ct2N15zH3F3ZcvsZOajsnAyutem",
#                      "https://youtube.com/playlist?list=PLnXc-kZVDOPo7S5PtFcuvJ7CxqC4Ol-5W",
#                      "https://youtube.com/playlist?list=PLYTFtaYC02-z_lHBSxaoeNKm_64TVDAHf",
#                      "https://youtube.com/playlist?list=PLvtFwLg0kO-95uJ--lmezwksA3HID5Wir",
#                      "https://youtube.com/playlist?list=PLMC9KNkIncKtPzgY-5rmhvj7fax8fdxoj",
#                      "https://youtube.com/playlist?list=PLomnF-6GejKEfXSEvgnjJtFB4C7P0e9kj",
#                      "https://youtube.com/playlist?list=PLTVRe9aECZ6eFMc7LUtXta79M-kq8hCM8",
#                      "https://youtube.com/playlist?list=PL64G6j8ePNureM8YCKy5nRFyzYf8I2noy",
#                      "https://youtube.com/playlist?list=PLLCY4xuRnY_dNEf-2ilAVORZep_hJkzYl",
#                      "https://youtube.com/playlist?list=PL402o3EF36WFYAkkbooB3rlLrka9zQWjB",
#                      "https://youtube.com/playlist?list=PL_k9bFPONrNeOiaJRDK9qSSAbjXaEeoPS",
#                      "https://youtube.com/playlist?list=PLAG4o1kV1ZE-WhdTfjAllCpenPWjKx71O",
#                      "https://youtube.com/playlist?list=PL3mpgmWgY2iDwH2s8NnqudTMdUkMKGcQi",
#                      "https://youtube.com/playlist?list=PL5a6hmJETjjZWcknCDwH-AvqriMK7jzH8"]

# Directory for the downloaded MP4 files.
mp4_path = "/Users/phoebe/DJ-Box_model/dataset/mp4_files"
# Directory for the converted WAV files.
wav_path = "/Users/phoebe/DJ-Box_model/dataset/music_dataset"
# CSV file the feature table is saved to by a later cell.
csv_path = '/Users/phoebe/DJ-Box_model/input/songs_feature_v3.csv'

# Download each playlist entry and collect its features.
data = DownloadAndExtracFeature(
    youtube_lists_url=youtube_lists_url,
    mp4_path=mp4_path,
    wav_path=wav_path,
    csv_path=csv_path,
)

成功下載: 0.mp4:Gangnam Style
成功轉換: 0.wav
成功取得特徵: 0.wav
成功下載: 1.mp4:Uptown Funk (Official Video)
成功轉換: 1.wav
成功取得特徵: 1.wav
成功下載: 2.mp4:Work (Official Video)
成功轉換: 2.wav
成功取得特徵: 2.wav
成功下載: 3.mp4:Without Me
成功轉換: 3.wav
成功取得特徵: 3.wav
成功下載: 4.mp4:As It Was
成功轉換: 4.wav
成功取得特徵: 4.wav
成功下載: 5.mp4:Scary Slope
成功轉換: 5.wav
成功取得特徵: 5.wav
成功下載: 6.mp4:Dance The Night (From Barbie The Album)
成功轉換: 6.wav
成功取得特徵: 6.wav
成功下載: 7.mp4:Rush (Official Music Video)
成功轉換: 7.wav
成功取得特徵: 7.wav
成功下載: 8.mp4:Love On You (Dirty)
成功轉換: 8.wav
成功取得特徵: 8.wav
成功下載: 9.mp4:Starboy
成功轉換: 9.wav
成功取得特徵: 9.wav
成功下載: 10.mp4:Dusa
成功轉換: 10.wav
成功取得特徵: 10.wav
成功下載: 11.mp4:Lean On
成功轉換: 11.wav
成功取得特徵: 11.wav
成功下載: 12.mp4:Believer (Official Music Video)
成功轉換: 12.wav
成功取得特徵: 12.wav
成功下載: 13.mp4:Roar
成功轉換: 13.wav
成功取得特徵: 13.wav
成功下載: 14.mp4:Wan Billz - What’s The Difference Ft. Internet Money (Official Video)
成功轉換: 14.wav
成功取得特徵: 14.wav
成功下載: 15.mp4:Numb
成功轉換: 15.wav
成功取得特徵: 15.wav
成功下載: 16.mp4:Let Me Love You
成功轉換: 16.wav
成功取得特徵: 16.wa

成功下載: 127.mp4:I'm an Albatraoz (Audio)
成功轉換: 127.wav
成功取得特徵: 127.wav
成功下載: 128.mp4:High Hopes
成功轉換: 128.wav
成功取得特徵: 128.wav
成功下載: 129.mp4:Perfect Strangers
成功轉換: 129.wav
成功取得特徵: 129.wav
成功下載: 130.mp4:Don't Start Now
成功轉換: 130.wav
成功取得特徵: 130.wav
成功下載: 131.mp4:Solo
成功轉換: 131.wav
成功取得特徵: 131.wav
成功下載: 132.mp4:Toxic (Instrumental)
成功轉換: 132.wav
成功取得特徵: 132.wav
成功下載: 133.mp4:Mr. Saxobeat
成功轉換: 133.wav
成功取得特徵: 133.wav
成功下載: 134.mp4:Burn
成功轉換: 134.wav
成功取得特徵: 134.wav
成功下載: 135.mp4:Roses (Imanbek Remix)
成功轉換: 135.wav
成功取得特徵: 135.wav
成功下載: 136.mp4:One Kiss (Official Video)
成功轉換: 136.wav
成功取得特徵: 136.wav
成功下載: 137.mp4:Hung Up
成功轉換: 137.wav
成功取得特徵: 137.wav
成功下載: 138.mp4:Hey Mama
成功轉換: 138.wav
成功取得特徵: 138.wav
成功下載: 139.mp4:How Deep Is Your Love (Audio)
成功轉換: 139.wav
成功取得特徵: 139.wav
成功下載: 140.mp4:Single Ladies (Put a Ring on It) (Video Version)
成功轉換: 140.wav
成功取得特徵: 140.wav
成功下載: 141.mp4:International Love
成功轉換: 141.wav
成功取得特徵: 141.wav
成功下載: 142.mp4:Summer
成功轉換: 142.wav
成功取得特徵: 142.wav
成功下載: 143.mp

In [7]:
# Turn the collected feature columns into a table and write it out as CSV
df = pd.DataFrame.from_dict(data)
df.to_csv(path_or_buf=csv_path)

In [8]:
# Delete every file in the MP4 and WAV working directories.
# A single loop handles both folders; the builtin `dir` is no longer shadowed,
# and missing folders or sub-directories are skipped instead of raising.
for folder in (mp4_path, wav_path):
    if not os.path.isdir(folder):
        continue
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)