In [1]:
import librosa
import os
import pandas as pd
import numpy as np
import traceback

In [30]:
# Root folder that holds every music-related sub-folder used by this notebook.
# (relative alternative: './music/')
_music_root_dir = 'D:/PostGraduateStudy/DataScience/MusicRecommander/music/'

# Folder containing the downloaded music files.
music_files_dir = _music_root_dir + 'MusicDownload/'
# Folder where the extracted music feature files are saved.
features_files_dir = _music_root_dir + 'MusicFeatures/'
# Folder holding the files with the other information about the music.
music_info_dir = _music_root_dir + 'MusicInfo/'

# Separator placed between a directory and a file name when building paths.
separator = '/'

In [31]:
# Collect the names of all files below a directory, sub-directories included.
def scan_files(file_dir):
    """Return the full names of all files found under ``file_dir``.

    The tree is traversed with ``os.walk``; each entry of the returned list
    is ``dirpath + separator + filename``, where ``separator`` is the
    module-level path separator.
    """
    return [
        dirpath + separator + filename
        for dirpath, _dirnames, filenames in os.walk(file_dir)
        for filename in filenames
    ]

In [18]:
#change all music files' name from [title+'-'+youtubeID+'.mp3'] to [youtubeID+'.mp3']
def change_names(file_dir):
    """Rename every file under ``file_dir`` to the last 15 characters of its name.

    A downloaded file is named ``title + '-' + youtubeID + '.mp3'``; the last
    15 characters are the 11-character youtube ID plus the ``'.mp3'``
    extension, e.g. ``'FQ2OK8UkBu8.mp3'``.

    The 15-character slice is taken from the file name only, never from the
    full path, so a file whose name is shorter than 15 characters keeps its
    name instead of having part of its directory spliced into it. Files that
    already carry their final name are left untouched, which makes the
    function safe to run more than once.

    file_dir: directory that is scanned (including sub-directories) by
              ``scan_files``.
    Raises whatever ``os.rename`` raises, e.g. when the target cannot be
    written.
    """
    for old_name_with_full_dir in scan_files(file_dir):
        # scan_files always puts `separator` directly in front of the file
        # name, so the last separator marks where the file name starts.
        index_name_start = old_name_with_full_dir.rfind(separator) + 1
        dir_part = old_name_with_full_dir[:index_name_start]
        file_name = old_name_with_full_dir[index_name_start:]

        new_name_with_full_dir = dir_part + file_name[-15:]
        if new_name_with_full_dir == old_name_with_full_dir:
            # Already renamed (or name too short to shorten): nothing to do.
            continue
        os.rename(old_name_with_full_dir, new_name_with_full_dir)

### 1. Change all music files' names from [title+'-'+youtubeID+'.mp3'] to [youtubeID+'.mp3']
Because the downloaded music files' names include the title, like 'China Crisis - Wishful Thinking (7'' Single Edit) Music Video-FQ2OK8UkBu8.mp3',
all music files' names will be changed to the youtubeID, like 'FQ2OK8UkBu8.mp3'.

In [21]:
# Renames the files found under music_files_dir on disk (via os.rename).
change_names(music_files_dir)




### 2. Find all music files' names, including all subdirectories.

In [34]:
# Build a table with one row per file found under music_files_dir.
music_files_name_with_full_dir_list = scan_files(music_files_dir)
df_music_files = pd.DataFrame({'musicFileNameWithDir':music_files_name_with_full_dir_list})
# The last 15 characters of the path are taken as the file name
# (youtubeID + '.mp3'); dropping the final 4 characters leaves the youtubeID.
df_music_files['musicFileName'] = df_music_files['musicFileNameWithDir'].str.slice(start=-15)
df_music_files['youtubeID'] = df_music_files['musicFileNameWithDir'].str.slice(start=-15, stop=-4)

# Parenthesised form: prints the same on Python 2 and is valid on Python 3.
print(len(df_music_files))
df_music_files.head()

1354


Unnamed: 0,musicFileNameWithDir,musicFileName,youtubeID
0,D:/PostGraduateStudy/DataScience/MusicRecomman...,-1VISLfRDfg.mp3,-1VISLfRDfg
1,D:/PostGraduateStudy/DataScience/MusicRecomman...,-ubLj8mLqOc.mp3,-ubLj8mLqOc
2,D:/PostGraduateStudy/DataScience/MusicRecomman...,01yUzXQctcM.mp3,01yUzXQctcM
3,D:/PostGraduateStudy/DataScience/MusicRecomman...,1SFAIryH0Hs.mp3,1SFAIryH0Hs
4,D:/PostGraduateStudy/DataScience/MusicRecomman...,2a5ZdbZ4l70.mp3,2a5ZdbZ4l70


### 3. Use librosa to extract [melspec] features for each music mp3 file, and save the feature data to the file [youtubeID+'.npy']
For each music file, features are extracted from consecutive 20-second segments, excluding the first 10 seconds and the last 10 seconds of the music, because there is normally silence in those periods and the feature data would be all zeros for a silent period.

In [33]:
# Full names of the music files whose features could not be extracted.
error_file_names = []
#for each mp3 file, extract melspec features
for i in range(0, len(df_music_files)):
    music_row = df_music_files.iloc[i]
    # Bound before the try block so the except handler always reports the
    # file of the current iteration.
    music_file_name_with_full_dir = music_row['musicFileNameWithDir']
    #the melspec features file's name with full path in which melspec feature data are saved.
    features_file_name_with_full_dir = features_files_dir + music_row['youtubeID'] + '.npy'

    try:
        #if the melspec feature file already exists, skip this music
        if os.path.exists(features_file_name_with_full_dir):
            continue

        #get the duration (in seconds) of the audio file
        duration = librosa.get_duration(filename=music_file_name_with_full_dir)
        #don't get music features of the first 10 seconds and last 10 seconds
        duration = duration - 20
        #features are extracted per 20-second segment: number of whole segments
        times = int(duration//20)
        # NOTE: for music shorter than 40 seconds `times` is <= 0, the loop
        # below does not run and an empty array is saved.

        melspec_list = []
        for j in range(0, times):
            # Segment j starts after the skipped first 10 seconds.
            y, sr = librosa.load(music_file_name_with_full_dir, sr=11025, offset=(10+j*20), duration=20)
            melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=512, n_mels=128)
            melspec_list.append(melspec)

        #stack the per-segment features, giving one three-dimensional np.array per music
        x = np.array(melspec_list)

        np.save(features_file_name_with_full_dir, x)

    except Exception:
        # Best effort: record the failing file, report it and go on with the next one.
        error_file_names.append(music_file_name_with_full_dir)
        print(music_file_name_with_full_dir)
        print("Can't extract features! %s\n" % traceback.format_exc())

# Build the error table once instead of growing it row by row.
error_count = len(error_file_names)
df_error = pd.DataFrame({'fileName': error_file_names})

#if there is exception when extracting features, write to error.csv
if len(df_error) > 0:
    df_error.to_csv(music_info_dir + 'Step3_extract_features_error.csv')

In [35]:
# Check that a saved music features file can be loaded back, and show its shape.
sample_features_file = features_files_dir + '2a5ZdbZ4l70.npy'
y = np.load(sample_features_file)
y.shape

(17L, 128L, 431L)

### 4. Integrate all music information together, and create a csv file for data training in the next step
(1) Get the file names and youtube IDs of all extracted music features files.   
(2) Combine them with the csv files that are created in [Step1-Get music youtube url from www.last.fm] and [Step2-Download music from Youtube]

In [37]:
# Step 4: integrate the music features files, the music mp3 files and the
# youtube information of the music into one table.

# One row per extracted features file; the last 15 characters of the path are
# taken as the file name and, without the final 4 characters, as the youtube ID.
music_features_files_name_with_full_dir_list = scan_files(features_files_dir)
df_music_features_files = pd.DataFrame(
    {'musicFeaturesFileNameWithDir': music_features_files_name_with_full_dir_list}
)
features_paths = df_music_features_files['musicFeaturesFileNameWithDir']
df_music_features_files['musicFeaturesFileName'] = features_paths.str[-15:]
df_music_features_files['youtubeID'] = features_paths.str[-15:-4]

# File in which the youtube information of the music is stored.
df_music_youtube_info = pd.read_csv(music_info_dir + 'Step2_artists_with_youtube_info.csv')

# File in which the style and styleID of the music are stored; the youtube ID
# is taken as the last 11 characters of the youtube url.
df_music_style = pd.read_csv(music_info_dir + 'Step1_artists_with_youtube_url.csv')
df_music_style['youtubeID'] = df_music_style['youtube_url'].str[-11:]

# Starting from the features files, left-join on the youtube ID:
#   1. the music files' names (mp3 files),
#   2. the youtube info (keyed by 'display_id' in that file),
#   3. the style information.
df_integrated_music_info = (
    df_music_features_files
    .join(df_music_files.set_index('youtubeID'), how='left', on='youtubeID')
    .join(df_music_youtube_info.set_index('display_id'), how='left', on='youtubeID')
    .join(df_music_style.set_index('youtubeID'), how='left', on='youtubeID')
)

df_integrated_music_info.to_csv(music_info_dir + 'Step3_integrated_music_info.csv')
df_integrated_music_info.head()

Unnamed: 0,musicFeaturesFileNameWithDir,musicFeaturesFileName,youtubeID,musicFileNameWithDir,musicFileName,file_name,title,upload_date,view_count,like_count,dislike_count,average_rating,creator,artist,artistID,style,styleID,name,youtube_url
0,D:/PostGraduateStudy/DataScience/MusicRecomman...,-1VISLfRDfg.npy,-1VISLfRDfg,D:/PostGraduateStudy/DataScience/MusicRecomman...,-1VISLfRDfg.mp3,Lurker Of Chalice - Piercing Where They Might-...,Lurker Of Chalice - Piercing Where They Might,20081120.0,79880.0,890.0,15.0,4.933702,,,5866,black,6,Lurker of Chalice,https://www.youtube.com/watch?v=-1VISLfRDfg
1,D:/PostGraduateStudy/DataScience/MusicRecomman...,-49noOAFsG8.npy,-49noOAFsG8,D:/PostGraduateStudy/DataScience/MusicRecomman...,-49noOAFsG8.mp3,Saxon - Princess of the Night--49noOAFsG8.mp3,Saxon - Princess of the Night,20071128.0,6215492.0,29076.0,669.0,4.910096,Saxon,Saxon,3476,metal,1,Saxon,https://www.youtube.com/watch?v=-49noOAFsG8
2,D:/PostGraduateStudy/DataScience/MusicRecomman...,-4k9ovLfKV8.npy,-4k9ovLfKV8,D:/PostGraduateStudy/DataScience/MusicRecomman...,-4k9ovLfKV8.mp3,Jennette McCurdy - So close lyrics--4k9ovLfKV8...,Jennette McCurdy - So close lyrics,20140121.0,33503.0,375.0,13.0,4.865979,Jennette McCurdy,Jennette McCurdy,1683,country,7,Jennette McCurdy,https://www.youtube.com/watch?v=-4k9ovLfKV8
3,D:/PostGraduateStudy/DataScience/MusicRecomman...,-59jGD4WrmE.npy,-59jGD4WrmE,D:/PostGraduateStudy/DataScience/MusicRecomman...,-59jGD4WrmE.mp3,"Lil Wayne, Wiz Khalifa & Imagine Dragons w/ Lo...","Lil Wayne, Wiz Khalifa & Imagine Dragons w/ Lo...",20160624.0,553746262.0,3867917.0,148948.0,4.851403,Imagine Dragons,Imagine Dragons,527,hop,5,Lil' Wayne,https://www.youtube.com/watch?v=-59jGD4WrmE
4,D:/PostGraduateStudy/DataScience/MusicRecomman...,-5Ri8GY57SI.npy,-5Ri8GY57SI,D:/PostGraduateStudy/DataScience/MusicRecomman...,-5Ri8GY57SI.mp3,Kellie Pickler - Red High Heels--5Ri8GY57SI.mp3,Kellie Pickler - Red High Heels,20091003.0,18045596.0,49566.0,2427.0,4.813365,Kellie Pickler,Kellie Pickler,5266,country,7,Kellie Pickler,https://www.youtube.com/watch?v=-5Ri8GY57SI
