In [1]:
import pandas as pd
import youtube_dl
from ffprobe import FFProbe
import os
import traceback

### 1. Read the file created in [Step1-Get music youtube url from www.last.fm]

In [4]:
# Load the Step1 output and derive the video id, taken here as the last
# 11 characters of each youtube URL.
df_url = pd.read_csv(
    './music/MusicInfo/Step1_artists_with_youtube_url.csv'
).assign(youtubeID=lambda frame: frame['youtube_url'].str[-11:])
print(df_url.shape[0])
df_url.head()

1446


Unnamed: 0,artistID,style,styleID,name,youtube_url,youtubeID
0,52,hop,5,Morcheeba,https://www.youtube.com/watch?v=VnCS25z18pI,VnCS25z18pI
1,96,pop,2,Fleetwood Mac,https://www.youtube.com/watch?v=GzY140MnItA,GzY140MnItA
2,995,pop,2,China Crisis,https://www.youtube.com/watch?v=FQ2OK8UkBu8,FQ2OK8UkBu8
3,9322,punk,3,Sigue Sigue Sputnik,https://www.youtube.com/watch?v=V67OOERTOEo,V67OOERTOEo
4,99,rock,0,INXS,https://www.youtube.com/watch?v=Y2Csgu8Ya2o,Y2Csgu8Ya2o


### 2. Function [make_savepath] creates the style directory if it does not exist, and returns the music file's name with its full path.
Music files will be stored on disk in directories named after their music style, as shown below:  
./music/MusicDownload/**black**/a.mp3   
                ......    
./music/MusicDownload/**black**/l.mp3   
./music/MusicDownload/**country**/m.mp3    
                 ......   
./music/MusicDownload/**country**/s.mp3   
                 ......   
./music/MusicDownload/**rock**/x.mp3  
                 ......   
./music/MusicDownload/**rock**/z.mp3   

In [77]:
# Build the per-style target path, creating the style directory on first use.
def make_savepath(style, file_name):
    """Return the path of `file_name` inside the directory for `style`.

    The directory ./music/MusicDownload/<style> is created (including any
    missing parents) when it does not exist yet.

    Parameters
    ----------
    style : str
        Music style name; used as the sub-directory name.
    file_name : str
        Name of the music file.

    Returns
    -------
    str
        "./music/MusicDownload/<style>/<file_name>"
    """
    style_dir = "/".join(["./music/MusicDownload", style])
    already_there = os.path.exists(style_dir)
    if not already_there:
        os.makedirs(style_dir)
    return "/".join([style_dir, file_name])

### 3. Download music from youtube
(1) Use [youtube_dl] to download music from youtube.   
(2) Because the youtube URLs prepared in Step1 link to videos, not audio files, only the audio is extracted from youtube, and the extracted audio is converted to mp3 files by youtube_dl's FFmpegExtractAudio post-processor (which needs ffmpeg/[ffprobe] to be installed).


**<font color=red>!!! Caution:</font>** It will take more than 10 hours to execute the code below.

In [None]:
# youtube_dl options: pick the best audio stream of a single video and let the
# FFmpegExtractAudio post-processor convert it to mp3.
ydl_opts = {
    'format': 'bestaudio/best', # choice of quality
    'extractaudio' : True,      # only keep the audio
    'noplaylist' : True,       # only download single song, not playlist
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
}
ydl = youtube_dl.YoutubeDL(ydl_opts)

# Metadata fields copied from youtube_dl's info dict into the Step2 CSV.
# Not every video provides all of them, so they are read with .get() below
# and missing ones become empty cells instead of aborting the row.
info_fields = ['title', 'display_id', 'upload_date', 'view_count', 'like_count',
               'dislike_count', 'average_rating', 'creator', 'artist']
error_fields = ['youtubeID', 'style', 'artistID']

# Rows are collected as plain dicts and turned into DataFrames once at the end
# (growing a DataFrame cell by cell with .loc is slow and can leave a row half
# filled when an exception interrupts it).
info_records, info_index = [], []
error_records, error_index = [], []

with ydl:

    # for each row, download
    for i in range(len(df_url)):
        row = df_url.iloc[i]
        print("Downloading: %s ..." % row['youtubeID'])

        # download audio
        try:
            # download=True is required: with download=False youtube_dl only
            # fetches metadata and there would be no file to move afterwards.
            result = ydl.extract_info(row['youtube_url'], download=True)

            # youtube_dl sanitises the title when naming the file, so ask it
            # for the name it used; the post-processor then swaps the
            # original extension for .mp3.
            download_name = os.path.splitext(ydl.prepare_filename(result))[0] + '.mp3'
            save_path = make_savepath(row['style'], download_name)
            os.rename(download_name, save_path)
            print("Downloaded and converted %s successfully!" % save_path)

            record = {'file_name': download_name}
            for field in info_fields:
                record[field] = result.get(field)
            info_records.append(record)
            info_index.append(i)
        except Exception:
            # Best effort: remember the failing row and carry on with the next one.
            error_records.append(dict((field, row[field]) for field in error_fields))
            error_index.append(i)
            print("Can't download audio! %s\n" % traceback.format_exc())

# Keep the df_url row position as index, as before; columns are declared
# explicitly so both frames are well-formed even when they have no rows.
df_artist_with_youtube_info = pd.DataFrame(
    info_records, index=info_index, columns=['file_name'] + info_fields)
df_error_info = pd.DataFrame(
    error_records, index=error_index, columns=error_fields)

# Replace non-ASCII characters with '?' in the free-text columns. Decoding back
# keeps the values as text, so the CSV content is the same on Python 2 and does
# not turn into b'...' literals on Python 3.
for text_column in ['file_name', 'title', 'creator', 'artist']:
    df_artist_with_youtube_info[text_column] = (
        df_artist_with_youtube_info[text_column]
        .str.encode(encoding="ascii", errors='replace')
        .str.decode("ascii")
    )

df_artist_with_youtube_info.to_csv('./music/MusicInfo/Step2_artists_with_youtube_info.csv', index=False)