# Data Collection

In [None]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time
import timeit

### 1. Clean Kaggle Dataset for Spotify Artist IDs List 1

In [None]:
#Create dataframe from Spotify artist codes csv
#NOTE(review): read_csv treats the first row as the header by default; the
#rename step that follows suggests that row is really an artist record - confirm
df = pd.read_csv('data/spotify-artist-codes.csv')

In [None]:
#Rename the two inferred header labels to "artist_name" and "id_string"
df = df.rename(
    {
        '1:43': 'artist_name',
        'spotify:artist:39EHxSQAIaWusRqSI9xoyF': 'id_string',
    },
    axis='columns',
)

In [None]:
#One-row dataframe holding the artist name / ID pair that was used up as column labels
df2 = pd.DataFrame(
    [["1:43", "spotify:artist:39EHxSQAIaWusRqSI9xoyF"]],
    columns=["artist_name", "id_string"],
)

In [None]:
#Re-add the lost artist and corresponding ID as the last row
#DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
#gives the same result (rows of df followed by df2, index renumbered 0..n-1)
df = pd.concat([df, df2], ignore_index=True)

In [None]:
#Split id_string on ':' (at most two splits) once and expand into three new columns
#The split is computed a single time instead of once per target column
id_parts = df['id_string'].str.split(':', n=2, expand=True)
df['spotify'] = id_parts[0]
df['artist'] = id_parts[1]
df['artist_id'] = id_parts[2]

In [None]:
#Discard the two helper columns produced by the split; only artist_id is kept
df = df.drop(columns=["spotify", "artist"])

In [None]:
#Save cleaned artist codes to CSV
#The row index is written as well (to_csv default), which is why the file
#comes back with an "Unnamed: 0" column when it is re-read
df.to_csv("data/spotify-artist-codes-cleaned.csv")

### 2. Retrieving Artist Info from Spotify API (Kaggle Artist IDs)

In [None]:
#Load the cleaned artist codes and discard the saved index column
artist_codes = pd.read_csv('data/spotify-artist-codes-cleaned.csv').drop(columns=["Unnamed: 0"])

In [None]:
#Split the artist IDs into positional chunks of 20k rows (last chunk takes the rest)
#Smaller lists keep each Spotify retrieval run manageable
artist_ids_0_20k = artist_codes['artist_id'].iloc[:20000].tolist()
artist_ids_20k_40k = artist_codes['artist_id'].iloc[20000:40000].tolist()
artist_ids_40k_60k = artist_codes['artist_id'].iloc[40000:60000].tolist()
artist_ids_60k_82k = artist_codes['artist_id'].iloc[60000:].tolist()

In [None]:
#Setup Spotipy
#'<Client ID>' and '<Secret ID>' are placeholders - substitute real app credentials before running
cid = '<Client ID>'
secret = '<Secret ID>'

#Build the client-credentials manager and the API client (sp) used by the retrieval loops
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
#Fetch the Spotify artist record for every ID in the current chunk
#Repeated for each list of ~20k artist codes
start = timeit.default_timer()

artist_info = []

for code in artist_ids_60k_82k:
    profile = sp.artist(code)
    #Keep only the fields that end up in the output table
    artist_info.append({
        'artist_name': profile['name'],
        'artist_id': profile['id'],
        'artist_uri': profile['uri'],
        'artist_genres': profile['genres'],
        'artist_followers': profile['followers']['total'],
        'artist_popularity': profile['popularity'],
    })
    #Brief pause between requests (presumably to stay under API rate limits)
    time.sleep(0.1)

stop = timeit.default_timer()
print('Time to run this code (in seconds):', stop - start)

In [None]:
#Build the chunk DataFrame from the collected dicts, with an explicit column order
#Repeated for each list of ~20k artist codes
artist_info_60k_82k_df = pd.DataFrame(
    artist_info,
    columns=[
        'artist_id',
        'artist_name',
        'artist_genres',
        'artist_followers',
        'artist_popularity',
        'artist_uri',
    ],
)



In [None]:
#Save this chunk's DataFrame to CSV (row index included by default)
#Repeated for each list of ~20k artist codes
artist_info_60k_82k_df.to_csv("data/artist_info_60k_82k_df.csv")

### 3. Combine Artist Info DataFrames to CSV

In [None]:
#Load the four per-chunk artist info CSVs back into DataFrames, in chunk order
(
    artist_info_0_20k_df2,
    artist_info_20k_40k_df2,
    artist_info_40k_60k_df2,
    artist_info_60k_82k_df2,
) = map(pd.read_csv, [
    'data/artist_info_0_20k_df.csv',
    'data/artist_info_20k_40k_df.csv',
    'data/artist_info_40k_60k_df.csv',
    'data/artist_info_60k_82k_df.csv',
])

In [None]:
#List of the four chunk DataFrames, in chunk order, ready for concatenation
artist_info_dataframes_list = [artist_info_0_20k_df2, 
                               artist_info_20k_40k_df2, 
                               artist_info_40k_60k_df2, 
                               artist_info_60k_82k_df2]

In [None]:
#Concatenate dataframes row-wise to create artist_info_df
#ignore_index is not set, so every chunk keeps its own row labels
artist_info_df = pd.concat(artist_info_dataframes_list)

In [None]:
#Export the combined list-1 artist info to CSV (row index included by default)
artist_info_df.to_csv("data/artist_info1_df.csv")

### 4. Retrieving Related Artists from Spotify API

In [None]:
#Reload the cleaned artist codes and discard the saved index column
artist_codes = pd.read_csv('data/spotify-artist-codes-cleaned.csv').drop(columns=["Unnamed: 0"])

In [None]:
#Split the artist IDs into positional chunks of 10k rows (last chunk takes the rest)
#Smaller lists keep each Spotify retrieval run manageable
artist_ids_0_10k = artist_codes['artist_id'].iloc[:10000].tolist()
artist_ids_10k_20k = artist_codes['artist_id'].iloc[10000:20000].tolist()
artist_ids_20k_30k = artist_codes['artist_id'].iloc[20000:30000].tolist()
artist_ids_30k_40k = artist_codes['artist_id'].iloc[30000:40000].tolist()
artist_ids_40k_50k = artist_codes['artist_id'].iloc[40000:50000].tolist()
artist_ids_50k_60k = artist_codes['artist_id'].iloc[50000:60000].tolist()
artist_ids_60k_70k = artist_codes['artist_id'].iloc[60000:70000].tolist()
artist_ids_70k_82k = artist_codes['artist_id'].iloc[70000:].tolist()

In [None]:
#Fetch the related artists for every 'artist_id' in the current chunk
#Repeated for each list of ~10k artist codes
start = timeit.default_timer()

related_artists = []

for code in artist_ids_70k_82k:
    #One row per (source artist, related artist) pair
    related_artists.extend(
        {
            'artist_id': code,
            'related_artist_id': artist['id'],
            'related_artist_name': artist['name'],
        }
        for artist in sp.artist_related_artists(code)['artists']
    )
    #Brief pause between requests (presumably to stay under API rate limits)
    time.sleep(0.1)

stop = timeit.default_timer()
print('Time to run this code (in seconds):', stop - start)

In [None]:
#Build the chunk DataFrame of (artist, related artist) rows with an explicit column order
#Repeated for each list of ~10k artist codes
related_artists_70k_82k_df = pd.DataFrame(
    related_artists,
    columns=[
        'artist_id',
        'related_artist_id',
        'related_artist_name',
    ],
)

In [None]:
#Save this chunk's DataFrame to CSV (row index included by default)
#Repeated for each list of ~10k artist codes
related_artists_70k_82k_df.to_csv("data/related_artists_70k_82k_df.csv")

### 5. Combine Related Artist DataFrames to CSV

In [None]:
#Load the eight per-chunk related-artist CSVs back into DataFrames, in chunk order
(
    related_artists_0_10k_df2,
    related_artists_10k_20k_df2,
    related_artists_20k_30k_df2,
    related_artists_30k_40k_df2,
    related_artists_40k_50k_df2,
    related_artists_50k_60k_df2,
    related_artists_60k_70k_df2,
    related_artists_70k_82k_df2,
) = map(pd.read_csv, [
    'data/related_artists_0_10k_df.csv',
    'data/related_artists_10k_20k_df.csv',
    'data/related_artists_20k_30k_df.csv',
    'data/related_artists_30k_40k_df.csv',
    'data/related_artists_40k_50k_df.csv',
    'data/related_artists_50k_60k_df.csv',
    'data/related_artists_60k_70k_df.csv',
    'data/related_artists_70k_82k_df.csv',
])

In [None]:
#List of the eight related-artist chunk DataFrames, in chunk order, ready for concatenation
related_artists_dataframes_list = [related_artists_0_10k_df2, 
                                   related_artists_10k_20k_df2,
                                   related_artists_20k_30k_df2,
                                   related_artists_30k_40k_df2,
                                   related_artists_40k_50k_df2,
                                   related_artists_50k_60k_df2,
                                   related_artists_60k_70k_df2,
                                   related_artists_70k_82k_df2]

In [None]:
#Concatenate dataframes row-wise to create related_artists_df
#ignore_index is not set, so every chunk keeps its own row labels
related_artists_df = pd.concat(related_artists_dataframes_list)

In [None]:
#Export the combined related-artist table to CSV (row index included by default)
related_artists_df.to_csv("data/related_artists_df.csv")

### 6. Create Artist ID List 2

In [None]:
#Load the list-1 artist info exported by section 3
#Section 3 writes this table to data/artist_info1_df.csv; the path used here
#before (data/artist_info_df.csv) is not produced by any of the preceding steps
artist_info_df = pd.read_csv('data/artist_info1_df.csv')

In [None]:
#Load the combined related-artist table exported in section 5
related_artists_df = pd.read_csv('data/related_artists_df.csv')

In [None]:
#Distinct related artist IDs, in order of first appearance
related_artist_list = related_artists_df['related_artist_id'].unique().tolist()

In [None]:
#Distinct artist IDs already covered by the first artist info table
start_artist_list = artist_info_df['artist_id'].unique().tolist()

In [None]:
#Keep the related artists that are not already in the starting list
#Membership is checked against a set (built once) so each lookup is O(1) rather
#than a scan of the whole start list; the order of related_artist_list is preserved
start_artist_set = set(start_artist_list)
unique_new_artist_list = [x for x in related_artist_list if x not in start_artist_set]

In [None]:
#Single-column table of the newly discovered artist IDs (list 2)
artist_list_2_df = pd.DataFrame({'artist_id': unique_new_artist_list})

In [None]:
#Save artist ID list 2 to CSV (row index included by default)
artist_list_2_df.to_csv("data/artist_list_2.csv")

### 7. Retrieving Artist Info from Spotify API (Related Artist IDs)

In [None]:
#Load artist ID list 2 into artist_codes2 and discard the saved index column
artist_codes2 = pd.read_csv('data/artist_list_2.csv').drop(columns=["Unnamed: 0"])

In [None]:
#Split the list-2 artist IDs into positional chunks of 20k rows (last chunk takes the rest)
#Smaller lists keep each Spotify retrieval run manageable
artist_ids2_0_20k = artist_codes2['artist_id'].iloc[:20000].tolist()
artist_ids2_20k_40k = artist_codes2['artist_id'].iloc[20000:40000].tolist()
artist_ids2_40k_60k = artist_codes2['artist_id'].iloc[40000:60000].tolist()
artist_ids2_60k_80k = artist_codes2['artist_id'].iloc[60000:80000].tolist()
artist_ids2_80k_100k = artist_codes2['artist_id'].iloc[80000:100000].tolist()
artist_ids2_100k_120k = artist_codes2['artist_id'].iloc[100000:120000].tolist()
artist_ids2_120k_140k = artist_codes2['artist_id'].iloc[120000:140000].tolist()
artist_ids2_140k_160k = artist_codes2['artist_id'].iloc[140000:160000].tolist()
artist_ids2_160k_180k = artist_codes2['artist_id'].iloc[160000:180000].tolist()
artist_ids2_180k_200k = artist_codes2['artist_id'].iloc[180000:200000].tolist()
artist_ids2_200k_220k = artist_codes2['artist_id'].iloc[200000:220000].tolist()
artist_ids2_220k_240k = artist_codes2['artist_id'].iloc[220000:].tolist()

In [None]:
#Fetch the Spotify artist record for every list-2 ID in the current chunk
#Repeated for each list of ~20k artist codes
start = timeit.default_timer()

artist_info = []

for code in artist_ids2_220k_240k:
    profile = sp.artist(code)
    #Keep only the fields that end up in the output table
    artist_info.append({
        'artist_name': profile['name'],
        'artist_id': profile['id'],
        'artist_uri': profile['uri'],
        'artist_genres': profile['genres'],
        'artist_followers': profile['followers']['total'],
        'artist_popularity': profile['popularity'],
    })
    #Brief pause between requests (presumably to stay under API rate limits)
    time.sleep(0.1)

stop = timeit.default_timer()
print('Time to run this code (in seconds):', stop - start)

In [None]:
#Build the chunk DataFrame from the collected dicts, with an explicit column order
#Repeated for each list of ~20k artist codes
artist_info2_220k_240k_df = pd.DataFrame(
    artist_info,
    columns=[
        'artist_id',
        'artist_name',
        'artist_genres',
        'artist_followers',
        'artist_popularity',
        'artist_uri',
    ],
)

In [None]:
#Save this chunk's DataFrame to CSV (row index included by default)
#Repeated for each list of ~20k artist codes
artist_info2_220k_240k_df.to_csv("data/artist_info2_220k_240k_df.csv")

### 8. Combine Artist Info DataFrames (Related Artist IDs) to CSV

In [None]:
#Load the twelve per-chunk list-2 artist info CSVs back into DataFrames, in chunk order
(
    artist_info2_0_20k_df2,
    artist_info2_20k_40k_df2,
    artist_info2_40k_60k_df2,
    artist_info2_60k_80k_df2,
    artist_info2_80k_100k_df2,
    artist_info2_100k_120k_df2,
    artist_info2_120k_140k_df2,
    artist_info2_140k_160k_df2,
    artist_info2_160k_180k_df2,
    artist_info2_180k_200k_df2,
    artist_info2_200k_220k_df2,
    artist_info2_220k_240k_df2,
) = map(pd.read_csv, [
    'data/artist_info2_0_20k_df.csv',
    'data/artist_info2_20k_40k_df.csv',
    'data/artist_info2_40k_60k_df.csv',
    'data/artist_info2_60k_80k_df.csv',
    'data/artist_info2_80k_100k_df.csv',
    'data/artist_info2_100k_120k_df.csv',
    'data/artist_info2_120k_140k_df.csv',
    'data/artist_info2_140k_160k_df.csv',
    'data/artist_info2_160k_180k_df.csv',
    'data/artist_info2_180k_200k_df.csv',
    'data/artist_info2_200k_220k_df.csv',
    'data/artist_info2_220k_240k_df.csv',
])

In [None]:
#List of the twelve list-2 chunk DataFrames, in chunk order, ready for concatenation
artist_info2_dataframes_list = [artist_info2_0_20k_df2,
                                artist_info2_20k_40k_df2,
                                artist_info2_40k_60k_df2,
                                artist_info2_60k_80k_df2,
                                artist_info2_80k_100k_df2,
                                artist_info2_100k_120k_df2,
                                artist_info2_120k_140k_df2,
                                artist_info2_140k_160k_df2,
                                artist_info2_160k_180k_df2,
                                artist_info2_180k_200k_df2,
                                artist_info2_200k_220k_df2,
                                artist_info2_220k_240k_df2]

In [None]:
#Concatenate dataframes row-wise to create artist_info2_df
#ignore_index is not set, so every chunk keeps its own row labels
artist_info2_df = pd.concat(artist_info2_dataframes_list)

In [None]:
#Export the combined list-2 artist info to CSV (row index included by default)
artist_info2_df.to_csv("data/artist_info2_df.csv")

### 9. Combine Artist Info for List 1 & List 2 to Final CSV

In [None]:
#Load list-1 artist info and discard the two index columns left over from earlier CSV saves
artist_info1_df = pd.read_csv('data/artist_info1_df.csv').drop(
    columns=["Unnamed: 0", "Unnamed: 0.1"]
)

In [None]:
#Load list-2 artist info and discard the two index columns left over from earlier CSV saves
artist_info2_df = pd.read_csv('data/artist_info2_df.csv').drop(
    columns=["Unnamed: 0", "Unnamed: 0.1"]
)

In [None]:
#Concatenate list-1 and list-2 artist info row-wise to create artist_info_df
#ignore_index is not set, so each input keeps its own row labels
artist_info_df = pd.concat([artist_info1_df, artist_info2_df])

In [None]:
#Export the final combined artist table to CSV (row index included by default)
artist_info_df.to_csv("data/spotify_artists.csv")