# Batch Feature Collection in 5 Steps

**Hello and Welcome!**

**Please Follow The Steps Below**

**If you have any issues, please check:**
1. The CSV File that is being read in.
2. The credentials for your Spotify Developer account (the ones Spotipy uses).
3. Everything has been run in order.

**If you have any issues, please reach out!**

### Step 1. Please load in your batch CSV File. 

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import time 
import statistics 

from IPython.display import display
pd.options.display.max_columns = None

In [5]:
# Location of the batch CSV to process.
# Example File Short Path: '../data/batches/batch_1.csv'
# Example Full Path Name: '/Volumes/GoogleDrive/My Drive/DS4AWork/Spotify Team 22 Project/data/batches/batch_1.csv'
# Yours may be slightly different

PathName = './batches/batch_2.csv' #'ENTER PATH NAME HERE'

# index_col = 0 uses the first CSV column as the row index. The remaining
# column (named '0' in the output shown below) is the one get_data reads.
Batch_File = pd.read_csv(PathName, index_col = 0)
# Bare expression so the notebook displays the loaded frame.
Batch_File

Unnamed: 0,0
140,2mxe0TnaNL039ysAj51xPQ
141,6MDME20pz9RveH9rEXvrOM
142,3Isy6kedDrgPYoTS1dazA9
143,1zNqDE7qDGCsyzJwohVaoX
144,2J0JN7EFN10G1Tty6hX0AN
...,...
275,4TrraAsitQKl821DQY42cZ
276,4sTQVOfp9vEMCemLw50sbu
277,6jJ0s89eD6GaHleKKya26X
278,4ryoUS0W8qXokfMxrlJt6O


### Step 2: Setting Up Spotipy

#### Please copy your Spotify developer credentials into the box below. These can be found on the Spotify developer webpage. 
    Note: Make sure each credential is a string wrapped in quotes ('').

In [7]:
# Spotify API credentials.
# SECURITY: never commit real credentials to a shared notebook. Values that
# were previously hard-coded here should be treated as leaked and rotated.
# The credentials are read from the environment variables of the same name;
# if those are unset, replace the placeholder strings below with your own
# values locally (and do not share the notebook afterwards).
import os

SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID', 'ENTER CLIENT ID HERE')
SPOTIPY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET', 'ENTER CLIENT SECRET HERE')

In [8]:
# Build the credentials manager from the client ID and secret set in the
# previous cell.
client_credentials_manager = SpotifyClientCredentials(client_id= SPOTIPY_CLIENT_ID, 
                                                      client_secret=SPOTIPY_CLIENT_SECRET)

# `sp` is the shared Spotify client; get_data below uses it as a global.
sp = spotipy.Spotify(client_credentials_manager= client_credentials_manager)

### Step 3: Run the code below to create the function. 

In [31]:
# Audio features that are averaged over an album's tracks.
_MEAN_FEATURES = (
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
)
# Audio features summarised with the median instead of the mean.
# NOTE(review): the previous implementation also took the median of 'mode'
# but never wrote it to the output frame. It is left out here so the output
# columns stay unchanged; add it here and to the frame built in get_data if
# an 'Album Mode' column is wanted.
_MEDIAN_FEATURES = ('key',)


def _album_feature_summary(album_tracks):
    """Collapse the per-track audio features of one album to one value each.

    Parameters
    ----------
    album_tracks : iterable of dict
        Track entries of an album; each must have an 'id' key.

    Returns
    -------
    dict
        Maps every name in _MEAN_FEATURES / _MEDIAN_FEATURES to the mean /
        median over the tracks for which the API returned a feature dict, or
        to None when no track of the album had usable features.
    """
    collected = {name: [] for name in _MEAN_FEATURES + _MEDIAN_FEATURES}

    for track in album_tracks:
        # One request per track; the response is a one-element list whose
        # entry is not a dict when no features are available for the track.
        feature = sp.audio_features(track['id'])[0]
        if isinstance(feature, dict):
            for name, values in collected.items():
                values.append(feature[name])

    summary = {}
    for name, values in collected.items():
        if not values:
            # Previously this case crashed with IndexError on `values[-1]`.
            summary[name] = None
        elif name in _MEDIAN_FEATURES:
            summary[name] = statistics.median(values)
        else:
            summary[name] = statistics.mean(values)
    return summary


def get_data(list_of_artist_uris):
    """Collect album-level audio features for a batch of artists.

    For every artist the albums returned by ``sp.artist_albums`` are listed.
    The first time an album ID is seen, the album is fetched and its tracks'
    audio features are aggregated (mean, or median for 'key'). If the same
    album ID shows up again, the row is still recorded but its feature and
    genre cells are filled with the string 'DUPLICATE' and no further API
    calls are made for it.

    Uses the module-level Spotify client ``sp`` and sleeps 0.3 s per artist.

    NOTE(review): only the items of the single ``sp.artist_albums`` response
    are used; no further pages are requested.

    Parameters
    ----------
    list_of_artist_uris : pandas.DataFrame
        Frame whose column '0' holds the artist URIs / IDs to process.

    Returns
    -------
    pandas.DataFrame
        One row per (artist, album) pair with the columns 'Artist URI',
        'Album URI', 'Album Group', 'Album Name', 'Album Release Date',
        the ten 'Album <Feature>' columns and 'Album Genres'. Feature cells
        are None when no track of the album had audio features.
    """
    print(f'There are {len(list_of_artist_uris)} artists that we found')

    feature_names = _MEAN_FEATURES + _MEDIAN_FEATURES
    features = {name: [] for name in feature_names}
    art_uri = []
    alb_uri = []
    alb_date = []
    alb_name = []
    alb_genre = []
    alb_group = []

    seen_albums = set()

    artist_uris = list_of_artist_uris['0']
    total_artists = len(artist_uris)

    for position, uri in enumerate(artist_uris, start=1):
        # Brief pause between artists to go easy on the API.
        time.sleep(.3)
        print(f"Collecting Data for {position} out of {total_artists} artists.")

        artist_result = sp.artist_albums(uri)

        for album in artist_result['items']:
            album_id = album['id']

            art_uri.append(uri)
            alb_uri.append(album_id)
            alb_group.append(album['album_group'])
            alb_name.append(album['name'])
            alb_date.append(album['release_date'])

            # The membership test must come BEFORE the ID is added to the
            # set; the previous order made every album look already seen,
            # so the duplicate branch could never run.
            if album_id in seen_albums:
                print(f'Found a duplicate album: {album_id}. Will not calculate audio features')
                # Every column needs an entry for this row (genres included),
                # otherwise the DataFrame columns end up with unequal lengths.
                alb_genre.append('DUPLICATE')
                for name in feature_names:
                    features[name].append('DUPLICATE')
                continue

            seen_albums.add(album_id)

            album_result = sp.album(album_id)
            alb_genre.append(album_result['genres'])

            summary = _album_feature_summary(album_result['tracks']['items'])
            for name in feature_names:
                features[name].append(summary[name])

    print("Creating Artist Dataframe")
    album_data = pd.DataFrame({
        'Artist URI': art_uri,
        'Album URI': alb_uri,
        'Album Group': alb_group,
        'Album Name': alb_name,
        'Album Release Date': alb_date,
        'Album Danceability': features['danceability'],
        'Album Energy': features['energy'],
        'Album Key': features['key'],
        'Album Loudness': features['loudness'],
        'Album Speechiness': features['speechiness'],
        'Album Acousticness': features['acousticness'],
        'Album Instrumentalness': features['instrumentalness'],
        'Album Liveness': features['liveness'],
        'Album Valence': features['valence'],
        'Album Tempo': features['tempo'],
        'Album Genres': alb_genre,
    })
    print('Complete!')
    return album_data

### Step 4: Run the code below to run the function. 
    Note: This can take up to an hour and a half. Please be patient. If there is an error, please wait a bit and start back at step 1. 

In [32]:
# Batch_File.iloc[0:10]

In [25]:
# Collect features for the artists at positions 50-79 of the batch.
# NOTE(review): despite the name Output_50_100, the slice stops at 80.
Output_50_100 = get_data(Batch_File.iloc[50:80])

There are 30 artists that we found
Collecting Data for 5veVxxPm1vzgi6pO2iVA8L
Collecting Data for 7qPLO2XOUaRrRxkvLZ3AEK
Collecting Data for 3t5xRXzsuZmMDkQzgOX35S
Collecting Data for 7DMveApC7UnC2NPfPvlHSU
Collecting Data for 0AJUYWKztkg5dnrHDIBv47
Collecting Data for 0DdDnziut7wOo6cAYWVZC5
Collecting Data for 6RHKEd9dpzQ4c09x8Zdaxu
Collecting Data for 4PVn1b2WnnXdq80C7uaZLZ
Collecting Data for 4bvWFV51SAEocM22uiDOza
Collecting Data for 05o1zW8b6PV6rEquXr74sB
Collecting Data for 1Fr6agZ6iSM5Ynn2k4C8sc
Collecting Data for 23sYU61n9f1CzYi8NJhAXS
Collecting Data for 7dGJo4pcD2V6oG8kP0tJRR
Collecting Data for 1Hsdzj7Dlq2I7tHP7501T4
Collecting Data for 4ehtJnVumNf6xzSCDk8aLB
Collecting Data for 0fA0VVWsXO9YnASrzqfmYu
Collecting Data for 0u2FHSq3ln94y5Q57xazwf
Collecting Data for 5lXfVoQxVgC5fpjkVqvNYn
Collecting Data for 5mnS9jJdKQQcRSqFu5YPVe
Collecting Data for 3hPPNy5OBzCU1icQO7Nrok
Collecting Data for 1Oa0bMld0A3u5OTYfMzp5h
Collecting Data for 16oZKvXb6WkQlVAjwo2Wbg
Collecting Data for

In [33]:
# Collect features for the artists at positions 80-109 of the batch.
Output_80_110 = get_data(Batch_File.iloc[80:110])

There are 30 artists that we found
Collecting Data for 1 out of 30 artists.
Collecting Data for 2 out of 30 artists.
Collecting Data for 3 out of 30 artists.
Collecting Data for 4 out of 30 artists.
Collecting Data for 5 out of 30 artists.
Collecting Data for 6 out of 30 artists.
Collecting Data for 7 out of 30 artists.
Collecting Data for 8 out of 30 artists.
Collecting Data for 9 out of 30 artists.
Collecting Data for 10 out of 30 artists.
Collecting Data for 11 out of 30 artists.
Collecting Data for 12 out of 30 artists.
Collecting Data for 13 out of 30 artists.
Collecting Data for 14 out of 30 artists.
Collecting Data for 15 out of 30 artists.
Collecting Data for 16 out of 30 artists.
Collecting Data for 17 out of 30 artists.
Collecting Data for 18 out of 30 artists.
Collecting Data for 19 out of 30 artists.
Collecting Data for 20 out of 30 artists.
Collecting Data for 21 out of 30 artists.
Collecting Data for 22 out of 30 artists.
Collecting Data for 23 out of 30 artists.
Collecti

In [None]:
# Collect features for the rest of the batch: `110::` is an open-ended
# slice, i.e. everything from position 110 to the last row.
Output_110_140 = get_data(Batch_File.iloc[110::])

There are 30 artists that we found
Collecting Data for 1 out of 30 artists.
Collecting Data for 2 out of 30 artists.
Collecting Data for 3 out of 30 artists.
Collecting Data for 4 out of 30 artists.
Collecting Data for 5 out of 30 artists.
Collecting Data for 6 out of 30 artists.
Collecting Data for 7 out of 30 artists.
Collecting Data for 8 out of 30 artists.
Collecting Data for 9 out of 30 artists.
Collecting Data for 10 out of 30 artists.
Collecting Data for 11 out of 30 artists.
Collecting Data for 12 out of 30 artists.
Collecting Data for 13 out of 30 artists.
Collecting Data for 14 out of 30 artists.
Collecting Data for 15 out of 30 artists.
Collecting Data for 16 out of 30 artists.
Collecting Data for 17 out of 30 artists.
Collecting Data for 18 out of 30 artists.
Collecting Data for 19 out of 30 artists.
Collecting Data for 20 out of 30 artists.
Collecting Data for 21 out of 30 artists.
Collecting Data for 22 out of 30 artists.
Collecting Data for 23 out of 30 artists.
Collecti

### Step 5: Saving the File. 
    Note: You may need to change the output file depending on your system, but it should work out of the box.
          Please match the number in the file name to your batch number. 

In [20]:
# Output_0_10

In [16]:
# Please change the number in the file name to correspond to your batch number. 
# Your batch number is in the input file name as well. 
# NOTE(review): Output_0_10 is not assigned in any Step 4 cell visible in
# this notebook - presumably it comes from an earlier run of that step.

Output_0_10.to_csv('./batch_2_data_0_10.csv')

In [19]:
# NOTE(review): Output_10_20 is not assigned in any Step 4 cell visible in
# this notebook - presumably it comes from an earlier run of that step.
Output_10_20.to_csv('./batch_2_data_10_20.csv')

In [22]:
# NOTE(review): Output_20_50 is not assigned in any Step 4 cell visible in
# this notebook - presumably it comes from an earlier run of that step.
Output_20_50.to_csv('./batch_2_data_20_50.csv')

In [26]:
# NOTE(review): the variable is named Output_50_100, but Step 4 assigns it
# from Batch_File.iloc[50:80], which is what the 50_80 in the file name says.
Output_50_100.to_csv('./batch_2_data_50_80.csv')

In [34]:
# Save the result of the Batch_File.iloc[80:110] run from Step 4.
Output_80_110.to_csv('./batch_2_data_80_110.csv')

In [None]:
# Save the result of the open-ended Batch_File.iloc[110::] run from Step 4.
Output_110_140.to_csv('./batch_2_data_110_140.csv')

# Done! Please Put The Output File Into Google Drive. 

# -----------------------------------------------------------------------------------------

# Ignore This Code

#### Creating Artist URI Batches

In [None]:
# Split the unique artist URIs of the full data set into 12 batch files:
# batches 1-11 hold 140 artists each, batch 12 takes whatever is left.
Input_Data = pd.read_csv('../data/2021.01.09 spotify_us_all_fe.csv')

artist_set = pd.DataFrame(Input_Data['Artist URI'].unique())

batch_size = 140
last_batch = 12
for batch_number in range(1, last_batch + 1):
    start = (batch_number - 1) * batch_size
    # The final batch is open-ended so no trailing artists are dropped.
    stop = None if batch_number == last_batch else start + batch_size
    artist_set[start:stop].to_csv(f'../data/batches/batch_{batch_number}.csv')



#### Merging Output Files