In [4]:
#imports
import os
import time

import spotipy
from pymongo import MongoClient
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

In [5]:
#Mongo connection
#Client for the MongoDB server on localhost:27017; the cells below read and
#write the track_list, artists and audio_features collections of its
#`spotify` database through this object.
mClient = MongoClient('mongodb://localhost:27017')

In [6]:
#Spotify connection
#The app credentials are taken from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET
#environment variables so that real secrets never have to be saved inside the
#notebook. The placeholder strings are only used when a variable is not set.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'CLIENT-ID-HERE')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'CLIENT-SECRET-HERE')

username = 'pablo8421'

#Scope and redirect URI handed to SpotifyOAuth when the API client is created
scope='user-library-read'
redirect_uri='http://localhost:7777/callback'

In [7]:
#Spotify api connection
#Alternative auth manager (client credentials only), left disabled:
#sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id,
#                                                           client_secret=client_secret))


#Active client: authenticates through SpotifyOAuth with the client id/secret,
#redirect URI and scope defined earlier in the notebook. `sp` is the object
#every later Spotify call (search, saved tracks, artists, audio features) uses.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri=redirect_uri,
                                               scope=scope))

In [8]:
#Test search
#Sanity check for the client: run one query and list the names of the tracks it returns
results = sp.search(q='wolf alice', limit=20)
found_tracks = results['tracks']['items']
for position, found_track in enumerate(found_tracks):
    print(position, found_track['name'])

0 Don’t Delete The Kisses
1 Violin Concerto No. 3 in G Major, K. 216: 2. Adagio (Cadenza: Robert Levin)
2 The Last Man on Earth
3 Violin Concerto No.4 In D Major, K.218: 2. Andante cantabile (Cadenza By Robert Levin)
4 Bros
5 Silk
6 Moaning Lisa Smile
7 Blush
8 Giant Peach
9 Silk
10 Beautifully Unconventional
11 Song to the Siren - Recorded at Strongroom Studios, London
12 Wicked Game (Live from Spotify London)
13 Planet Hunter
14 Piano Concerto No. 24 in C Minor, K. 491: III. Allegretto
15 Don't Delete The Kisses
16 Freazy
17 Your Loves Whore
18 White Leather
19 Teenage Headache Dreams (with Ellie Rowsell)


In [9]:
#Obtain my personal list of music
#Overwrites if duplicated (each track is upserted under its Spotify id)

limit = 50          #Page size requested from the API on every call
current_offset = 0  #Position of the first item of the page being requested

#Walk the saved-tracks pages until the library is exhausted, instead of
#assuming a fixed maximum number of tracks.
while True:
    results = sp.current_user_saved_tracks(limit=limit, offset=current_offset)
    items = results['items']

    #An empty page means the offset is already past the last saved track
    if not items:
        break

    for item in items:
        #Obtain tracks
        track = item['track']
        track['_id'] = track['id'] #Reuse the Spotify id as the Mongo primary key
        #Insert into mongo
        mClient.spotify.track_list.replace_one({'_id':track['id']}, track, upsert=True)

    #The paging object carries no `next` link on the last page
    if results.get('next') is None:
        break

    current_offset += limit

    #Just in case: small pause between consecutive requests
    time.sleep(0.5)

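### Create artists collection in MongoDB

Run the following aggregation in the Mongo shell, against the `spotify` database, before executing the next cell; the notebook does not run it for you.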
```
db.track_list.aggregate([
    {$project:{artists:1}},
    {$unwind:"$artists"},
    {$group:{_id:"$artists.id", name:{$first:"$artists.name"}}},
    {$out:"artists"}
])
```

In [10]:
#Fill information for artists
#It goes one by one: every stored artist is replaced by the full object
#returned by the Spotify API, kept under the same _id
artists_collection = mClient.spotify.artists
for stored_artist in artists_collection.find():
    artist_id = stored_artist['_id']
    sp_artist = sp.artist('spotify:artist:' + artist_id)
    sp_artist['_id'] = artist_id
    artists_collection.replace_one({'_id': artist_id}, sp_artist, upsert=True)
    #Just in case
    time.sleep(0.01)

### Get Audio features

In [11]:
#Audio features are requested for several tracks per call, in groups of at
#most max_group_size ids, and upserted into the audio_features collection.
max_group_size = 90

#Only the ids are needed here, so project every other field away
track_ids = [track['_id'] for track in mClient.spotify.track_list.find({}, {'_id': 1})]

#Slicing guarantees that every id lands in exactly one group and that no
#request is ever made with an empty group.
for group_start in range(0, len(track_ids), max_group_size):
    track_group = track_ids[group_start:group_start + max_group_size]

    for audio_feature in sp.audio_features(track_group):
        #Guard against missing entries (None) instead of failing on ['id']
        if audio_feature is None:
            continue
        mClient.spotify.audio_features.replace_one({'_id':audio_feature['id']}, audio_feature, upsert=True)

    #Sleep time
    time.sleep(0.2)

[Audio Features](https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject)

* **Danceability:** Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.
* **Acousticness:** A measure from 0.0 to 1.0 of whether the track is acoustic.
* **Energy:** Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.
* **Instrumentalness:** Predicts whether a track contains no vocals. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content.
* **Liveness:** Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live.
* **Loudness:** The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track. Values typically range between -60 and 0 dB.
* **Speechiness:** Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value.
* **Tempo:** The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.
* **Valence:** A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

## Merging the datasets

In [12]:
import pandas as pd

In [13]:
# Tracklist dataframe
# One row per stored track: a few scalar fields plus the name and id of the
# first listed artist.
track_fields = ('id', 'duration_ms', 'explicit', 'name', 'popularity')
tracks_list = []
for track_doc in mClient.spotify.track_list.find():
    track_row = {field: track_doc[field] for field in track_fields}
    first_artist = track_doc['artists'][0]
    track_row['artist'] = first_artist['name']
    track_row['artist_id'] = first_artist['id']
    tracks_list.append(track_row)

tracks = pd.DataFrame(tracks_list)

# Artists dataframe
# Only the id and the genres are kept; the genre list is flattened into a
# single comma-separated string.
artists_list = [
    {'id': artist_doc['id'], 'genres': ','.join(artist_doc['genres'])}
    for artist_doc in mClient.spotify.artists.find()
]

artists = pd.DataFrame(artists_list)

#Audio features dataframe
# Keeps the track id together with the numeric audio descriptors.
feature_fields = ('id','danceability','energy','key','loudness','mode','speechiness',
                  'acousticness','instrumentalness','liveness','valence','tempo','time_signature')
audio_feature_list = [
    {field: feature_doc[field] for field in feature_fields}
    for feature_doc in mClient.spotify.audio_features.find()
]

audio_features = pd.DataFrame(audio_feature_list)

In [14]:
# Preview the first rows of the tracks dataframe
tracks.head()

Unnamed: 0,id,duration_ms,explicit,name,popularity,artist,artist_id
0,5ovYwJmYcFvSvn6nRQOPyL,194200,False,Pachuco,65,Maldita Vecindad Y Los Hijos Del 5to. Patio,6WvDtNFHOWHfiNy8NVHujT
1,0t0Kl5jxxV3s8bdpILkgmd,215906,False,Canción para Mi Muerte,59,Sui Generis,5lCqW8vcd4TyTHzGIuKdH8
2,3UBXrdDWmiWpV6TY4O2rS3,150013,False,Rompan Todo (Break It All),37,Los Shakers,5iObuPpXhQnJ40Pk2Ydkwr
3,1O84ZPSywpYdZ7oYbCfOAe,170746,False,Piccola sbronza,41,Selton,3MR638pug3LlFmzFGAzjTl
4,3ti3uqYYxGPWyxfeaxjjJW,301853,False,Escapist,30,Nightwish,2NPduAUeLVsfIauhRwuft1


In [15]:
# Preview the first rows of the artists dataframe (id and joined genres)
artists.head()

Unnamed: 0,id,genres
0,6ilYV5oF8whllOnm4VZlYR,"hokkaido indie,j-rock"
1,5MmVJVhhYKQ86izuGHzJYA,"dance pop,europop,pop"
2,2uYWxilOVlUdk4oV9DvwqK,"brooklyn indie,indie pop"
3,61CPKXT0bcKj8MKTNTMOXa,"chanson,electro-pop francais,french indie pop,..."
4,7JYdpWAsiqzrmMB3qxkEbI,"baithak gana,dutch jazz"


In [16]:
# Preview the first rows of the audio features dataframe
audio_features.head()

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,5ovYwJmYcFvSvn6nRQOPyL,0.649,0.985,7,-8.016,1,0.0413,0.112,0.043,0.214,0.496,107.825,4
1,0t0Kl5jxxV3s8bdpILkgmd,0.457,0.277,0,-18.023,1,0.034,0.649,4e-06,0.101,0.482,130.833,4
2,3UBXrdDWmiWpV6TY4O2rS3,0.44,0.904,6,-7.402,0,0.0531,0.149,0.0,0.275,0.843,185.124,4
3,1O84ZPSywpYdZ7oYbCfOAe,0.627,0.624,0,-6.479,1,0.245,0.209,0.0,0.222,0.76,193.942,4
4,3ti3uqYYxGPWyxfeaxjjJW,0.512,0.914,7,-3.564,0,0.0395,0.000508,0.627,0.34,0.35,143.974,4


In [17]:
#Merging all 3 dataframes into a single one

# Columns of the final dataset, in output order
output_columns = ['id', 'name', 'artist', 'duration_ms', 'explicit', 'popularity',
                  'danceability', 'energy', 'key', 'loudness', 'mode',
                  'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                  'valence', 'tempo', 'time_signature', 'genres']

tracks_info = (
    tracks
    # Track + audio features, matched on the track id
    .merge(audio_features, how='inner', on='id', suffixes=('_tr', '_af'))
    # Add the genres of the track's artist; both sides own an 'id' column,
    # so they come out as id_tr (track) and id_at (artist)
    .merge(artists, how='inner', left_on='artist_id', right_on='id', suffixes=('_tr', '_at'))
    # Restore the track id under its plain name
    .assign(id=lambda merged: merged['id_tr'])
    [output_columns]
)


tracks_info.head()

Unnamed: 0,id,name,artist,duration_ms,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres
0,5ovYwJmYcFvSvn6nRQOPyL,Pachuco,Maldita Vecindad Y Los Hijos Del 5to. Patio,194200,False,65,0.649,0.985,7,-8.016,1,0.0413,0.112,0.043,0.214,0.496,107.825,4,"latin alternative,latin rock,mexican pop,mexic..."
1,0t0Kl5jxxV3s8bdpILkgmd,Canción para Mi Muerte,Sui Generis,215906,False,59,0.457,0.277,0,-18.023,1,0.034,0.649,4e-06,0.101,0.482,130.833,4,"argentine rock,cantautor,latin alternative,lat..."
2,3UBXrdDWmiWpV6TY4O2rS3,Rompan Todo (Break It All),Los Shakers,150013,False,37,0.44,0.904,6,-7.402,0,0.0531,0.149,0.0,0.275,0.843,185.124,4,"classic garage rock,freakbeat,rock nacional"
3,1O84ZPSywpYdZ7oYbCfOAe,Piccola sbronza,Selton,170746,False,41,0.627,0.624,0,-6.479,1,0.245,0.209,0.0,0.222,0.76,193.942,4,"italian alternative,italian indie pop,milan indie"
4,5eIRcfC177ZdsZ0ufta3Oz,Fammi Scrollare (feat. Willie Peyote & Emicida),Selton,173493,False,27,0.758,0.907,2,-5.365,0,0.141,0.141,2e-06,0.175,0.768,104.957,4,"italian alternative,italian indie pop,milan indie"


In [19]:
#Save to a csv
#Writes the merged dataset without the dataframe index column.
#NOTE(review): the destination is an absolute, machine-specific path; it has
#to be adjusted before running the notebook anywhere else.
tracks_info.to_csv('D:/Pablo/clases/UJM/2. Semester, 2021/Data Mining/Project/tracks_info.csv',index=False)