# Last.fm Data

Now that I have all of the Pitchfork data, I will supplement it with artist-level data from Last.fm.

In [3]:
# necessary imports

import pandas as pd
import numpy as np

In [71]:
# reading in data
artists_df = pd.read_csv('data/artists.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [72]:
# it's big!

artists_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1466083 entries, 0 to 1466082
Data columns (total 10 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   mbid              1466083 non-null  object 
 1   artist_mb         1466075 non-null  object 
 2   artist_lastfm     986756 non-null   object 
 3   country_mb        662368 non-null   object 
 4   country_lastfm    211498 non-null   object 
 5   tags_mb           119946 non-null   object 
 6   tags_lastfm       381075 non-null   object 
 7   listeners_lastfm  986760 non-null   float64
 8   scrobbles_lastfm  986760 non-null   float64
 9   ambiguous_artist  1466083 non-null  bool   
dtypes: bool(1), float64(2), object(7)
memory usage: 102.1+ MB


In [None]:
artists_df.describe()

In [8]:
# creating a subset to look at

artists_sample1000_df = artists_df.sample(10000)

In [11]:
artists_sample1000_df.describe()

Unnamed: 0,listeners_lastfm,scrobbles_lastfm
count,6804.0,6804.0
mean,6022.656,98311.19
std,41813.54,921482.6
min,0.0,0.0
25%,10.0,42.0
50%,94.0,574.0
75%,873.25,6155.75
max,1143047.0,32408060.0


In [14]:
artists_df.columns

Index(['mbid', 'artist_mb', 'artist_lastfm', 'country_mb', 'country_lastfm',
       'tags_mb', 'tags_lastfm', 'listeners_lastfm', 'scrobbles_lastfm',
       'ambiguous_artist'],
      dtype='object')

In [15]:
# comparing two country columns

artists_df.country_mb.value_counts().head()

United States     165623
United Kingdom     71405
Germany            51645
Japan              46324
France             32179
                   ...  
Nauru                  1
Niue                   1
Kiribati               1
Tokelau                1
Tuvalu                 1
Name: country_mb, Length: 223, dtype: int64

In [16]:
artists_df.country_lastfm.value_counts().head()

United States                                  25999
United Kingdom                                 13994
Japan                                          12769
Germany                                        12258
France                                          8493
                                               ...  
Austria; Iran                                      1
Mozambique; United States                          1
Bosnia and Herzegovina; Peru; United States        1
Australia; France; Poland                          1
France; Norway; Sweden; United States              1
Name: country_lastfm, Length: 8120, dtype: int64

In [18]:
# subset of just US artists

artists_us_df = artists_df[artists_df.country_mb == 'United States']

## Cleaning

### Duplicate artist names

In [73]:
# many duplicate names, soem in special characters, all in title caps

artists_df.artist_mb.value_counts()

Vortex            83
Moloch            76
Darkness          68
Sol               58
Amok              49
                  ..
Jordan Perlson     1
Bergtatt           1
digidigi66         1
Reviver Gene       1
山崎まさよし             1
Name: artist_mb, Length: 1352996, dtype: int64

In [74]:
# changing to lower case 

artists_df['artist_mb_clean'] = artists_df['artist_mb'].astype(str).str.lower()

In [75]:
# duplicates across countries with dupe or null listen/scrobbles. 
# we can aggregate globally and take first row

artists_df[artists_df.artist_mb == 'Moloch'].head(3)

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist,artist_mb_clean
69675,8294c83d-f358-4e39-b6e6-70ec197aa587,Moloch,Moloch,United States,Brazil; Israel; Poland; Ukraine; United Kingdo...,,black metal; Sludge; dark ambient; raw black m...,9782.0,144936.0,True,moloch
69676,818ef79c-7777-4c78-8cae-41f6a33abe23,Moloch,Moloch,United Kingdom,Brazil; Israel; Poland; Ukraine; United Kingdo...,,black metal; Sludge; dark ambient; raw black m...,9782.0,144936.0,True,moloch
69677,515cf468-8640-4742-be60-3dda5f298823,Moloch,Moloch,Ukraine,Brazil; Israel; Poland; Ukraine; United Kingdo...,black metal,black metal; Sludge; dark ambient; raw black m...,9782.0,144936.0,True,moloch


In [76]:
# sorting by artist and listens and dropping duplicates

artists_df_clean = artists_df.sort_values(by=['artist_mb_clean','listeners_lastfm'], ascending=False)
artists_df_clean = artists_df_clean.drop_duplicates(keep='first', subset=['artist_mb_clean'])

In [77]:
# no more duplicate names

artists_df_clean.artist_mb_clean.value_counts()

mattias bergqvist     1
star tatooed & ava    1
a sorrowful dream     1
kenneth lovelace      1
the dirty royals      1
                     ..
zendad                1
senor goofy           1
nicole meyer          1
javier vega           1
山崎まさよし                1
Name: artist_mb_clean, Length: 1345381, dtype: int64

In [78]:
artists_df_clean[artists_df_clean['artist_mb'] == 'Moloch']

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist,artist_mb_clean
69675,8294c83d-f358-4e39-b6e6-70ec197aa587,Moloch,Moloch,United States,Brazil; Israel; Poland; Ukraine; United Kingdo...,,black metal; Sludge; dark ambient; raw black m...,9782.0,144936.0,True,moloch


In [79]:
# consolidated >100k rows

print(artists_df.shape)
artists_df_clean.shape

(1466083, 11)


(1345381, 11)

In [7]:
# saving clean lastfm df as a csv

artists_df_clean.to_csv('data/lastfm_data_clean.csv')

NameError: name 'artists_df_clean' is not defined

## Matching with Pitchfork Data

In [8]:
# loading pitchfork data

pitchfork_df = pd.read_csv('data/pitchfork_data_clean.csv', index_col = 0)
artists_df_clean = pd.read_csv('data/lastfm_data_clean.csv', index_col = 0)

  mask |= (ar1 == a)


In [5]:
pitchfork_df.shape

(18389, 13)

In [9]:
# merging two dfs

merged_df = (pitchfork_df.merge(artists_df_clean, 
                                how='left', 
                                left_on='artist', 
                                right_on='artist_mb_clean'))

In [10]:
len(merged_df)

18391

In [11]:
# no new nulls

merged_df.isnull().sum()

album_review_score       0
artist                   4
album_title              3
best_new_music           0
review_pub_date          0
review_pub_year          0
album_release_year       0
album_genre              0
label                   47
review_author            0
reviewid                 0
review_delay             0
api_formatted            0
mbid                  2452
artist_mb             2454
artist_lastfm         2603
country_mb            4059
country_lastfm        6389
tags_mb               9080
tags_lastfm           3086
listeners_lastfm      2602
scrobbles_lastfm      2602
ambiguous_artist      2452
artist_mb_clean       2456
dtype: int64

In [12]:
merged_df.head(3)

Unnamed: 0,album_review_score,artist,album_title,best_new_music,review_pub_date,review_pub_year,album_release_year,album_genre,label,review_author,...,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist,artist_mb_clean
0,3.4,aberfeldy,young forever,0,2004-08-30,2004,2004.0,rock,rough trade,marc hogan,...,Aberfeldy,Aberfeldy,United Kingdom,Scotland; United Kingdom,,indie pop; Scottish; indie; twee; seen live; f...,50666.0,707132.0,False,aberfeldy
1,7.4,aarktica,pure tone audiometry,0,2003-04-08,2003,2003.0,electronic,silber,eric carr,...,Aarktica,Aarktica,United States,United States,,ambient; post-rock; shoegaze; electronic; indi...,36852.0,492264.0,False,aarktica
2,8.1,aarktica,or you could just go through your whole life a...,0,2002-05-16,2002,2002.0,electronic,darla,christopher dare,...,Aarktica,Aarktica,United States,United States,,ambient; post-rock; shoegaze; electronic; indi...,36852.0,492264.0,False,aarktica


In [13]:
# saving merged df as a csv

merged_df.to_csv('data/lastfm_data-pitchfork_artists_only.csv')

Now that the Pitchfork and Last.fm data is merged, we can move on to getting individual track-level features from Spotify. 