In [13]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import sqlite3
import matplotlib.pyplot as plt
import re

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set working directory
os.chdir('C:/Users/andre/Dropbox/Penn MCIT/CIT 550 Database/Project')

# Read each table

## Spotify API

- More than 1.2M songs collected from Spotify’s API
- The dataset includes features like name, artists, album, release date, key, explicit flag, duration, popularity, danceability, and more
- The data was created in December 2020. Each row represents a single track
- Link to source https://www.kaggle.com/rodolfofigueroa/spotify-12m-songs

In [2]:
df_spotify = pd.read_csv('./Data/tracks_features.csv.zip')
df_spotify.shape

(1204025, 24)

In [16]:
df_spotify.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02


## Album ratings
- Features user and critic review aggregated by Album of the Year and Metacritic
- The dataset includes more than 30K rows of albums and aggregated critic and user review scores
- Link to source https://www.kaggle.com/kauvinlucas/30000-albums-aggregated-review-ratings

In [3]:
album = pd.read_csv('./Data/album_ratings.csv.zip')
album.shape

(32358, 16)

In [4]:
album.head()

Unnamed: 0,Artist,Title,Release Month,Release Day,Release Year,Format,Label,Genre,Metacritic Critic Score,Metacritic Reviews,Metacritic User Score,Metacritic User Reviews,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews
0,Neko Case,Middle Cyclone,March,3,2009,LP,ANTI-,Alt-Country,79.0,31.0,8.7,31.0,79,25,78,55
1,Jason Isbell & The 400 Unit,Jason Isbell & The 400 Unit,February,17,2009,LP,Thirty Tigers,Country Rock,70.0,14.0,8.4,7.0,73,11,73,8
2,Animal Collective,Merriweather Post Pavilion,January,20,2009,LP,Domino,Psychedelic Pop,89.0,36.0,8.5,619.0,92,30,87,1335
3,Bruce Springsteen,Working on a Dream,January,27,2009,LP,Columbia Records,Rock,72.0,29.0,7.9,101.0,70,23,66,38
4,Andrew Bird,Noble Beast,January,20,2009,LP,Fat Possum,Singer-Songwriter,79.0,29.0,8.7,47.0,74,24,78,44


# Pitchfork database
- Includes over 18K Pitchfork (an online music magazine) review going back to January 1999
- The database contains separate tables on artists, content, genres, labels, reviews, and years
- Link to source https://www.kaggle.com/nolanbconaway/pitchfork-data

In [4]:
# Connect to SQLlite
con = sqlite3.connect("./Data/database.sqlite")

tables = [
          'artists', 
          'content',
          'genres',
          'labels',
          'reviews',
          'years'
]

for tbl in tables:
  print(f'---{tbl}---')
  print(pd.read_sql_query(f"SELECT * FROM {tbl}", con).head(2))
  print('===================================')

---artists---
   reviewid          artist
0     22703  massive attack
1     22721        krallice
---content---
   reviewid                                            content
0     22703  “Trip-hop” eventually became a ’90s punchline,...
1     22721  Eight years, five albums, and two EPs in, the ...
---genres---
   reviewid       genre
0     22703  electronic
1     22721       metal
---labels---
   reviewid      label
0     22703     virgin
1     22721  hathenter
---reviews---
   reviewid         title          artist  \
0     22703     mezzanine  massive attack   
1     22721  prelapsarian        krallice   

                                                 url  score  best_new_music  \
0  http://pitchfork.com/reviews/albums/22703-mezz...    9.3               0   
1  http://pitchfork.com/reviews/albums/22721-prel...    7.9               0   

        author  author_type    pub_date  pub_weekday  pub_day  pub_month  \
0  nate patrin  contributor  2017-01-08            6        8       

In [None]:
# Need to pull STD via pandas. not sure why SQL won't do STDEV
pd.read_sql_query('select * from years', con)['year'].std()

In [5]:
# Make a pandas df with just the columns we need
query_str = \
"""
SELECT 
    t1.reviewid
    , t1.title as album_name
    , t1.artist
    , t1.url as pf_url
    , t1.score as pf_score
    , t1.author as pf_author
    , t1.pub_date as pf_pubdate
    , t2.content as pf_review
FROM reviews t1 
    JOIN content t2 ON t1.reviewid = t2.reviewid
"""

pf = pd.read_sql_query(query_str, con)
pf.shape

(18401, 8)

In [7]:
pf.head()

Unnamed: 0,reviewid,album_name,artist,pf_url,pf_score,pf_author,pf_pubdate,pf_review
0,22703,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,nate patrin,2017-01-08,"“Trip-hop” eventually became a ’90s punchline,..."
1,22721,prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,zoe camp,2017-01-07,"Eight years, five albums, and two EPs in, the ..."
2,22659,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,david glickman,2017-01-07,Minneapolis’ Uranium Club seem to revel in bei...
3,22661,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,jenn pelly,2017-01-06,Kleenex began with a crash. It transpired one ...
4,22725,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,kevin lozano,2017-01-06,It is impossible to consider a given release b...


# Pre-processing
- Convert all strings to lowercase
- Select columns of interest
- Entity resolution (see how many albums overlap between the 3 tables)
- Remove rows with missing album or song names

In [36]:
def getFirstArtist(s):
    s = s.replace("[", "").replace("]", "")
    first_element = s.find(",")
    return s[:first_element].replace("'", "")
    
#     first_idx = s.find("[") + 1
#     return s[first_idx:(s[first_idx:].find("]")+first_idx)]

In [37]:
# List all the preprocessings to do here
tbls_dict = {
    'spotify': {
        'data': df_spotify,
        'cols_to_lc': ['name', 'album', 'artists'],
        'cols_to_select': 'all', # Select all
        'cols_mi': ['name', 'album']
    },
    'ratings': {
        'data': album,
        'cols_to_lc': ['Artist', 'Title', 'Label', 'Genre'],
        'cols_to_select': [
            'Artist', 'Title', 'Format', 'Label', 'Genre', 
            'AOTY Critic Score', 'AOTY Critic Reviews', 'AOTY User Score', 'AOTY User Reviews'
        ],
        'cols_mi': ['Artist', 'Title']
    },
    'pitchfork': {
        'data': pf,
        'cols_to_lc': ['album_name', 'artist', 'pf_author'],
        'cols_to_select': 'all', # Select all since we already did the subset during the SQL query
        'cols_mi': ['album_name', 'artist']
    }
}

# Perform the preprocessing steps
for k, v in tbls_dict.items():
    print(f'<<{k}>>')
    
    # To lowercase
    v['data'] = v['data'].apply(lambda d:d.str.lower() if d.name in v['cols_to_lc'] else d)
    print(f'-- To lowercase: {v["cols_to_lc"]}')
    
    # Select
    if v['cols_to_select'] != 'all':
        v['data'] = v['data'][v['cols_to_select']].copy()
    print(f'-- Selecting columns {v["cols_to_select"]}')
          
    # Remove rows with missing album or song names
    for col in v['cols_mi']:
        print(f'-- Removing {v["data"][col].isna().sum()} rows from {k} due to {col}')
        v['data'] = v['data'][v['data'][col].notna()].copy()
        
    print('')
    
# One thing specific to the Spotify data: It lists all artists for a given song
# For the sake of joining with other datasets, pull out the first artist
tbls_dict['spotify']['data']['first_artist'] = tbls_dict['spotify']['data']['artists'].apply(lambda s:getFirstArtist(s))

<<spotify>>
-- To lowercase: ['name', 'album', 'artists']
-- Selecting columns all
-- Removing 0 rows from spotify due to name
-- Removing 0 rows from spotify due to album

<<ratings>>
-- To lowercase: ['Artist', 'Title', 'Label', 'Genre']
-- Selecting columns ['Artist', 'Title', 'Format', 'Label', 'Genre', 'AOTY Critic Score', 'AOTY Critic Reviews', 'AOTY User Score', 'AOTY User Reviews']
-- Removing 0 rows from ratings due to Artist
-- Removing 1 rows from ratings due to Title

<<pitchfork>>
-- To lowercase: ['album_name', 'artist', 'pf_author']
-- Selecting columns all
-- Removing 0 rows from pitchfork due to album_name
-- Removing 0 rows from pitchfork due to artist



In [43]:
# Merge Spotify with album ratings
df = tbls_dict['spotify']['data'].merge(tbls_dict['ratings']['data'], 
                                        left_on = ['album', 'first_artist'], 
                                        right_on = ['Title', 'Artist'], 
                                        how = 'outer',
                                        indicator = True
                                       )
df.rename(columns = {'_merge':'_merge1'}, inplace = True)

# Merge Pitchfork reviews
df = df.merge(tbls_dict['pitchfork']['data'], 
              left_on = ['album', 'first_artist'], 
              right_on = ['album_name', 'artist'], 
              how = 'outer',
              indicator = True
             )
df.rename(columns = {'_merge':'_merge2'}, inplace = True)

df.shape

(1248159, 44)

In [47]:
# This is basically the Venn diagram of album joins across the 3 datasets
a = df.drop_duplicates(['album', 'first_artist'])
pd.crosstab(a['_merge1'], a['_merge2'], margins = True)

_merge2,left_only,both,All
_merge1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
left_only,181550,903,182453
right_only,1,0,1
both,3581,1168,4749
All,185132,2071,187203


# WIP - trying to find album overlaps

In [31]:
len(df_spotify['album'].unique())

106162

In [32]:
len(album['Title'].unique())

30232

In [33]:
len(pf['album_name'].unique())

17835

In [36]:
spotify_albums = set(df_spotify['album'])
album_albums = set(album['Title'])
pf_albums = set(pf['album_name'])

print('Spotify-Album Ratings', len(spotify_albums.intersection(album_albums)))
print('Spotify-PF Ratings', len(spotify_albums.intersection(pf_albums)))
print('PF Rating-Album Ratings', len(pf_albums.intersection(album_albums)))

# Why are these numbers so tiny

Spotify-Album Ratings 7187
Spotify-PF Ratings 94
PF Rating-Album Ratings 78


In [65]:
# The albums that exist in all three tables
# Some of these might not even be right since they might be by different artists
pf_albums.intersection(album_albums).intersection(spotify_albums)

{'#1',
 '10',
 '1000',
 '119',
 '11:11',
 '13',
 '18',
 '19',
 '1966',
 '1977',
 '1988',
 '1999',
 '2',
 '23',
 '25',
 '3',
 '333',
 '4',
 '5',
 '55',
 '6',
 '7',
 '77',
 '8',
 '9',
 '92982',
 'ii',
 'neo',
 'pom pom',
 'utopia'}

In [79]:
pf.head()

Unnamed: 0,reviewid,album_name,artist,pf_url,pf_score,pf_author,pf_pubdate,pf_review
0,22703,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,nate patrin,2017-01-08,"“Trip-hop” eventually became a ’90s punchline,..."
1,22721,prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,zoe camp,2017-01-07,"Eight years, five albums, and two EPs in, the ..."
2,22659,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,david glickman,2017-01-07,Minneapolis’ Uranium Club seem to revel in bei...
3,22661,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,jenn pelly,2017-01-06,Kleenex began with a crash. It transpired one ...
4,22725,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,kevin lozano,2017-01-06,It is impossible to consider a given release b...


In [81]:
pf.sort_values('pf_score', ascending = False)['album_name'][:50]

9530                               murmur [deluxe edition]
18079                                              animals
2474         it takes a nation of millions to hold us back
857                                     sign "o" the times
1399                                          astral weeks
5965                  tago mago [40th anniversary edition]
10206    otis blue: otis redding sings soul [collector'...
8634                                            stereo box
4789                                                voodoo
530                                             in concert
13174                born to run: 30th anniversary edition
9320                                       paul's boutique
1092                                          off the wall
355                                    another green world
14565      london calling: 25th anniversary legacy edition
7094                         emergency & i [vinyl reissue]
2479     the velvet underground  45th anniversary super.