In [1]:
import pandas as pd
import numpy as np
import os
import sqlite3
import re
from matplotlib import pyplot as plt
# from matplotlib_venn import venn3

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Pre-Process Data for Database
In this notebook, we pre-process the three primary tables we plan to utilize for our application:
1. Album of The Year critic ratings
- Features user and critic review aggregated by Album of the Year and Metacritic
- The dataset includes more than 30K rows of albums and aggregated critic and user review scores
- Link to source https://www.kaggle.com/kauvinlucas/30000-albums-aggregated-review-ratings

2. Spotify API
- More than 1.2M songs collected from Spotify’s API
- The dataset includes features like name, artists, album, release date, key, explicit flag, duration, popularity, danceability, and more
- The data was created in December 2020. Each row represents a single track
- Link to source https://www.kaggle.com/rodolfofigueroa/spotify-12m-songs

3. Pitchfork reviews
- Includes over 18K Pitchfork (an online music magazine) review going back to January 1999
- The database contains separate tables on artists, content, genres, labels, reviews, and years
- Link to source https://www.kaggle.com/nolanbconaway/pitchfork-data

The notebook will read in the datasets, process, and output the final datasets in csv that will ultimately be uploaded to AWS RDS.

# Read the datasets

## 1. AOTY Ratings

In [13]:
# Read
df_ratings = pd.read_csv('./album_ratings.csv') # Metacritic/AOTY

print(df_ratings.shape)
df_ratings.head(2)

(32358, 16)


Unnamed: 0,Artist,Title,Release Month,Release Day,Release Year,Format,Label,Genre,Metacritic Critic Score,Metacritic Reviews,Metacritic User Score,Metacritic User Reviews,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews
0,Neko Case,Middle Cyclone,March,3,2009,LP,ANTI-,Alt-Country,79.0,31.0,8.7,31.0,79,25,78,55
1,Jason Isbell & The 400 Unit,Jason Isbell & The 400 Unit,February,17,2009,LP,Thirty Tigers,Country Rock,70.0,14.0,8.4,7.0,73,11,73,8


In [4]:
# See data summary
df_ratings.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Release Year,32358.0,2006.66781,14.132255,1940.0,2002.0,2012.0,2017.0,2020.0
Metacritic Critic Score,13884.0,73.285148,8.399107,15.0,69.0,74.0,79.0,98.0
Metacritic Reviews,13884.0,12.483434,7.911986,4.0,6.0,10.0,17.0,51.0
Metacritic User Score,9099.0,7.78202,0.978036,1.4,7.4,8.0,8.5,9.5
Metacritic User Reviews,10574.0,63.163893,325.204608,3.0,8.0,14.0,37.0,14720.0
AOTY Critic Score,32358.0,72.808332,11.214989,4.0,67.0,74.0,80.0,100.0
AOTY Critic Reviews,32358.0,5.645003,6.179734,1.0,2.0,3.0,7.0,48.0
AOTY User Score,32358.0,71.283794,9.653036,0.0,67.0,73.0,78.0,100.0
AOTY User Reviews,32358.0,61.988782,184.221848,1.0,7.0,14.0,39.0,4530.0


## 2. Spotify API

In [14]:
# Read
df_spotify = pd.read_csv('./tracks_features.csv') #Spotify/Kaggle 1.2M tracks
print(df_spotify.shape)
df_spotify.head(2)

(1204025, 24)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02


In [6]:
# See data summary
df_spotify.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
track_number,1204025.0,7.656352,5.994977,1.0,3.0,7.0,10.0,50.0
disc_number,1204025.0,1.055906,0.295375,1.0,1.0,1.0,1.0,13.0
danceability,1204025.0,0.493057,0.189669,0.0,0.356,0.501,0.633,1.0
energy,1204025.0,0.509536,0.294684,0.0,0.252,0.524,0.766,1.0
key,1204025.0,5.194151,3.536731,0.0,2.0,5.0,8.0,11.0
loudness,1204025.0,-11.808703,6.982132,-60.0,-15.254,-9.791,-6.717,7.234
mode,1204025.0,0.671459,0.469683,0.0,0.0,1.0,1.0,1.0
speechiness,1204025.0,0.084382,0.115991,0.0,0.0351,0.0446,0.0723,0.969
acousticness,1204025.0,0.446751,0.385201,0.0,0.0376,0.389,0.861,0.996
instrumentalness,1204025.0,0.282861,0.376284,0.0,8e-06,0.00808,0.719,1.0


## 3. Pitchfork reviews

In [11]:
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("./database.sqlite")

# Read
query_str = \
"""
SELECT 
    t1.reviewid
    , t1.title
    , t1.artist
    , t1.url
    , t1.score
    , t1.author
    , t1.pub_date
    , t1.best_new_music
    , t2.content
    
FROM reviews t1 JOIN content t2 ON t1.reviewid = t2.reviewid
"""

df_pitchfork = pd.read_sql_query(query_str, con)
con.close()

print(df_pitchfork.shape)
df_pitchfork.head(2)

(18401, 9)


Unnamed: 0,reviewid,title,artist,url,score,author,pub_date,best_new_music,content
0,22703,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,nate patrin,2017-01-08,0,"“Trip-hop” eventually became a ’90s punchline,..."
1,22721,prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,zoe camp,2017-01-07,0,"Eight years, five albums, and two EPs in, the ..."


# Pre-process

#### All datasets: Select only the columns we need, rename fields, and create a lowercased version of the album and artist
The lowercased album and artist will help joins across the three datasets.

In [15]:
# AOTY
df_ratings = df_ratings[[
    'Artist'
    , 'Title'
    , 'Format'
    , 'Label'
    , 'Genre'
    , 'AOTY Critic Score'
    , 'AOTY Critic Reviews'
    , 'AOTY User Score'
    , 'AOTY User Reviews'  
]].copy()
df_ratings = df_ratings.rename(columns={"Title": "album_name", "Artist": "artist"})
df_ratings['album_name_lc'] = df_ratings['album_name'].str.lower()
df_ratings['artist_lc'] = df_ratings['artist'].str.lower()

# Spotify
# Select all columns for this dataset
df_spotify = df_spotify.rename(
                            columns={"id": "track_id"
                                     , "name": "track_name"
                                     , "album": "album_name"
                                     , "artists": "artist"
                                    })
df_spotify['album_name_lc'] = df_spotify['album_name'].str.lower()
df_spotify['artist_lc'] = df_spotify['artist'].str.lower()

# Pitchfork
df_pitchfork = df_pitchfork.rename(
                                columns={
                                    "title": "album_name",
                                    "url": "pf_url",
                                    "score": "pf_score",
                                    "author": "pf_author",
                                    "pub_date": "pf_pubdate",
                                    "content": "pf_review"
                                })
df_pitchfork['album_name_lc'] = df_pitchfork['album_name'].str.lower()
df_pitchfork['artist_lc'] = df_pitchfork['artist'].str.lower()

#### Spotify API: Parse out the first artist

In [17]:
# Function for cleaning Spotify artists
def getFirstArtist(s):
    '''
    This function removes the list like structure of the "artists" field in the Spotify dataset. Pick the first artist in the list
    
    Input : List of artists (str)
    Output: One artist (str)
    '''
    s = s.replace("[", "").replace("]", "")
    first_element = s.find(",")
    return s[:first_element].replace("'", "")

# Clean
df_spotify['artist'] = df_spotify['artist'].apply(lambda s:getFirstArtist(s))
df_spotify['artist_lc'] = df_spotify['artist_lc'].apply(lambda s:getFirstArtist(s))
df_spotify['artist_ids'] = df_spotify['artist_ids'].apply(lambda s:getFirstArtist(s))

df_spotify.head()

Unnamed: 0,track_id,track_name,album_name,album_id,artist,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,album_name_lc,artist_lc
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,1,1,False,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,2,1,True,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,3,1,False,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,4,1,True,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,5,1,False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine


#### Pitchfork Reviews: Clean the album names so that future joins are more likely to match across the three datasets

In [18]:
# Function for cleaning Pitchfork album names
def cleanPitchforkAlbumNames(s):
    '''
    This function sanitizes the album names in the Pitchfork dataset. The album names are rather specific, and will include details
    like "40th anniversary edition". So let's remove that
    
    Input : A dirty album name (str)
    Output: A clean album name (str)
    '''
    
    # Remove anything in brackets. Usually this will denote the edition
    bracket_check = s.find("[")
    if bracket_check != -1:
        s = s[:bracket_check].strip()
        
    # Remove anything that comes after a "n"th annivesary edition using regex
    nth_anniv_check = re.search(r'[0-9]+(th|st|nd|rd)', s)
    if nth_anniv_check:
        ix = nth_anniv_check.start()
        s = s[:ix].strip(": ")
    
    return s

# Clean
df_pitchfork['orig_album_name'] = df_pitchfork['album_name']
df_pitchfork['album_name'] = df_pitchfork['album_name'].apply(lambda s:cleanPitchforkAlbumNames(s))

# Check that PF album sanitation worked
# 'album_name' is the sanitized field
df_pitchfork.sort_values('pf_score', ascending = False)[['orig_album_name', 'album_name']][:10]

Unnamed: 0,orig_album_name,album_name
9530,murmur [deluxe edition],murmur
18079,animals,animals
2474,it takes a nation of millions to hold us back,it takes a nation of millions to hold us back
857,"sign ""o"" the times","sign ""o"" the times"
1399,astral weeks,astral weeks
5965,tago mago [40th anniversary edition],tago mago
10206,otis blue: otis redding sings soul [collector'...,otis blue: otis redding sings soul
8634,stereo box,stereo box
4789,voodoo,voodoo
530,in concert,in concert


# General EDA

## How many albums overlap between the three datasets?

In [10]:
# Subset to only albums and artists
t1 = df_ratings[['album_name_lc', 'artist_lc']].drop_duplicates()
t2 = df_spotify[['album_name_lc', 'artist_lc']].drop_duplicates()
t3 = df_pitchfork[['album_name_lc', 'artist_lc']].drop_duplicates()

# How many unique albums are there in each dataset?
# Albums with same name but different artist are counted as different
print('How many unique albums are there in each dataset?')
print('AOTY: ', len(t1))
print('Spotify: ', len(t2))
print('Pitchfork: ', len(t3))

How many unique albums are there in each dataset?
AOTY:  32315
Spotify:  187202
Pitchfork:  18350


In [11]:
# Setup data for Venn Diagram
n3 = len(t1.merge(t2, how = 'inner')) # AOTY-Spotify
n5 = len(t1.merge(t3, how = 'inner')) # AOTY-PF
n6 = len(t2.merge(t3, how = 'inner')) # Spotify-PF
n7 = len(t1.merge(t2, how = 'inner').merge(t3, how = 'inner')) # All three

n1 = len(t1) - n3 - n5 + n7
n2 = len(t2) - n3 - n6 + n7
n4 = len(t3) - n5 - n6 + n7

album_nums = [
    n1
    , n2
    , n3
    , n4
    , n5
    , n6
    , n7
]

labels = ['AOTY', 'Spotify', 'Pitchfork']

In [12]:
# Graph Venn Diagram
# venn3(subsets = album_nums, set_labels = labels, alpha = 0.5)
# plt.title('How many unique albums overlap?')
# plt.show()

## 4. Genres

In [13]:
df_genres = pd.read_csv('genres-revised.csv')
df_genres.head()

Unnamed: 0,Rock & Pop,R&B,Hip-Hop,Dance & Electronic,Country & Folk,Jazz,Other,Metal & Punk,Alternative Rock & Pop
0,Singer-Songwriter,Blue-eyed Soul,Hip Hop,Electronic,Country,Jazz,Gospel,Alternative Metal,Chillwave
1,UK Garage,R&B,Trap Rap,Dubstep,Bluegrass,Jazz-Rock,MPB,Metalcore,Ambient Pop
2,AOR,Alternative R&B,Trap,IDM,Americana,Free Improvisation,Reggae,Death Metal,Indie Pop
3,Sophisti-Pop,Neo-Soul,Grime,Deconstructed Club,American Primitivism,Jazz Fusion,Roots Reggae,Nu Metal,Electropop
4,Neo-Psychedelia,Soul,Experimental Hip-Hop,Dance Pop,Folk,Avant-Garde Jazz,Reggaeton,Blackgaze,Synthpop


#### Genres: Add Ids to Ratings

In [14]:
# Functions to transform genres file and apply to ratings
def rename_genres(genre):
    for column in df_genres:
        if genre in df_genres[column].values:
            return column

def genre_index(genre):
    if genre in df_genres.columns:
        return df_genres.columns.get_loc(genre)

# narrow genre options for the ratings data
df_ratings['Genre'] = df_ratings['Genre'].apply(lambda d: rename_genres(d))
df_ratings['genre_id'] = df_ratings['Genre'].apply(lambda d: genre_index(d))

df_ratings['genre_id'] = pd.to_numeric(df_ratings['genre_id'], downcast='integer')
df_genre_id_column = pd.DataFrame(
    {
        "genre_id": pd.Series(df_ratings['genre_id'], dtype=np.dtype("int64"))
    }
)
df_genre_id_column = df_genre_id_column.convert_dtypes()
df_ratings['genre_id'] = df_genre_id_column['genre_id']

df_ratings.head()

Unnamed: 0,artist,album_name,Format,Label,Genre,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews,album_name_lc,artist_lc,genre_id
0,Neko Case,Middle Cyclone,LP,ANTI-,Alternative Rock & Pop,79,25,78,55,middle cyclone,neko case,8
1,Jason Isbell & The 400 Unit,Jason Isbell & The 400 Unit,LP,Thirty Tigers,Country & Folk,73,11,73,8,jason isbell & the 400 unit,jason isbell & the 400 unit,4
2,Animal Collective,Merriweather Post Pavilion,LP,Domino,Alternative Rock & Pop,92,30,87,1335,merriweather post pavilion,animal collective,8
3,Bruce Springsteen,Working on a Dream,LP,Columbia Records,Rock & Pop,70,23,66,38,working on a dream,bruce springsteen,0
4,Andrew Bird,Noble Beast,LP,Fat Possum,Rock & Pop,74,24,78,44,noble beast,andrew bird,0


## 5. Labels

In [15]:
df_labels = pd.DataFrame({"Label": df_ratings['Label'].drop_duplicates()})
df_labels.head()

Unnamed: 0,Label
0,ANTI-
1,Thirty Tigers
2,Domino
3,Columbia Records
4,Fat Possum


In [16]:
def label_index(label):
    for row in df_labels['Label']:
        if row == label:
            return df_labels.index[df_labels['Label'] == label][0]

#### Labels: Add Ids to Ratings

In [17]:
df_ratings['label_id'] = df_ratings['Label'].apply(lambda d: label_index(d))

df_label_id_column = pd.DataFrame(
    {
        "label_id": pd.Series(df_ratings['label_id'], dtype=np.dtype("int64"))
    }
)
df_label_id_column = df_label_id_column.convert_dtypes()
df_ratings['label_id'] = df_label_id_column['label_id']

df_ratings.head()

Unnamed: 0,artist,album_name,Format,Label,Genre,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews,album_name_lc,artist_lc,genre_id,label_id
0,Neko Case,Middle Cyclone,LP,ANTI-,Alternative Rock & Pop,79,25,78,55,middle cyclone,neko case,8,0
1,Jason Isbell & The 400 Unit,Jason Isbell & The 400 Unit,LP,Thirty Tigers,Country & Folk,73,11,73,8,jason isbell & the 400 unit,jason isbell & the 400 unit,4,1
2,Animal Collective,Merriweather Post Pavilion,LP,Domino,Alternative Rock & Pop,92,30,87,1335,merriweather post pavilion,animal collective,8,2
3,Bruce Springsteen,Working on a Dream,LP,Columbia Records,Rock & Pop,70,23,66,38,working on a dream,bruce springsteen,0,3
4,Andrew Bird,Noble Beast,LP,Fat Possum,Rock & Pop,74,24,78,44,noble beast,andrew bird,0,4


In [18]:
df_ratings.head()

Unnamed: 0,artist,album_name,Format,Label,Genre,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews,album_name_lc,artist_lc,genre_id,label_id
0,Neko Case,Middle Cyclone,LP,ANTI-,Alternative Rock & Pop,79,25,78,55,middle cyclone,neko case,8,0
1,Jason Isbell & The 400 Unit,Jason Isbell & The 400 Unit,LP,Thirty Tigers,Country & Folk,73,11,73,8,jason isbell & the 400 unit,jason isbell & the 400 unit,4,1
2,Animal Collective,Merriweather Post Pavilion,LP,Domino,Alternative Rock & Pop,92,30,87,1335,merriweather post pavilion,animal collective,8,2
3,Bruce Springsteen,Working on a Dream,LP,Columbia Records,Rock & Pop,70,23,66,38,working on a dream,bruce springsteen,0,3
4,Andrew Bird,Noble Beast,LP,Fat Possum,Rock & Pop,74,24,78,44,noble beast,andrew bird,0,4


## Create Dataset of All But Pitchfork

In [19]:
df_merged = pd.merge(df_spotify, df_ratings, on=['artist_lc', 'album_name_lc'])
df_merged.head()

Unnamed: 0,track_id,track_name,album_name_x,album_id,artist_x,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,album_name_lc,artist_lc,artist_y,album_name_y,Format,Label,Genre,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews,genre_id,label_id
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,1,1,False,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,2,1,True,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,3,1,False,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,4,1,True,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,Rage Against The Machine,2d0hyoQ5ynDBnkvAbJKORj,5,1,False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,the battle of los angeles,rage against the machine,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21


### Create .csv Files

In [20]:
df_culled = df_merged.drop(columns=['album_name_x', 'artist_x', 'album_name_lc', 'artist_lc', 'release_date'])
df_culled = df_culled.rename(columns={
    "album_name_y": "album_name",
    "artist_y": "artist"
})
df_culled.isna().sum()

track_id                   0
track_name                 0
album_id                   0
artist_ids                 0
track_number               0
disc_number                0
explicit                   0
danceability               0
energy                     0
key                        0
loudness                   0
mode                       0
speechiness                0
acousticness               0
instrumentalness           0
liveness                   0
valence                    0
tempo                      0
duration_ms                0
time_signature             0
year                       0
artist                     0
album_name                 0
Format                     0
Label                   5287
Genre                  12543
AOTY Critic Score          0
AOTY Critic Reviews        0
AOTY User Score            0
AOTY User Reviews          0
genre_id               12543
label_id                5287
dtype: int64

In [21]:
df_culled.head()

Unnamed: 0,track_id,track_name,album_id,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,artist,album_name,Format,Label,Genre,AOTY Critic Score,AOTY Critic Reviews,AOTY User Score,AOTY User Reviews,genre_id,label_id
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,2eia0myWFgoHuttJytCxgX,2d0hyoQ5ynDBnkvAbJKORj,1,1,False,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,2eia0myWFgoHuttJytCxgX,2d0hyoQ5ynDBnkvAbJKORj,2,1,True,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,2eia0myWFgoHuttJytCxgX,2d0hyoQ5ynDBnkvAbJKORj,3,1,False,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,2eia0myWFgoHuttJytCxgX,2d0hyoQ5ynDBnkvAbJKORj,4,1,True,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,2eia0myWFgoHuttJytCxgX,2d0hyoQ5ynDBnkvAbJKORj,5,1,False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,Rage Against The Machine,The Battle of Los Angeles,LP,Epic,Metal & Punk,78,7,82,293,7,21


In [25]:
df_artists = df_culled[['artist', 'artist_ids']].drop_duplicates()
df_artists.to_csv(path_or_buf='~/code/penn/CIS550/CIS550_Final_Project/data/seed/artist.csv')

In [26]:
df_genre = df_culled[['Genre', 'genre_id']].dropna().drop_duplicates()
df_genre.to_csv(path_or_buf='~/code/penn/CIS550/CIS550_Final_Project/data/seed/genre.csv')

In [27]:
df_label = df_culled[['Label', 'label_id']].dropna().drop_duplicates()
df_label.to_csv(path_or_buf='~/code/penn/CIS550/CIS550_Final_Project/data/seed/label.csv')

In [28]:
df_album = df_culled[['album_id', 'album_name', 'artist_ids', 'genre_id', 'label_id', 'year', 'Format', 'AOTY Critic Score', 'AOTY Critic Reviews', 'AOTY User Score', 'AOTY User Reviews']].drop_duplicates()
df_album.to_csv(path_or_buf='~/code/penn/CIS550/CIS550_Final_Project/data/seed/album.csv')

In [29]:
df_songs = df_culled[['track_id', 'track_name', 'album_id', 'disc_number', 'track_number', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'explicit', 'duration_ms', 'time_signature']].drop_duplicates()
df_songs.to_csv(path_or_buf='~/code/penn/CIS550/CIS550_Final_Project/data/seed/song.csv')

In [30]:
len(df_album)

4986

# Create Reviews and Authors (from Pitchfork)

In [19]:
df_merged2 = pd.merge(df_spotify, df_pitchfork, on=['artist_lc', 'album_name_lc'], how = 'inner')
df_merged2.head()

Unnamed: 0,track_id,track_name,album_name_x,album_id,artist_x,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,album_name_lc,artist_lc,reviewid,album_name_y,artist_y,pf_url,pf_score,pf_author,pf_pubdate,best_new_music,pf_review,orig_album_name
0,6Glv3rhKQ5Lo8wBzSC4IGA,The New Year,Transatlanticism,2sfLsbSsDm780Llr9NWHQz,Death Cab for Cutie,0YrtvWJMgSdVrk3SfNjTbx,1,1,False,0.311,0.742,0,-7.844,0,0.0419,0.00386,0.00147,0.124,0.19,123.572,246373,4.0,2003,2003-10-07,transatlanticism,death cab for cutie,18656,transatlanticism,death cab for cutie,http://pitchfork.com/reviews/albums/18656-deat...,8.4,ian cohen,2013-11-06,0,The term “transatlanticism” was coined by Ben ...,transatlanticism
1,6Glv3rhKQ5Lo8wBzSC4IGA,The New Year,Transatlanticism,2sfLsbSsDm780Llr9NWHQz,Death Cab for Cutie,0YrtvWJMgSdVrk3SfNjTbx,1,1,False,0.311,0.742,0,-7.844,0,0.0419,0.00386,0.00147,0.124,0.19,123.572,246373,4.0,2003,2003-10-07,transatlanticism,death cab for cutie,2232,transatlanticism,death cab for cutie,http://pitchfork.com/reviews/albums/2232-trans...,6.4,william morris,2003-10-07,0,"[EARLY Rough Draft of Death Cab Review, 10/7/0...",transatlanticism
2,34ErnTqmuACVJ5qquF1Rsa,Lightness,Transatlanticism,2sfLsbSsDm780Llr9NWHQz,Death Cab for Cutie,0YrtvWJMgSdVrk3SfNjTbx,2,1,False,0.696,0.378,0,-12.495,1,0.0258,0.0762,0.000801,0.677,0.0852,109.795,210213,4.0,2003,2003-10-07,transatlanticism,death cab for cutie,18656,transatlanticism,death cab for cutie,http://pitchfork.com/reviews/albums/18656-deat...,8.4,ian cohen,2013-11-06,0,The term “transatlanticism” was coined by Ben ...,transatlanticism
3,34ErnTqmuACVJ5qquF1Rsa,Lightness,Transatlanticism,2sfLsbSsDm780Llr9NWHQz,Death Cab for Cutie,0YrtvWJMgSdVrk3SfNjTbx,2,1,False,0.696,0.378,0,-12.495,1,0.0258,0.0762,0.000801,0.677,0.0852,109.795,210213,4.0,2003,2003-10-07,transatlanticism,death cab for cutie,2232,transatlanticism,death cab for cutie,http://pitchfork.com/reviews/albums/2232-trans...,6.4,william morris,2003-10-07,0,"[EARLY Rough Draft of Death Cab Review, 10/7/0...",transatlanticism
4,21DVu4p5UmmdpJf3xyF4zF,Title and Registration,Transatlanticism,2sfLsbSsDm780Llr9NWHQz,Death Cab for Cutie,0YrtvWJMgSdVrk3SfNjTbx,3,1,False,0.752,0.544,9,-10.373,1,0.0314,0.343,0.0811,0.109,0.723,115.803,219267,4.0,2003,2003-10-07,transatlanticism,death cab for cutie,18656,transatlanticism,death cab for cutie,http://pitchfork.com/reviews/albums/18656-deat...,8.4,ian cohen,2013-11-06,0,The term “transatlanticism” was coined by Ben ...,transatlanticism


In [33]:
df_reviews = df_merged2[[
    'reviewid'
    , 'album_id'
    , 'pf_url'
    , 'pf_score'
    , 'best_new_music'
    , 'pf_pubdate'
    , 'pf_review'
]].drop_duplicates().reset_index(drop = True)

df_authors = df_merged2[['reviewid', 'pf_author']].drop_duplicates().reset_index(drop = True)

df_reviews.to_csv(path_or_buf='~/code/penn/CIS550/CIS550_Final_Project/data/seed/review.csv')
df_authors.to_csv(path_or_buf='~/code/penn/CIS550/CIS550_Final_Project/data/seed/author.csv')