In [22]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sts
fuzzy_csv = '../Resources/fuzzy_df.csv'
billboard_csv = '../Resources/billboard_lyrics_1964-2015.csv'
song_data_csv = '../Resources/song_data.csv'

In [33]:
fuzzy_df = pd.read_csv(fuzzy_csv)
billboard_df = pd.read_csv(billboard_csv, encoding = 'ISO-8859-1')
song_data_df = pd.read_csv(song_data_csv)

In [34]:
fuzzy_df = fuzzy_df.rename(columns={'Unnamed: 0':'Order'})

In [35]:
fuzzy_df.head()

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Song_List_Index
0,wooly bully,only lonely,64,3209
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,3863
2,i cant get no satisfaction,i cant get no satisfaction,100,251
3,you were on my mind,youre on my mind,91,11156
4,youve lost that lovin feelin,youve got the love,70,1959


In [36]:
song_data_df.columns

Index(['song_name', 'song_popularity', 'song_duration_ms', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'audio_mode', 'speechiness', 'tempo', 'time_signature',
       'audio_valence'],
      dtype='object')

In [37]:
# adding Song List Index column to merge with the fuzzy_df
song_data_df = song_data_df.reset_index()

In [38]:
song_data_df['Song_List_Index'] = range(len(song_data_df))

In [39]:
song_data_df

Unnamed: 0,index,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Song_List_Index
0,0,Boulevard of Broken Dreams,73,262333,0.005520,0.496,0.682,0.000029,8,0.0589,-4.095,1,0.0294,167.060,4,0.474,0
1,1,In The End,66,216933,0.010300,0.542,0.853,0.000000,3,0.1080,-6.407,0,0.0498,105.256,4,0.370,1
2,2,Seven Nation Army,76,231733,0.008170,0.737,0.463,0.447000,0,0.2550,-7.828,1,0.0792,123.881,4,0.324,2
3,3,By The Way,74,216933,0.026400,0.451,0.970,0.003550,0,0.1020,-4.938,1,0.1070,122.444,4,0.198,3
4,4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.000000,10,0.1130,-5.065,1,0.0313,172.011,4,0.574,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18830,18830,Let It Breathe,60,159645,0.893000,0.500,0.151,0.000065,11,0.1110,-16.107,1,0.0348,113.969,4,0.300,18830
18831,18831,Answers,60,205666,0.765000,0.495,0.161,0.000001,11,0.1050,-14.078,0,0.0301,94.286,4,0.265,18831
18832,18832,Sudden Love (Acoustic),23,182211,0.847000,0.719,0.325,0.000000,0,0.1250,-12.222,1,0.0355,130.534,4,0.286,18832
18833,18833,Gentle on My Mind,55,352280,0.945000,0.488,0.326,0.015700,3,0.1190,-12.020,1,0.0328,106.063,4,0.323,18833


In [40]:
song_data_df = song_data_df.drop(columns={'song_popularity','song_duration_ms'})

In [41]:
song_data_df.head()

Unnamed: 0,index,song_name,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Song_List_Index
0,0,Boulevard of Broken Dreams,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474,0
1,1,In The End,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37,1
2,2,Seven Nation Army,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324,2
3,3,By The Way,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198,3
4,4,How You Remind Me,0.000954,0.447,0.766,0.0,10,0.113,-5.065,1,0.0313,172.011,4,0.574,4


In [42]:
# merging
matched_df = pd.merge(fuzzy_df,song_data_df,on='Song_List_Index')

In [43]:
matched_df

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Song_List_Index,index,song_name,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,wooly bully,only lonely,64,3209,3209,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
1,youre only lonely,only lonely,79,3209,3209,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
2,only the lonely,only lonely,85,3209,3209,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
3,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,3863,3863,"I Can't Help Myself (Sugar Pie, Honey Bunch)",0.245000,0.667,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.9710
4,i cant get no satisfaction,i cant get no satisfaction,100,251,251,(I Can't Get No) Satisfaction,0.000380,0.735,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.8590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,here,where,89,7203,7203,where,0.760000,0.767,0.330,0.758000,5,0.5970,-12.559,0,0.0429,121.901,3,0.1950
5096,waves,waves,100,7346,7346,Waves,0.039100,0.571,0.573,0.000009,10,0.3310,-4.533,1,0.0383,96.122,4,0.0542
5097,el perdon,el perdón,89,10269,10269,El Perdón,0.446000,0.628,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.6620
5098,night changes,dont change,75,746,746,Don't Change,0.000059,0.206,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.1550


In [26]:
matched_df = matched_df.sort_values('Unnamed: 0').reset_index(inplace=False, drop=False)
matched_df

Unnamed: 0.1,level_0,Unnamed: 0,Song Name,Matched Name,Ratio Value,Song List Index,index,song_name,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,0,0,wooly bully,only lonely,64,3209,3209,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.214
1,3,1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,3863,3863,"I Can't Help Myself (Sugar Pie, Honey Bunch)",0.245000,0.667,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971
2,4,2,i cant get no satisfaction,i cant get no satisfaction,100,251,251,(I Can't Get No) Satisfaction,0.000380,0.735,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.859
3,6,3,you were on my mind,youre on my mind,91,11156,11156,You're on My Mind,0.814000,0.651,0.128,0.908000,2,0.0596,-14.998,1,0.0511,160.001,4,0.238
4,9,4,youve lost that lovin feelin,youve got the love,70,1959,1959,You've Got The Love,0.004070,0.571,0.708,0.000000,6,0.1250,-4.740,1,0.0329,109.899,4,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,5097,5095,el perdon,el perdón,89,10269,10269,El Perdón,0.446000,0.628,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.662
5096,2265,5096,she knows,heaven knows,76,10584,10584,Heaven Knows,0.035100,0.750,0.564,0.000069,9,0.1120,-7.956,0,0.0462,96.969,4,0.264
5097,5098,5097,night changes,dont change,75,746,746,Don't Change,0.000059,0.206,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.155
5098,5099,5098,back to back,back to back,100,3684,3684,Back To Back,0.008980,0.635,0.472,0.000000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373


In [46]:
matched_df.rename(columns={'Unnamed: 0':'order'}).drop(columns='level_0')

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Song_List_Index,index,song_name,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,wooly bully,only lonely,64,3209,3209,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
1,youre only lonely,only lonely,79,3209,3209,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
2,only the lonely,only lonely,85,3209,3209,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
3,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,3863,3863,"I Can't Help Myself (Sugar Pie, Honey Bunch)",0.245000,0.667,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.9710
4,i cant get no satisfaction,i cant get no satisfaction,100,251,251,(I Can't Get No) Satisfaction,0.000380,0.735,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.8590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,here,where,89,7203,7203,where,0.760000,0.767,0.330,0.758000,5,0.5970,-12.559,0,0.0429,121.901,3,0.1950
5096,waves,waves,100,7346,7346,Waves,0.039100,0.571,0.573,0.000009,10,0.3310,-4.533,1,0.0383,96.122,4,0.0542
5097,el perdon,el perdón,89,10269,10269,El Perdón,0.446000,0.628,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.6620
5098,night changes,dont change,75,746,746,Don't Change,0.000059,0.206,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.1550


In [47]:
matched_df.to_csv('../Resources/matched_df.csv',index=False)

In [48]:
matched_df = pd.read_csv('../Resources/matched_df.csv')

In [55]:
matched_df = matched_df.drop(columns=['Song_List_Index','index'])
matched_df

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,song_name,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,wooly bully,only lonely,64,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
1,youre only lonely,only lonely,79,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
2,only the lonely,only lonely,85,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140
3,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,"I Can't Help Myself (Sugar Pie, Honey Bunch)",0.245000,0.667,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.9710
4,i cant get no satisfaction,i cant get no satisfaction,100,(I Can't Get No) Satisfaction,0.000380,0.735,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.8590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,here,where,89,where,0.760000,0.767,0.330,0.758000,5,0.5970,-12.559,0,0.0429,121.901,3,0.1950
5096,waves,waves,100,Waves,0.039100,0.571,0.573,0.000009,10,0.3310,-4.533,1,0.0383,96.122,4,0.0542
5097,el perdon,el perdón,89,El Perdón,0.446000,0.628,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.6620
5098,night changes,dont change,75,Don't Change,0.000059,0.206,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.1550


In [56]:
# Import billboard_df to merge with high_ratio df

billboard_df = pd.read_csv('../Resources/billboard_df.csv')
billboard_df

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source,fuzzy_name,fuzzy_artist
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0,wooly bully,sam the sham and the pharaohs
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0,i cant help myself sugar pie honey bunch,four tops
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0,i cant get no satisfaction,the rolling stones
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0,you were on my mind,we five
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0,youve lost that lovin feelin,the righteous brothers
...,...,...,...,...,...,...,...,...
5095,96,el perdon,nicky jam and enrique iglesias,2015,enrique iglesias dime si es verdad me dijeron ...,3.0,el perdon,nicky jam and enrique iglesias
5096,97,she knows,neyo featuring juicy j,2015,,,she knows,neyo featuring juicy j
5097,98,night changes,one direction,2015,going out tonight changes into something red ...,1.0,night changes,one direction
5098,99,back to back,drake,2015,oh man oh man oh man not againyeah i learned ...,1.0,back to back,drake


In [57]:
# Drop unnecessary columns before merging, only want the rank, year, and name (for matching)

billboard_df = billboard_df.drop(columns=['Song','Artist','Lyrics','Source','fuzzy_artist'])
billboard_df = billboard_df.rename(columns={'fuzzy_name':'Billboard Song Name'})
billboard_df

Unnamed: 0,Rank,Year,Billboard Song Name
0,1,1965,wooly bully
1,2,1965,i cant help myself sugar pie honey bunch
2,3,1965,i cant get no satisfaction
3,4,1965,you were on my mind
4,5,1965,youve lost that lovin feelin
...,...,...,...
5095,96,2015,el perdon
5096,97,2015,she knows
5097,98,2015,night changes
5098,99,2015,back to back


In [58]:
# Join dataframes on index

composite_df = matched_df.join(billboard_df,how='inner')

In [59]:
composite_df

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,song_name,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Rank,Year,Billboard Song Name
0,wooly bully,only lonely,64,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140,1,1965,wooly bully
1,youre only lonely,only lonely,79,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140,2,1965,i cant help myself sugar pie honey bunch
2,only the lonely,only lonely,85,Only Lonely,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.2140,3,1965,i cant get no satisfaction
3,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,"I Can't Help Myself (Sugar Pie, Honey Bunch)",0.245000,0.667,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.9710,4,1965,you were on my mind
4,i cant get no satisfaction,i cant get no satisfaction,100,(I Can't Get No) Satisfaction,0.000380,0.735,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.8590,5,1965,youve lost that lovin feelin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,here,where,89,where,0.760000,0.767,0.330,0.758000,5,0.5970,-12.559,0,0.0429,121.901,3,0.1950,96,2015,el perdon
5096,waves,waves,100,Waves,0.039100,0.571,0.573,0.000009,10,0.3310,-4.533,1,0.0383,96.122,4,0.0542,97,2015,she knows
5097,el perdon,el perdón,89,El Perdón,0.446000,0.628,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.6620,98,2015,night changes
5098,night changes,dont change,75,Don't Change,0.000059,0.206,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.1550,99,2015,back to back


In [60]:
composite_df.to_csv('../Resources/composite_df.csv',index=False)