In [62]:
#Import Dependencies & file paths

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
billboard_csv = '../Resources/billboard_lyrics_1964-2015.csv'
song_list_csv = '../Resources/song_info.csv'
song_data_csv = '../Resources/song_data.csv'

In [63]:
# Create dataframes from csv files - Billboard top 100 songs - 1964 through 2015
billboard_df = pd.read_csv(billboard_csv, encoding = 'ISO-8859-1')
billboard_df.head()

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0


In [64]:
# Create dataframes from csv files - 19,000 Spotify songs with album

song_df = pd.read_csv(song_list_csv)
song_df.head()

Unnamed: 0,song_name,artist_name,album_names,playlist
0,Boulevard of Broken Dreams,Green Day,Greatest Hits: God's Favorite Band,00s Rock Anthems
1,In The End,Linkin Park,Hybrid Theory,00s Rock Anthems
2,Seven Nation Army,The White Stripes,Elephant,00s Rock Anthems
3,By The Way,Red Hot Chili Peppers,By The Way (Deluxe Version),00s Rock Anthems
4,How You Remind Me,Nickelback,Silver Side Up,00s Rock Anthems


In [65]:
# Create dataframes from csv files - 19,000 Spotify songs with data

spotify_data_df = pd.read_csv(song_data_csv, encoding = 'ISO-8859-1')
spotify_data_df.head()

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,Boulevard of Broken Dreams,73,262333,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474
1,In The End,66,216933,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37
2,Seven Nation Army,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324
3,By The Way,74,216933,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.0,10,0.113,-5.065,1,0.0313,172.011,4,0.574


In [66]:
# Add columns to both dataframes with lowercase song names for better matching

billboard_df['fuzzy_name'] = billboard_df['Song'].apply(lambda x: x.lower())
song_df['fuzzy_name'] = song_df['song_name'].apply(lambda x: x.lower())
billboard_df['fuzzy_artist'] = billboard_df['Artist'].apply(lambda x: x.lower())
song_df['fuzzy_artist'] = song_df['artist_name'].apply(lambda x: x.lower())

# Checks ->
# billboard_df.head()
# song_df.head()

In [67]:
# Remove punctuation from lowercase song names to help with better matching

trans_table = str.maketrans('','',string.punctuation)
billboard_df['fuzzy_name'] = billboard_df['fuzzy_name'].apply(lambda x: x.translate(trans_table))
song_df['fuzzy_name'] = song_df['fuzzy_name'].apply(lambda x: x.translate(trans_table))
billboard_df['fuzzy_artist'] = billboard_df['fuzzy_artist'].apply(lambda x: x.translate(trans_table))
song_df['fuzzy_artist'] = song_df['fuzzy_artist'].apply(lambda x: x.translate(trans_table))

# Checks ->
# billboard_df.head()
# song_df.head()

In [68]:
# Export song_df to csv file for compiling notebook

song_df.to_csv('../Resources/song_df.csv',index=False)

In [69]:
# Export billboard_df as a separate csv file to be used in compiling notebook

billboard_df.to_csv('../Resources/billboard_df.csv',index=False)

In [70]:
# Testing fuzzywuzzy on a small sample size to see how it works

query = billboard_df['fuzzy_name'][2]
choices = song_df['fuzzy_name']

# ExtractOne only yields top result

result = process.extractOne(query,choices)
result

('i cant get no satisfaction', 100, 251)

In [71]:
# Show top 5 matches for same song, to look at ratio scores

results = process.extract(query,choices)
results

[('i cant get no satisfaction', 100, 251),
 ('i can', 90, 3751),
 ('satisfaction', 90, 5709),
 ('i can', 90, 9356),
 ('cant stop', 86, 13)]

In [72]:
# Double check index provided by fuzzywuzzy to ensure data integrity

song_df.loc[251]

song_name            (I Can't Get No) Satisfaction
artist_name                     The Rolling Stones
album_names     Hot Rocks (1964-1971) [Remastered]
playlist                          60s Rock Anthems
fuzzy_name              i cant get no satisfaction
fuzzy_artist                    the rolling stones
Name: 251, dtype: object

In [11]:
# Loop through Billboard dataframe and fuzzy match to list of spotify songs

# Initalize lists to hold query songs as well as the fuzzy mathced songs

query_name = []
fuzzy_list = []
query_artist = []

# Set up 'choices' parameter of fuzzywuzzy to tell the module what songs to look through to match our query

choices = song_df['fuzzy_name']

# For loop to loop through billboard songs and fuzzmatch to spotify list

for i in range(len(billboard_df)):
    
    query = billboard_df['fuzzy_name'][i]
    
    query_name.append(billboard_df['fuzzy_name'][i])
    query_artist.append(billboard_df['fuzzy_artist'][i])
    
    # Simple ratio checks for the similarty of the two strings based on some crazy math shit
#     if (process.extractOne(query,choices,scorer=fuzz.ratio)) == 100:
#         fuzzy_list.append(process.extractOne(query,choices,scorer=fuzz.ratio))
        
    # Partial ration looks at the similarty of the strings from the beginning, ie "Every Time We Touch" = "Every Time"
    #elif fuzzy_list(process.extractOne(query,choices,scorer=fuzz.partial_ratio)) == 100:
        #fuzzy_list.append(process.extractOne(query,choices,scorer=fuzz.partial_ratio))
        
    # Token sort ratio gives 100 if every word is same, irrespective of the position ie Dog Butt = Butt Dog
    #elif fuzzy_list(process.extractOne(query,choices,scorer=fuzz.token_sort_ratio)) == 100:
        #fuzzy_list.append(process.extractOne(query,choices,scorer=fuzz.token_sort_ratio))
        
    # If no matches are perfect above, just take the closest it can find
    #else:
        #fuzzy_list.append(process.extractOne(query,choices,scorer=fuzz.ratio))
    fuzzy_list.append(process.extractOne(query,choices,scorer=fuzz.ratio))


In [73]:
# Check results of same song as above to ensure similar result following the loop

fuzzy_list

[('only lonely', 64, 3209),
 ('i cant help myself sugar pie honey bunch', 100, 3863),
 ('i cant get no satisfaction', 100, 251),
 ('youre on my mind', 91, 11156),
 ('youve got the love', 70, 1959),
 ('london town', 74, 13871),
 ('help', 100, 14944),
 ('you are my heart', 70, 6881),
 ('crying in the club', 84, 4681),
 ('my girl', 100, 2045),
 ('noche de ronda', 71, 1348),
 ('on the road', 74, 1831),
 ('the hand that feeds', 68, 12914),
 ('then he kissed me', 62, 6903),
 ('shotgun', 100, 1161),
 ('i want you back', 69, 1175),
 ('this mountain', 67, 4779),
 ('the high road', 72, 16698),
 ('where do you go to my lovely', 58, 2112),
 ('stop in the name of love', 100, 6867),
 ('unchained melody', 100, 2063),
 ('silhouettes', 100, 5847),
 ('ill be there for you', 65, 17270),
 ('caravan', 67, 492),
 ('mr tambourine man', 100, 324),
 ('set fire to the rain', 65, 1372),
 ('es mi reina', 70, 3920),
 ('whats the use', 65, 11574),
 ('self destruction', 82, 4097),
 ('hang loose', 67, 3442),
 ('side 

In [74]:
# Because fuzzy_list is a list of tuples, we need to iterate through the list to parse out the tuples into seperate lists

matched_name = []
ratio_value = []
song_index = []

for i in range(len(fuzzy_list)):
    
    matched_name.append(fuzzy_list[i][0])
    ratio_value.append(fuzzy_list[i][1])
    song_index.append(fuzzy_list[i][2])


In [75]:
# Create a seperate dictionary to hold all results

fuzzy_dict = {'Song_Name':query_name,
             'Matched_Name':matched_name,
              'Artist':query_artist,
             'Ratio_Value':ratio_value,
             'Song_List_Index':song_index}

# Create dataframe from dictionary 

fuzzy_df = pd.DataFrame(fuzzy_dict)

In [76]:
# Preview dataframe of resuls

fuzzy_df

Unnamed: 0,Song_Name,Matched_Name,Artist,Ratio_Value,Song_List_Index
0,wooly bully,only lonely,sam the sham and the pharaohs,64,3209
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,four tops,100,3863
2,i cant get no satisfaction,i cant get no satisfaction,the rolling stones,100,251
3,you were on my mind,youre on my mind,we five,91,11156
4,youve lost that lovin feelin,youve got the love,the righteous brothers,70,1959
...,...,...,...,...,...
5095,el perdon,el perdón,nicky jam and enrique iglesias,89,10269
5096,she knows,heaven knows,neyo featuring juicy j,76,10584
5097,night changes,dont change,one direction,75,746
5098,back to back,back to back,drake,100,3684


In [77]:
# Export dataframe for use in merging and cleaning notebook

fuzzy_df.to_csv('../Resources/fuzzy_df.csv',index=False)
# Go to merging_cleaning notebook from here

In [78]:
# The rest of this takes place after the merging_cleaning notebook is complete
# Import composite_df that has all merges and joins

composite_df = pd.read_csv('../Resources/composite_df.csv')
composite_df

Unnamed: 0,Song_Name,Matched_Name,Artist,Matched_Artist,Ratio_Value,Year,Rank,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,wooly bully,only lonely,sam the sham and the pharaohs,the ballroom thieves,64,1965,1,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.214
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,four tops,four tops,100,1965,2,0.245000,0.667,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971
2,i cant get no satisfaction,i cant get no satisfaction,the rolling stones,the rolling stones,100,1965,3,0.000380,0.735,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.859
3,you were on my mind,youre on my mind,we five,nokiaa,91,1965,4,0.814000,0.651,0.128,0.908000,2,0.0596,-14.998,1,0.0511,160.001,4,0.238
4,youve lost that lovin feelin,youve got the love,the righteous brothers,florence the machine,70,1965,5,0.004070,0.571,0.708,0.000000,6,0.1250,-4.740,1,0.0329,109.899,4,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,el perdon,el perdón,nicky jam and enrique iglesias,nicky jam,89,2015,96,0.446000,0.628,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.662
5096,she knows,heaven knows,neyo featuring juicy j,gina chavez,76,2015,97,0.035100,0.750,0.564,0.000069,9,0.1120,-7.956,0,0.0462,96.969,4,0.264
5097,night changes,dont change,one direction,inxs,75,2015,98,0.000059,0.206,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.155
5098,back to back,back to back,drake,drake,100,2015,99,0.008980,0.635,0.472,0.000000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373


In [79]:
# Perform fuzzy matching on Artist and Matched_Artist to see if the song matching was correct
# Loop through composite dataframe and fuzzy match each row's Artist and Matched_Artist
# Initalize lists to hold ratio values for normal and partial ratios

fuzzy_ratio = []
fuzzy_partial = []

# For loop to loop through billboard songs and fuzzmatch to spotify list

for i in range(len(composite_df)):
    
    artist = composite_df['Artist'][i]
    matched_artist = composite_df['Matched_Artist'][i]
    fuzzy_ratio.append(fuzz.ratio(artist,matched_artist))
    fuzzy_partial.append(fuzz.partial_ratio(artist,matched_artist))

In [80]:
# List checks
# fuzzy_ratio
# fuzzy_partial

# Add values to dataframe

composite_df['Artist_Ratio'] = fuzzy_ratio
composite_df['Artist_Partial'] = fuzzy_partial
composite_df

Unnamed: 0,Song_Name,Matched_Name,Artist,Matched_Artist,Ratio_Value,Year,Rank,acousticness,danceability,energy,...,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Artist_Ratio,Artist_Partial
0,wooly bully,only lonely,sam the sham and the pharaohs,the ballroom thieves,64,1965,1,0.898000,0.412,0.276,...,7,0.0955,-11.711,1,0.0296,90.781,4,0.214,45,50
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,four tops,four tops,100,1965,2,0.245000,0.667,0.599,...,0,0.1070,-8.894,1,0.0291,127.935,4,0.971,100,100
2,i cant get no satisfaction,i cant get no satisfaction,the rolling stones,the rolling stones,100,1965,3,0.000380,0.735,0.820,...,2,0.0983,-8.554,1,0.0494,135.586,4,0.859,100,100
3,you were on my mind,youre on my mind,we five,nokiaa,91,1965,4,0.814000,0.651,0.128,...,2,0.0596,-14.998,1,0.0511,160.001,4,0.238,15,17
4,youve lost that lovin feelin,youve got the love,the righteous brothers,florence the machine,70,1965,5,0.004070,0.571,0.708,...,6,0.1250,-4.740,1,0.0329,109.899,4,0.398,33,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,el perdon,el perdón,nicky jam and enrique iglesias,nicky jam,89,2015,96,0.446000,0.628,0.690,...,8,0.1120,-5.828,1,0.0511,179.911,4,0.662,46,100
5096,she knows,heaven knows,neyo featuring juicy j,gina chavez,76,2015,97,0.035100,0.750,0.564,...,9,0.1120,-7.956,0,0.0462,96.969,4,0.264,24,36
5097,night changes,dont change,one direction,inxs,75,2015,98,0.000059,0.206,0.810,...,9,0.0855,-6.492,1,0.0423,163.552,4,0.155,24,50
5098,back to back,back to back,drake,drake,100,2015,99,0.008980,0.635,0.472,...,8,0.1170,-3.869,0,0.5110,174.008,4,0.373,100,100


In [81]:
composite_df = composite_df[['Song_Name','Matched_Name','Ratio_Value','Artist','Matched_Artist','Artist_Ratio','Artist_Partial',
                             'Year','Rank','acousticness','danceability','energy','instrumentalness','key',
                             'liveness','loudness','audio_mode','speechiness','tempo','time_signature','audio_valence']]
composite_df

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,wooly bully,only lonely,64,sam the sham and the pharaohs,the ballroom thieves,45,50,1965,1,0.898000,...,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.214
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.245000,...,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971
2,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.000380,...,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.859
3,you were on my mind,youre on my mind,91,we five,nokiaa,15,17,1965,4,0.814000,...,0.128,0.908000,2,0.0596,-14.998,1,0.0511,160.001,4,0.238
4,youve lost that lovin feelin,youve got the love,70,the righteous brothers,florence the machine,33,33,1965,5,0.004070,...,0.708,0.000000,6,0.1250,-4.740,1,0.0329,109.899,4,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,el perdon,el perdón,89,nicky jam and enrique iglesias,nicky jam,46,100,2015,96,0.446000,...,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.662
5096,she knows,heaven knows,76,neyo featuring juicy j,gina chavez,24,36,2015,97,0.035100,...,0.564,0.000069,9,0.1120,-7.956,0,0.0462,96.969,4,0.264
5097,night changes,dont change,75,one direction,inxs,24,50,2015,98,0.000059,...,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.155
5098,back to back,back to back,100,drake,drake,100,100,2015,99,0.008980,...,0.472,0.000000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373


In [82]:
# Filter out songs based on a ratio limit

ratio_limit = 80
high_ratio_songs = composite_df[composite_df['Ratio_Value']>ratio_limit].reset_index(drop=True)

In [83]:
high_ratio_songs

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.24500,...,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.9710
1,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.00038,...,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.8590
2,you were on my mind,youre on my mind,91,we five,nokiaa,15,17,1965,4,0.81400,...,0.128,0.908000,2,0.0596,-14.998,1,0.0511,160.001,4,0.2380
3,help,help,100,the beatles,papa roach,19,20,1965,7,0.00085,...,0.900,0.000002,7,0.2710,-5.420,1,0.1340,183.960,4,0.2860
4,crying in the chapel,crying in the club,84,elvis presley,camila cabello,30,31,1965,9,0.33400,...,0.617,0.000000,6,0.1630,-5.834,0,0.3830,85.227,4,0.6030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2252,here,where,89,alessia cara,matt mcwaters,40,42,2015,94,0.76000,...,0.330,0.758000,5,0.5970,-12.559,0,0.0429,121.901,3,0.1950
2253,waves,waves,100,mr probz,kanye west,11,12,2015,95,0.03910,...,0.573,0.000009,10,0.3310,-4.533,1,0.0383,96.122,4,0.0542
2254,el perdon,el perdón,89,nicky jam and enrique iglesias,nicky jam,46,100,2015,96,0.44600,...,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.6620
2255,back to back,back to back,100,drake,drake,100,100,2015,99,0.00898,...,0.472,0.000000,8,0.1170,-3.869,0,0.5110,174.008,4,0.3730


In [84]:
# Create boolean column for whether the artist ratios deem a song match good or not

artist_limit = 74
artist_ratio_boolean = []
for i in range(len(high_ratio_songs)):
    if high_ratio_songs['Artist_Ratio'][i] > 49 or high_ratio_songs['Artist_Partial'][i] > 49:
        artist_ratio_boolean.append(True)
    else:
        artist_ratio_boolean.append(False)
high_ratio_songs['Artist_Ratio_Boolean'] = artist_ratio_boolean

In [85]:
high_ratio_songs.head()

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Artist_Ratio_Boolean
0,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.245,...,0.0,0,0.107,-8.894,1,0.0291,127.935,4,0.971,True
1,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.00038,...,0.0572,2,0.0983,-8.554,1,0.0494,135.586,4,0.859,True
2,you were on my mind,youre on my mind,91,we five,nokiaa,15,17,1965,4,0.814,...,0.908,2,0.0596,-14.998,1,0.0511,160.001,4,0.238,False
3,help,help,100,the beatles,papa roach,19,20,1965,7,0.00085,...,2e-06,7,0.271,-5.42,1,0.134,183.96,4,0.286,False
4,crying in the chapel,crying in the club,84,elvis presley,camila cabello,30,31,1965,9,0.334,...,0.0,6,0.163,-5.834,0,0.383,85.227,4,0.603,False


In [104]:
# Filter again, only keeping songs that both met our ratio limit as well as artist ratio limits

final_summary = high_ratio_songs[high_ratio_songs['Artist_Ratio_Boolean'] == True].reset_index(drop=True)
final_summary

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Artist_Ratio_Boolean
0,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.24500,...,0.00000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971,True
1,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.00038,...,0.05720,2,0.0983,-8.554,1,0.0494,135.586,4,0.859,True
2,my girl,my girl,100,the temptations,the temptations,100,100,1965,10,0.54500,...,0.00000,0,0.5590,-5.714,1,0.0691,104.451,4,0.598,True
3,stop in the name of love,stop in the name of love,100,the supremes,the supremes,100,100,1965,20,0.79900,...,0.00000,0,0.1150,-5.832,1,0.0287,115.884,4,0.504,True
4,unchained melody,unchained melody,100,the righteous brothers,the righteous brothers,100,100,1965,21,0.51700,...,0.00000,0,0.6180,-16.652,1,0.0288,98.742,3,0.266,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,chandelier,chandelier,100,sia,kina grannis,27,67,2015,90,0.83500,...,0.00000,5,0.1140,-13.578,0,0.0320,75.874,4,0.127,True
1109,heartbeat song,heartbeat song,100,kelly clarkson,kelly clarkson,100,100,2015,91,0.01200,...,0.00000,6,0.0645,-3.711,1,0.0505,149.093,4,0.444,True
1110,el perdon,el perdón,89,nicky jam and enrique iglesias,nicky jam,46,100,2015,96,0.44600,...,0.00000,8,0.1120,-5.828,1,0.0511,179.911,4,0.662,True
1111,back to back,back to back,100,drake,drake,100,100,2015,99,0.00898,...,0.00000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373,True


In [99]:
from sqlalchemy import create_engine
rds_connection_string = "postgres:postgres@localhost:5432/Billboard_Songs"
engine = create_engine(f'postgresql://{rds_connection_string}')
engine.table_names()

['spotify_song_data',
 'final_summary',
 'billboard',
 'spotify_songs',
 'fuzzy_key']

In [105]:
# Change all df columns to lowercase so they can properly import into SQL

billboard_df.columns = [x.lower() for x in billboard_df.columns]
song_df.columns = [x.lower() for x in song_df.columns]
spotify_data_df.columns = [x.lower() for x in spotify_data_df.columns]
fuzzy_df.columns = [x.lower() for x in fuzzy_df.columns]
final_summary.columns = [x.lower() for x in final_summary.columns]

In [97]:
# Print column names to ensure they are correct in SQL tables

print(f'billboard df columns {billboard_df.columns}')
print(f'song df columns {song_df.columns}')
print(f'spotify data df columns {spotify_data_df.columns}')
print(f'fuzzy df columns {fuzzy_df.columns}')
print(f'final summary df columns {final_summary.columns}')

billboard df columns Index(['rank', 'song', 'artist', 'year', 'lyrics', 'source', 'fuzzy_name',
       'fuzzy_artist'],
      dtype='object')
song df columns Index(['song_name', 'artist_name', 'album_names', 'playlist', 'fuzzy_name',
       'fuzzy_artist'],
      dtype='object')
spotify data df columns Index(['song_name', 'song_popularity', 'song_duration_ms', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'audio_mode', 'speechiness', 'tempo', 'time_signature',
       'audio_valence'],
      dtype='object')
fuzzy df columns Index(['song_name', 'matched_name', 'artist', 'ratio_value',
       'song_list_index'],
      dtype='object')
final summary df columns Index(['song_name', 'matched_name', 'ratio_value', 'artist', 'matched_artist',
       'artist_ratio', 'artist_partial', 'year', 'rank', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'audio_mode', 'speechiness', 'te

In [107]:
# Import dataframes into SQL database

billboard_df.to_sql(name='billboard', con=engine, if_exists='append', index=True)
song_df.to_sql(name='spotify_songs', con=engine, if_exists='append', index=True)
spotify_data_df.to_sql(name='spotify_song_data', con=engine, if_exists='append', index=True)
fuzzy_df.to_sql(name='fuzzy_key', con=engine, if_exists='append', index=True)
final_summary.to_sql(name='final_summary', con=engine, if_exists='append', index=True)

In [109]:
# Connect to and query from database

conn = engine.connect()

pd.read_sql('Select * from final_summary where ratio_value = 100',conn)

Unnamed: 0,index,song_name,matched_name,ratio_value,artist,matched_artist,artist_ratio,artist_partial,year,rank,...,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,artist_ratio_boolean
0,0,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,...,0.00000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971,true
1,1,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,...,0.05720,2,0.0983,-8.554,1,0.0494,135.586,4,0.859,true
2,2,my girl,my girl,100,the temptations,the temptations,100,100,1965,10,...,0.00000,0,0.5590,-5.714,1,0.0691,104.451,4,0.598,true
3,3,stop in the name of love,stop in the name of love,100,the supremes,the supremes,100,100,1965,20,...,0.00000,0,0.1150,-5.832,1,0.0287,115.884,4,0.504,true
4,4,unchained melody,unchained melody,100,the righteous brothers,the righteous brothers,100,100,1965,21,...,0.00000,0,0.6180,-16.652,1,0.0288,98.742,3,0.266,true
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,1107,budapest,budapest,100,george ezra,george ezra,100,100,2015,89,...,0.00000,5,0.1100,-8.303,1,0.0276,127.810,4,0.389,true
1014,1108,chandelier,chandelier,100,sia,kina grannis,27,67,2015,90,...,0.00000,5,0.1140,-13.578,0,0.0320,75.874,4,0.127,true
1015,1109,heartbeat song,heartbeat song,100,kelly clarkson,kelly clarkson,100,100,2015,91,...,0.00000,6,0.0645,-3.711,1,0.0505,149.093,4,0.444,true
1016,1111,back to back,back to back,100,drake,drake,100,100,2015,99,...,0.00000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373,true


In [111]:
engine.execute('Select * from final_summary where ratio_value = 100').fetchall()

[(0, 'i cant help myself sugar pie honey bunch', 'i cant help myself sugar pie honey bunch', 100, 'four tops', 'four tops', 100, 100, '1965', 2, Decimal('0.245'), Decimal('0.667'), Decimal('0.599'), Decimal('0.0'), 0, Decimal('0.107'), Decimal('-8.894'), 1, Decimal('0.0291'), Decimal('127.935'), 4, Decimal('0.971'), 'true'),
 (1, 'i cant get no satisfaction', 'i cant get no satisfaction', 100, 'the rolling stones', 'the rolling stones', 100, 100, '1965', 3, Decimal('0.00038'), Decimal('0.735'), Decimal('0.82'), Decimal('0.0572'), 2, Decimal('0.0983'), Decimal('-8.554'), 1, Decimal('0.0494'), Decimal('135.586'), 4, Decimal('0.8590000000000001'), 'true'),
 (2, 'my girl', 'my girl', 100, 'the temptations', 'the temptations', 100, 100, '1965', 10, Decimal('0.545'), Decimal('0.6'), Decimal('0.546'), Decimal('0.0'), 0, Decimal('0.5589999999999999'), Decimal('-5.714'), 1, Decimal('0.0691'), Decimal('104.451'), 4, Decimal('0.598'), 'true'),
 (3, 'stop in the name of love', 'stop in the name of

In [None]:
### the rest probably needs to be deleted, a queries.sql file was saved with the table creation and checks

In [42]:
# Import MySQL dependencies

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
import psycopg2

In [44]:
engine=psycopg2.connect(f'postgresql://postgres:postgres@localhost:5432')
engine.autocommit=True
cursor=engine.cursor()
sql='''Create database test'''
cursor.execute(sql)

In [45]:
# Define our billboard, spotify_songs, spotify _song_data, fuzzy_key, and final_summary tables

class billboard(Base):
    __tablename__ = 'billboard'
    id = Column(Integer, primary_key=True)
    Rank=Column(Integer)
    Song=Column(String)
    Artist=Column(String)
    Lyrics=Column(String)
    Source=Column(String)
    
class spotify_songs(Base):
    __tablename__ = 'spotify_songs'
    id = Column(Integer, primary_key=True)
    song_name=Column(String)
    artist_name=Column(String)
    album_names=Column(String)
    playlist=Column(String) 
    
class spotify_song_data(Base):
    __tablename__ = 'spotify_song_data'
    id = Column(Integer, primary_key=True)
    song_name=Column(String)
    song_popularity=Column(String)
    song_duration_ms=Column(String)
    acousticness=Column(Integer) 
    danceability=Column(Integer)
    energy=Column(Integer)
    instrumentalness=Column(Integer)
    key=Column(Integer)
    liveness=Column(Integer)
    loudness=Column(Integer)
    audio_mode=Column(Integer)
    speechiness=Column(Integer)
    tempo=Column(Integer)
    time_signature=Column(Integer)
    audio_valence=Column(Integer)
    
    
class fuzzy_key(Base):
    __tablename__ = 'fuzzy_key'
    id = Column(Integer, primary_key=True)
    Song_Name=Column(String)
    Matched_Name=Column(String) 
    Ratio_Value=Column(Integer)    
    Song_List_Index=Column(Integer)
    
class final_summary(Base):
    __tablename__ = 'final_summary'
    id = Column(Integer, primary_key=True)
    Song_Name=Column(String)
    Matched_Name=Column(String)
    Ratio_Value=Column(Integer)
    Artist=Column(String)
    Matched_Artist=Column(String)
    Artist_Ratio=Column(Integer)
    Artist_Partial=Column(Integer)
    Year=Column(Integer)
    Rank=Column(Integer)
    acousticness=Column(Integer) 
    danceability=Column(Integer)
    energy=Column(Integer)
    instrumentalness=Column(Integer)
    key=Column(Integer)
    liveness=Column(Integer)
    loudness=Column(Integer)
    audio_mode=Column(Integer)
    speechiness=Column(Integer)
    tempo=Column(Integer)
    time_signature=Column(Integer)
    audio_valence=Column(Integer)
    Artist_Ratio_Boolean=Column(String)
    

In [46]:
Base.metadata.tables

immutabledict({'billboard': Table('billboard', MetaData(bind=None), Column('id', Integer(), table=<billboard>, primary_key=True, nullable=False), Column('Rank', Integer(), table=<billboard>), Column('Song', String(), table=<billboard>), Column('Artist', String(), table=<billboard>), Column('Lyrics', String(), table=<billboard>), Column('Source', String(), table=<billboard>), schema=None), 'spotify_songs': Table('spotify_songs', MetaData(bind=None), Column('id', Integer(), table=<spotify_songs>, primary_key=True, nullable=False), Column('song_name', String(), table=<spotify_songs>), Column('artist_name', String(), table=<spotify_songs>), Column('album_names', String(), table=<spotify_songs>), Column('playlist', String(), table=<spotify_songs>), schema=None), 'spotify_song_data': Table('spotify_song_data', MetaData(bind=None), Column('id', Integer(), table=<spotify_song_data>, primary_key=True, nullable=False), Column('song_name', String(), table=<spotify_song_data>), Column('song_popula

In [47]:
Base.metadata.create_all(engine)

AttributeError: 'psycopg2.extensions.connection' object has no attribute '_run_visitor'

In [28]:
engine.table_names()

AttributeError: 'psycopg2.extensions.connection' object has no attribute 'table_names'

In [48]:
# Import dataframes into SQL database

billboard_df.to_sql(name='billboard', con=engine, if_exists='append', index=False)
song_df.to_sql(name='spotify_songs', con=engine, if_exists='append', index=False)
spotify_data_df.to_sql(name='spotify_song_data', con=engine, if_exists='append', index=False)
fuzzy_df.to_sql(name='fuzzy_key', con=engine, if_exists='append', index=False)
final_summary.to_sql(name='final_summary', con=engine, if_exists='append', index=False)

DatabaseError: Execution failed on sql 'SELECT name FROM sqlite_master WHERE type='table' AND name=?;': relation "sqlite_master" does not exist
LINE 1: SELECT name FROM sqlite_master WHERE type='table' AND name=?...
                         ^


In [None]:
#  Example to check work -> pd.read_sql_query('select * from customer_name', con=engine).head()