In [1]:
#Import Dependencies & file paths
# Fuzzywuzzy requires installing fuzzywuzzy and python-levenshtein libraries

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
billboard_csv = '../Resources/billboard_lyrics_1964-2015.csv'
song_list_csv = '../Resources/song_info.csv'
song_data_csv = '../Resources/song_data.csv'

In [2]:
# Create dataframes from csv files - Billboard top 100 songs - 1964 through 2015
billboard_df = pd.read_csv(billboard_csv, encoding = 'ISO-8859-1')
billboard_df.head()

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0


In [3]:
# Create dataframes from csv files - 19,000 Spotify songs with album

song_df = pd.read_csv(song_list_csv)
song_df.head()

Unnamed: 0,song_name,artist_name,album_names,playlist
0,Boulevard of Broken Dreams,Green Day,Greatest Hits: God's Favorite Band,00s Rock Anthems
1,In The End,Linkin Park,Hybrid Theory,00s Rock Anthems
2,Seven Nation Army,The White Stripes,Elephant,00s Rock Anthems
3,By The Way,Red Hot Chili Peppers,By The Way (Deluxe Version),00s Rock Anthems
4,How You Remind Me,Nickelback,Silver Side Up,00s Rock Anthems


In [4]:
# Create dataframes from csv files - 19,000 Spotify songs with data

spotify_data_df = pd.read_csv(song_data_csv, encoding = 'ISO-8859-1')
spotify_data_df.head()

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,Boulevard of Broken Dreams,73,262333,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474
1,In The End,66,216933,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37
2,Seven Nation Army,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324
3,By The Way,74,216933,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.0,10,0.113,-5.065,1,0.0313,172.011,4,0.574


In [5]:
# Add columns to both dataframes with lowercase song names for better matching

billboard_df['fuzzy_name'] = billboard_df['Song'].apply(lambda x: x.lower())
song_df['fuzzy_name'] = song_df['song_name'].apply(lambda x: x.lower())
billboard_df['fuzzy_artist'] = billboard_df['Artist'].apply(lambda x: x.lower())
song_df['fuzzy_artist'] = song_df['artist_name'].apply(lambda x: x.lower())

In [6]:
# Remove punctuation from lowercase song names to help with better matching

trans_table = str.maketrans('','',string.punctuation)
billboard_df['fuzzy_name'] = billboard_df['fuzzy_name'].apply(lambda x: x.translate(trans_table))
song_df['fuzzy_name'] = song_df['fuzzy_name'].apply(lambda x: x.translate(trans_table))
billboard_df['fuzzy_artist'] = billboard_df['fuzzy_artist'].apply(lambda x: x.translate(trans_table))
song_df['fuzzy_artist'] = song_df['fuzzy_artist'].apply(lambda x: x.translate(trans_table))


In [7]:
# Export billboard_df as a separate csv file to be used in compiling notebook

billboard_df.to_csv('../Resources/billboard_df.csv',index=False)

In [8]:
# Testing fuzzywuzzy on a small sample size to see how it works

query = billboard_df['fuzzy_name'][2]
choices = song_df['fuzzy_name']

# ExtractOne only yields top result

result = process.extractOne(query,choices)
result

('i cant get no satisfaction', 100, 251)

In [9]:
# Show top 5 matches for same song, to look at ratio scores

results = process.extract(query,choices)
results

[('i cant get no satisfaction', 100, 251),
 ('i can', 90, 3751),
 ('satisfaction', 90, 5709),
 ('i can', 90, 9356),
 ('cant stop', 86, 13)]

In [10]:
# Double check index provided by fuzzywuzzy to ensure data integrity

song_df.loc[251]

song_name            (I Can't Get No) Satisfaction
artist_name                     The Rolling Stones
album_names     Hot Rocks (1964-1971) [Remastered]
playlist                          60s Rock Anthems
fuzzy_name              i cant get no satisfaction
fuzzy_artist                    the rolling stones
Name: 251, dtype: object

In [11]:
#### TAKES A LONG TIME ####

# Loop through Billboard dataframe and fuzzy match to list of spotify songs

# Initalize lists to hold query songs as well as the fuzzy mathced songs

query_name = []
fuzzy_list = []

# Set up 'choices' parameter of fuzzywuzzy to tell the module what songs to look through to match our query

choices = song_df['fuzzy_name']

# For loop to loop through billboard songs and fuzzmatch to spotify list

for i in range(len(billboard_df)):
    
    query = billboard_df['fuzzy_name'][i]
    
    query_name.append(billboard_df['fuzzy_name'][i])
    
    # Simple ratio checks for the similarty of the two strings based on Levenshtein distance
    fuzzy_list.append(process.extractOne(query,choices,scorer=fuzz.ratio))


In [12]:
# Check results of same song as above to ensure similar result following the loop

fuzzy_list

[('only lonely', 64, 3209),
 ('i cant help myself sugar pie honey bunch', 100, 3863),
 ('i cant get no satisfaction', 100, 251),
 ('youre on my mind', 91, 11156),
 ('youve got the love', 70, 1959),
 ('london town', 74, 13871),
 ('help', 100, 14944),
 ('you are my heart', 70, 6881),
 ('crying in the club', 84, 4681),
 ('my girl', 100, 2045),
 ('noche de ronda', 71, 1348),
 ('on the road', 74, 1831),
 ('the hand that feeds', 68, 12914),
 ('tell me that you miss me', 61, 16455),
 ('shotgun', 100, 1161),
 ('i want you back', 69, 1175),
 ('this mountain', 67, 4779),
 ('the high road', 72, 16698),
 ('where do you go to my lovely', 58, 2112),
 ('stop in the name of love', 100, 6867),
 ('unchained melody', 100, 2063),
 ('silhouettes', 100, 5847),
 ('never forget you', 62, 1199),
 ('caravan', 67, 492),
 ('mr tambourine man', 100, 324),
 ('set fire to the rain', 65, 1372),
 ('es mi reina', 70, 3920),
 ('whats the use', 65, 11574),
 ('self destruction', 82, 4097),
 ('hang loose', 67, 3442),
 ('si

In [13]:
# Because fuzzy_list is a list of tuples, we need to iterate through the list to parse out the tuples into seperate lists

matched_name = []
ratio_value = []
song_index = []

for i in range(len(fuzzy_list)):
    
    matched_name.append(fuzzy_list[i][0])
    ratio_value.append(fuzzy_list[i][1])
    song_index.append(fuzzy_list[i][2])


In [14]:
# Create a seperate dictionary to hold all results

fuzzy_dict = {'Song_Name':query_name,
             'Matched_Name':matched_name,
             'Ratio_Value':ratio_value,
             'Song_List_Index':song_index}

# Create dataframe from dictionary 

fuzzy_df = pd.DataFrame(fuzzy_dict)

In [11]:
# Preview dataframe of resuls, importing csv of results after the first time we ran, since fuzzy matching takes forever

# fuzzy_csv = '../Resources/fuzzy_df.csv'
# fuzzy_df = pd.read_csv(fuzzy_csv)
fuzzy_df.head()


Unnamed: 0,Song_Name,Matched_Name,Artist,Ratio_Value,Song_List_Index
0,wooly bully,only lonely,sam the sham and the pharaohs,64,3209
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,four tops,100,3863
2,i cant get no satisfaction,i cant get no satisfaction,the rolling stones,100,251
3,you were on my mind,youre on my mind,we five,91,11156
4,youve lost that lovin feelin,youve got the love,the righteous brothers,70,1959


In [12]:
# Export dataframe for use in merging and cleaning notebook

fuzzy_df.to_csv('../Resources/fuzzy_df.csv',index=False)

# Go to merging_cleaning notebook from here

In [13]:
# The rest of this takes place after the merging_cleaning notebook is complete
# Import composite_df that has all merges and joins

composite_df = pd.read_csv('../Resources/composite_df.csv')
composite_df

Unnamed: 0,Song_Name,Matched_Name,Artist,Matched_Artist,Ratio_Value,Year,Rank,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,wooly bully,only lonely,sam the sham and the pharaohs,the ballroom thieves,64,1965,1,0.898000,0.412,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.214
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,four tops,four tops,100,1965,2,0.245000,0.667,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971
2,i cant get no satisfaction,i cant get no satisfaction,the rolling stones,the rolling stones,100,1965,3,0.000380,0.735,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.859
3,you were on my mind,youre on my mind,we five,nokiaa,91,1965,4,0.814000,0.651,0.128,0.908000,2,0.0596,-14.998,1,0.0511,160.001,4,0.238
4,youve lost that lovin feelin,youve got the love,the righteous brothers,florence the machine,70,1965,5,0.004070,0.571,0.708,0.000000,6,0.1250,-4.740,1,0.0329,109.899,4,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,el perdon,el perdón,nicky jam and enrique iglesias,nicky jam,89,2015,96,0.446000,0.628,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.662
5096,she knows,heaven knows,neyo featuring juicy j,gina chavez,76,2015,97,0.035100,0.750,0.564,0.000069,9,0.1120,-7.956,0,0.0462,96.969,4,0.264
5097,night changes,dont change,one direction,inxs,75,2015,98,0.000059,0.206,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.155
5098,back to back,back to back,drake,drake,100,2015,99,0.008980,0.635,0.472,0.000000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373


In [14]:
# Perform fuzzy matching on Artist and Matched_Artist to see if the song matching was correct
# Loop through composite dataframe and fuzzy match each row's Artist and Matched_Artist
# Initalize lists to hold ratio values for normal and partial ratios

fuzzy_ratio = []
fuzzy_partial = []

# For loop to loop through billboard songs and fuzzmatch to spotify list

for i in range(len(composite_df)):
    
    artist = composite_df['Artist'][i]
    matched_artist = composite_df['Matched_Artist'][i]
    fuzzy_ratio.append(fuzz.ratio(artist,matched_artist))
    fuzzy_partial.append(fuzz.partial_ratio(artist,matched_artist))

In [15]:
# List checks
# fuzzy_ratio
# fuzzy_partial

# Add values to dataframe

composite_df['Artist_Ratio'] = fuzzy_ratio
composite_df['Artist_Partial'] = fuzzy_partial
composite_df

Unnamed: 0,Song_Name,Matched_Name,Artist,Matched_Artist,Ratio_Value,Year,Rank,acousticness,danceability,energy,...,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Artist_Ratio,Artist_Partial
0,wooly bully,only lonely,sam the sham and the pharaohs,the ballroom thieves,64,1965,1,0.898000,0.412,0.276,...,7,0.0955,-11.711,1,0.0296,90.781,4,0.214,45,50
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,four tops,four tops,100,1965,2,0.245000,0.667,0.599,...,0,0.1070,-8.894,1,0.0291,127.935,4,0.971,100,100
2,i cant get no satisfaction,i cant get no satisfaction,the rolling stones,the rolling stones,100,1965,3,0.000380,0.735,0.820,...,2,0.0983,-8.554,1,0.0494,135.586,4,0.859,100,100
3,you were on my mind,youre on my mind,we five,nokiaa,91,1965,4,0.814000,0.651,0.128,...,2,0.0596,-14.998,1,0.0511,160.001,4,0.238,15,17
4,youve lost that lovin feelin,youve got the love,the righteous brothers,florence the machine,70,1965,5,0.004070,0.571,0.708,...,6,0.1250,-4.740,1,0.0329,109.899,4,0.398,33,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,el perdon,el perdón,nicky jam and enrique iglesias,nicky jam,89,2015,96,0.446000,0.628,0.690,...,8,0.1120,-5.828,1,0.0511,179.911,4,0.662,46,100
5096,she knows,heaven knows,neyo featuring juicy j,gina chavez,76,2015,97,0.035100,0.750,0.564,...,9,0.1120,-7.956,0,0.0462,96.969,4,0.264,24,36
5097,night changes,dont change,one direction,inxs,75,2015,98,0.000059,0.206,0.810,...,9,0.0855,-6.492,1,0.0423,163.552,4,0.155,24,50
5098,back to back,back to back,drake,drake,100,2015,99,0.008980,0.635,0.472,...,8,0.1170,-3.869,0,0.5110,174.008,4,0.373,100,100


In [16]:
composite_df = composite_df[['Song_Name','Matched_Name','Ratio_Value','Artist','Matched_Artist','Artist_Ratio','Artist_Partial',
                             'Year','Rank','acousticness','danceability','energy','instrumentalness','key',
                             'liveness','loudness','audio_mode','speechiness','tempo','time_signature','audio_valence']]
composite_df

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,wooly bully,only lonely,64,sam the sham and the pharaohs,the ballroom thieves,45,50,1965,1,0.898000,...,0.276,0.000372,7,0.0955,-11.711,1,0.0296,90.781,4,0.214
1,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.245000,...,0.599,0.000000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971
2,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.000380,...,0.820,0.057200,2,0.0983,-8.554,1,0.0494,135.586,4,0.859
3,you were on my mind,youre on my mind,91,we five,nokiaa,15,17,1965,4,0.814000,...,0.128,0.908000,2,0.0596,-14.998,1,0.0511,160.001,4,0.238
4,youve lost that lovin feelin,youve got the love,70,the righteous brothers,florence the machine,33,33,1965,5,0.004070,...,0.708,0.000000,6,0.1250,-4.740,1,0.0329,109.899,4,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5095,el perdon,el perdón,89,nicky jam and enrique iglesias,nicky jam,46,100,2015,96,0.446000,...,0.690,0.000000,8,0.1120,-5.828,1,0.0511,179.911,4,0.662
5096,she knows,heaven knows,76,neyo featuring juicy j,gina chavez,24,36,2015,97,0.035100,...,0.564,0.000069,9,0.1120,-7.956,0,0.0462,96.969,4,0.264
5097,night changes,dont change,75,one direction,inxs,24,50,2015,98,0.000059,...,0.810,0.064400,9,0.0855,-6.492,1,0.0423,163.552,4,0.155
5098,back to back,back to back,100,drake,drake,100,100,2015,99,0.008980,...,0.472,0.000000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373


In [17]:
# Filter out songs based on a ratio limit

ratio_limit = 80
high_ratio_songs = composite_df[composite_df['Ratio_Value']>ratio_limit].reset_index(drop=True)

In [18]:
high_ratio_songs.head()

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.245,...,0.599,0.0,0,0.107,-8.894,1,0.0291,127.935,4,0.971
1,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.00038,...,0.82,0.0572,2,0.0983,-8.554,1,0.0494,135.586,4,0.859
2,you were on my mind,youre on my mind,91,we five,nokiaa,15,17,1965,4,0.814,...,0.128,0.908,2,0.0596,-14.998,1,0.0511,160.001,4,0.238
3,help,help,100,the beatles,papa roach,19,20,1965,7,0.00085,...,0.9,2e-06,7,0.271,-5.42,1,0.134,183.96,4,0.286
4,crying in the chapel,crying in the club,84,elvis presley,camila cabello,30,31,1965,9,0.334,...,0.617,0.0,6,0.163,-5.834,0,0.383,85.227,4,0.603


In [19]:
# Create boolean column for whether the artist ratios deem a song match good or not

artist_limit = 74
artist_ratio_boolean = []
for i in range(len(high_ratio_songs)):
    if high_ratio_songs['Artist_Ratio'][i] > 49 or high_ratio_songs['Artist_Partial'][i] > 49:
        artist_ratio_boolean.append(True)
    else:
        artist_ratio_boolean.append(False)
high_ratio_songs['Artist_Ratio_Boolean'] = artist_ratio_boolean

In [20]:
high_ratio_songs.head()

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Artist_Ratio_Boolean
0,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.245,...,0.0,0,0.107,-8.894,1,0.0291,127.935,4,0.971,True
1,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.00038,...,0.0572,2,0.0983,-8.554,1,0.0494,135.586,4,0.859,True
2,you were on my mind,youre on my mind,91,we five,nokiaa,15,17,1965,4,0.814,...,0.908,2,0.0596,-14.998,1,0.0511,160.001,4,0.238,False
3,help,help,100,the beatles,papa roach,19,20,1965,7,0.00085,...,2e-06,7,0.271,-5.42,1,0.134,183.96,4,0.286,False
4,crying in the chapel,crying in the club,84,elvis presley,camila cabello,30,31,1965,9,0.334,...,0.0,6,0.163,-5.834,0,0.383,85.227,4,0.603,False


In [38]:
# Filter again, only keeping songs that both met our ratio limit as well as artist ratio limits

final_summary = high_ratio_songs[high_ratio_songs['Artist_Ratio_Boolean'] == True].reset_index(drop=True)
final_summary

Unnamed: 0,Song_Name,Matched_Name,Ratio_Value,Artist,Matched_Artist,Artist_Ratio,Artist_Partial,Year,Rank,acousticness,...,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Artist_Ratio_Boolean
0,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,four tops,four tops,100,100,1965,2,0.24500,...,0.00000,0,0.1070,-8.894,1,0.0291,127.935,4,0.971,True
1,i cant get no satisfaction,i cant get no satisfaction,100,the rolling stones,the rolling stones,100,100,1965,3,0.00038,...,0.05720,2,0.0983,-8.554,1,0.0494,135.586,4,0.859,True
2,my girl,my girl,100,the temptations,the temptations,100,100,1965,10,0.54500,...,0.00000,0,0.5590,-5.714,1,0.0691,104.451,4,0.598,True
3,stop in the name of love,stop in the name of love,100,the supremes,the supremes,100,100,1965,20,0.79900,...,0.00000,0,0.1150,-5.832,1,0.0287,115.884,4,0.504,True
4,unchained melody,unchained melody,100,the righteous brothers,the righteous brothers,100,100,1965,21,0.51700,...,0.00000,0,0.6180,-16.652,1,0.0288,98.742,3,0.266,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,chandelier,chandelier,100,sia,kina grannis,27,67,2015,90,0.83500,...,0.00000,5,0.1140,-13.578,0,0.0320,75.874,4,0.127,True
1109,heartbeat song,heartbeat song,100,kelly clarkson,kelly clarkson,100,100,2015,91,0.01200,...,0.00000,6,0.0645,-3.711,1,0.0505,149.093,4,0.444,True
1110,el perdon,el perdón,89,nicky jam and enrique iglesias,nicky jam,46,100,2015,96,0.44600,...,0.00000,8,0.1120,-5.828,1,0.0511,179.911,4,0.662,True
1111,back to back,back to back,100,drake,drake,100,100,2015,99,0.00898,...,0.00000,8,0.1170,-3.869,0,0.5110,174.008,4,0.373,True


In [22]:
# Import MySQL dependencies

from sqlalchemy import create_engine, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
import psycopg2

In [12]:
# Checking Columns of dfs I want to put into SQL 
# billboard_df.head()
# song_df.head()
# spotify_data_df.head()
# fuzzy_df.head()
# final_summary.head()

In [32]:
# Define our billboard, spotify_songs, spotify _song_data and fuzzy_key tables

class billboard(Base):
    __tablename__ = 'billboard'
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    Rank=Column(Integer)
    Song=Column(String)
    Year=Column(Integer)
    Artist=Column(String)
    Lyrics=Column(String)
    Source=Column(String)
    fuzzy_name=Column(String)
    fuzzy_artist=Column(String)
    
class spotify_songs(Base):
    __tablename__ = 'spotify_songs'
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    song_name=Column(String)
    artist_name=Column(String)
    album_names=Column(String)
    playlist=Column(String)
    fuzzy_name=Column(String)
    fuzzy_artist=Column(String)
    
class spotify_song_data(Base):
    __tablename__ = 'spotify_song_data'
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    song_name=Column(String)
    song_popularity=Column(String)
    song_duration_ms=Column(String)
    acousticness=Column(Float) 
    danceability=Column(Float)
    energy=Column(Float)
    instrumentalness=Column(Float)
    key=Column(Integer)
    liveness=Column(Float)
    loudness=Column(Float)
    audio_mode=Column(Integer)
    speechiness=Column(Float)
    tempo=Column(Float)
    time_signature=Column(Integer)
    audio_valence=Column(Float)
    
    
class fuzzy_key(Base):
    __tablename__ = 'fuzzy_key'
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    Song_Name=Column(String)
    Matched_Name=Column(String)
    Artist=Column(String)
    Ratio_Value=Column(Integer)    
    Song_List_Index=Column(Integer)
    
class final_summary(Base):
    __tablename__ = 'final_summary'
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    Song_Name=Column(String)
    Matched_Name=Column(String)
    Ratio_Value=Column(Integer)
    Artist=Column(String)
    Matched_Artist=Column(String)
    Artist_Ratio=Column(Integer)
    Artist_Partial=Column(Integer)
    Year=Column(Integer)
    Rank=Column(Integer)
    acousticness=Column(Float) 
    danceability=Column(Float)
    energy=Column(Float)
    instrumentalness=Column(Float)
    key=Column(Integer)
    liveness=Column(Float)
    loudness=Column(Float)
    audio_mode=Column(Integer)
    speechiness=Column(Float)
    tempo=Column(Float)
    time_signature=Column(Integer)
    audio_valence=Column(Float)
    Artist_Ratio_Boolean=Column(String)

  % (item.__module__, item.__name__)
  % (item.__module__, item.__name__)
  % (item.__module__, item.__name__)
  % (item.__module__, item.__name__)


In [33]:
# Reflect 

Base.metadata.tables

immutabledict({'billboard': Table('billboard', MetaData(bind=None), Column('id', Integer(), table=<billboard>, primary_key=True, nullable=False), Column('Rank', Integer(), table=<billboard>), Column('Song', String(), table=<billboard>), Column('Year', Integer(), table=<billboard>), Column('Artist', String(), table=<billboard>), Column('Lyrics', String(), table=<billboard>), Column('Source', String(), table=<billboard>), Column('fuzzy_name', String(), table=<billboard>), Column('fuzzy_artist', String(), table=<billboard>), schema=None), 'spotify_songs': Table('spotify_songs', MetaData(bind=None), Column('id', Integer(), table=<spotify_songs>, primary_key=True, nullable=False), Column('song_name', String(), table=<spotify_songs>), Column('artist_name', String(), table=<spotify_songs>), Column('album_names', String(), table=<spotify_songs>), Column('playlist', String(), table=<spotify_songs>), Column('fuzzy_name', String(), table=<spotify_songs>), Column('fuzzy_artist', String(), table=<s

In [34]:
# Connect to ETL_Project_db in postgres, created prior to running code in PGAdmin

engine=create_engine(f'postgresql://postgres:postgres@localhost:5432/ETL_Project_db')

In [35]:
# Create tables

Base.metadata.create_all(engine)

In [36]:
# Check table_names

engine.table_names()

['billboard',
 'spotify_songs',
 'spotify_song_data',
 'fuzzy_key',
 'final_summary']

In [39]:
# Import dataframes into SQL database

# billboard_df.to_sql(name='billboard', con=engine, if_exists='append', index=False)
# song_df.to_sql(name='spotify_songs', con=engine, if_exists='append', index=False)
# spotify_data_df.to_sql(name='spotify_song_data', con=engine, if_exists='append', index=False)
# fuzzy_df.to_sql(name='fuzzy_key', con=engine, if_exists='append', index=False)
final_summary.to_sql(name='final_summary', con=engine, if_exists='append', index=False)

In [25]:
#  Example to check work -> 

pd.read_sql_query('select * from final_summary', con=engine).head()

Unnamed: 0,id,Song_Name,Matched_Name,Ratio_Value,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,Rank,Year
0,1,only the lonely,only lonely,85,1,0,0,0,7,0,-12,1,0,91,4,0,3,1965
1,2,i cant help myself sugar pie honey bunch,i cant help myself sugar pie honey bunch,100,0,1,1,0,0,0,-9,1,0,128,4,1,4,1965
2,3,i cant get no satisfaction,i cant get no satisfaction,100,0,1,1,0,2,0,-9,1,0,136,4,1,5,1965
3,4,you were on my mind,youre on my mind,91,1,1,0,1,2,0,-15,1,0,160,4,0,7,1965
4,5,you got the love,youve got the love,94,0,1,1,0,6,0,-5,1,0,110,4,0,11,1965
