In [None]:
# import the necessary packages
import pandas as pd
import re

In [None]:
# make the merging simpler by normalising every song title the same way
def clean_song(row):
    """Return a normalised version of a song title for use as a merge key.

    Parameters
    ----------
    row : pandas.Series or str
        Either a title string, or a DataFrame row (as passed by
        ``DataFrame.apply(..., axis=1)``). For a row, the value under the
        ``"song"`` label is used; if there is no such label the first value
        of the row is used instead.

    Returns
    -------
    str
        The title lower-cased, cut at the first " -" suffix (such as
        "- Radio Edition"), stripped of "(feat. ...)" credits, and with
        typographic apostrophes replaced by plain ones. A missing title
        (NaN/None) is returned unchanged.
    """
    if isinstance(row, str):
        title = row
    elif "song" in row.index:
        # Look the title up by label so the result does not depend on
        # "song" happening to be the first column of the frame.
        title = row["song"]
    else:
        title = row.iloc[0]

    if pd.isna(title):
        return title

    title = str(title).lower()  # making all names lowercased
    title = title.split(" -")[0]  # taking out anything after - (like "- Radio Edition")
    # Drop the credit but keep a following " (taylor's version)"-style suffix.
    title = re.sub(r" \(feat. .+(?= .tayl)", "", title)
    # Drop the credit but keep any other parenthesised suffix that follows it.
    title = re.sub(r" \(feat. .+(?= \()", "", title)
    # Drop a credit that runs to the end of the title.
    title = re.sub(r" \(feat. .+", "", title)
    return title.replace("’", "'")  # replace all ’ with '

In [None]:
# Load the Spotify track table (read_table splits on tabs by default), call
# its title column "song" so it matches the Billboard data, and replace each
# title with its cleaned form, computed row by row with clean_song.
swift_1 = (
    pd.read_table('all_songs.csv')
    .rename(columns={"name": "song"})
    .assign(song=lambda tracks: tracks.apply(clean_song, axis=1))
)

In [None]:
# read in Billboard Hot 100 data
swift_2 = pd.read_table("../billboardHot100/long_df_swift.txt")
# Keep only rows whose artist mentions Taylor Swift. na=False treats a missing
# artist as "no match" instead of producing a NaN in the boolean mask, and
# .copy() makes the result an independent frame so the column assignment
# below does not write into a slice of the raw table.
swift_2 = swift_2[swift_2['artist'].str.contains("Taylor Swift", na=False)].copy()
# Parse the chart dates. pandas >= 2.0 rejects astype('datetime64[D]') as an
# unsupported resolution, so parse with to_datetime and truncate to whole days.
swift_2['date'] = pd.to_datetime(swift_2['date']).dt.normalize()
# One row per song, one column per chart date, cell = rank on that date.
# NOTE: pivot raises if the same (song, date) pair occurs more than once.
swift_2 = pd.pivot(data=swift_2, index="song", columns=['date'], values='rank')
swift_2 = swift_2.reset_index()
# Clean the titles the same way as the Spotify data, then use them as the index.
swift_2['song'] = swift_2.apply(clean_song, axis=1)
swift_2 = swift_2.set_index('song')

In [None]:
# Outer-join the Spotify rows with the Billboard pivot: the "song" column of
# swift_1 is matched against the index of swift_2, keeping titles found in
# either table. The resulting index is then moved into an "index" column.
a = swift_1.join(other=swift_2, how='outer', on='song')
a = a.reset_index()

In [None]:
# remove duplicate songs (keep only the most popular)
# Order the rows from most to least popular. mergesort is stable, so rows with
# equal popularity keep their original relative order, and rows with missing
# popularity are placed last so they never win over a scored row.
by_popularity = a.sort_values('popularity', ascending=False, kind='mergesort')

# Every repeat of a title after its first (most popular) occurrence is
# surplus. Comparing whole groups rather than neighbouring pairs also handles
# titles that occur three or more times. Rows without a title are never
# treated as duplicates of each other.
is_extra_copy = by_popularity['song'].duplicated(keep='first') & by_popularity['song'].notna()
to_remove = sorted(by_popularity.index[is_extra_copy])

# a was freshly re-indexed, so its labels are unique and dropping by label
# removes exactly the surplus rows while leaving the row order untouched.
b = a.drop(to_remove)

In [None]:
# Reshape to long form: one row for every song + date combination.
# First discard the helper "index" column and renumber the rows from zero.
c = b.drop(columns=['index']).reset_index(drop=True)
# The first six columns are kept as identifiers (assumes these are exactly the
# non-date columns -- TODO confirm); every other column is unpivoted into a
# "date" / "rank" pair.
d = c.melt(id_vars=c.columns[:6], var_name='date', value_name='rank')
# Discard every row that has a missing value in any column.
f = d.dropna()
# Index the rows by chart date; the "date" column itself stays in the frame.
f.index = f.set_index('date').index.astype('datetime64[ns]')

In [None]:
# Write the long-format table to disk as tab-separated text. The index is not
# written; the dates are still present through the "date" column.
f.to_csv('date_indexed_songs.csv', index=False, sep='\t')