In [1]:
# import the necessary packages
import pandas as pd
import re

In [2]:
def clean_song(row):
    """Return a normalized song title for one DataFrame row.

    Intended for ``DataFrame.apply(clean_song, axis=1)`` so that titles from
    different sources compare equal when the tables are joined.

    Normalization steps, in order:
      1. lowercase the title;
      2. keep only the text before the first " -" (drops suffixes such as
         "- Radio Edition");
      3. strip a " (feat. ...)" credit, keeping a parenthesised suffix that
         follows it (e.g. "(taylor's version)");
      4. replace the typographic apostrophe ’ with a plain '.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame. The title is read from the ``"song"`` label
        when the row has one, otherwise from the first position.

    Returns
    -------
    str
        The cleaned title. A missing title (NaN/None) is returned unchanged.
    """
    # Fetch the title explicitly by label (or by .iloc position). Integer keys
    # on a label-indexed Series only work through a deprecated positional
    # fallback, and there is no need to lowercase/split every other field.
    title = row["song"] if "song" in row.index else row.iloc[0]
    if pd.isna(title):
        return title

    song = str(title).lower()
    song = song.split(' -')[0]  # drop anything after " -"
    # credit that is followed by a space, any one character, then "tayl"
    song = re.sub(r" \(feat. .+(?= .tayl)", "", song)
    # credit that is followed by another " (" group
    song = re.sub(r" \(feat. .+(?= \()", "", song)
    # credit that runs to the end of the title
    song = re.sub(r" \(feat. .+", "", song)
    song = song.replace("’", "'")
    return song

In [4]:
# load the Spotify data and call the title column "song" so its name
# matches the column used on the Billboard side
swift_1 = pd.read_table('../data/all_songs.csv').rename(columns={"name": "song"})
# overwrite each title with its cleaned form (computed row by row)
swift_1["song"] = swift_1.apply(clean_song, axis=1)

In [29]:
# read in Billboard Hot 100 data
swift_2 = pd.read_table("../data/long_df_swift.txt")
# keep only rows whose artist mentions Taylor Swift (a missing artist counts as
# "no match"); .copy() makes the result an independent frame so the column
# assignment below does not write into a slice of the raw table
swift_2 = swift_2[swift_2['artist'].str.contains("Taylor Swift", na=False)].copy()
# parse the dates with pd.to_datetime: day-resolution 'datetime64[D]' is not a
# dtype pandas stores, and newer pandas versions reject that cast
swift_2['date'] = pd.to_datetime(swift_2['date'])
# wide layout: one row per song, one column per chart date, cells hold the rank
# (pivot_table averages when a song/date pair occurs more than once)
swift_2 = swift_2.pivot_table(columns='date', values='rank', index='song')
swift_2 = swift_2.reset_index()
# clean the titles, then move them back into the index for the join
swift_2['song'] = swift_2.apply(clean_song, axis=1)
swift_2 = swift_2.set_index('song')
swift_2

date,2006-09-23,2006-09-30,2006-10-07,2006-10-14,2006-10-21,2006-10-28,2006-11-04,2006-11-11,2006-11-18,2006-11-25,...,2022-02-19,2022-02-26,2022-03-05,2022-03-12,2022-05-21,2022-07-09,2022-11-05,2022-11-12,2022-11-19,2022-11-26
song,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'tis the damn season,,,,,,,,,,,...,,,,,,,,,,
...ready for it?,,,,,,,,,,,...,,,,,,,,,,
22,,,,,,,,,,,...,,,,,,,,,,
22 (taylor's version),,,,,,,,,,,...,,,,,,,,,,
afterglow,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you belong with me,,,,,,,,,,,...,,,,,,,,,,
you belong with me (taylor's version),,,,,,,,,,,...,,,,,,,,,,
you need to calm down,,,,,,,,,,,...,,,,,,,,,,
you're not sorry,,,,,,,,,,,...,,,,,,,,,,


In [30]:
# outer-join the two tables: swift_1's "song" column is matched against
# swift_2's index
a = swift_1.join(other=swift_2, how='outer', on='song')
# move the row index into a regular "index" column
a = a.reset_index()

In [31]:
# remove duplicate songs (keep only the most popular)
# Order the rows by popularity, highest first (rows without a popularity value
# go last), and keep the first row met for each song. Comparing only
# neighbouring rows is not enough: a song that appears three or more times can
# keep several copies that way. mergesort is stable, so equally popular
# duplicates resolve to the copy that comes first in `a`.
b = (
    a.sort_values('popularity', ascending=False, kind='mergesort')
     .drop_duplicates(subset='song', keep='first')
     .sort_index()  # back to the original row order
)

In [32]:
# melt and set index to date (one row for every song + date combination)
# discard the leftover "index" column and renumber the rows from 0
c = b.drop(columns=['index']).reset_index(drop=True)
# the columns that came from the Spotify table identify a song; every other
# column is a chart date. Taking the names from swift_1 avoids hard-coding
# how many Spotify columns there are.
d = pd.melt(c, id_vars=list(swift_1.columns), var_name='date', value_name='rank')
# NOTE(review): dropna() without `subset` removes rows with a missing value in
# ANY column, not only a missing rank — confirm that is intended
f = d.dropna()
# index the rows by chart date; the 'date' column itself is kept as well
f.index = pd.DatetimeIndex(f['date']).astype('datetime64[ns]')

In [34]:
# write the long table to disk as tab-separated text; the index is not
# written, only the columns
f.to_csv(path_or_buf='../data/date_indexed_songs.csv', index=False, sep='\t')