# Pre-Processing

In [81]:
import pandas as pd
import numpy as np

In [82]:
# Load the cleaned CSV exports, one DataFrame per table. The frames are unpacked
# positionally, so the order of the paths must match the order of the names.
df_track, df_album, df_artist, df_rating, df_similar = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data_clean/track.csv',
        'data_clean/album.csv',
        'data_clean/artist.csv',
        'data_clean/rating.csv',  # ignored on commit due to size; may be missing from a fresh checkout
        'data_clean/track_similarity.csv',
    )
]

In [83]:
# Which tracks in df_track are by Taylor Swift?

# Find the artist row by exact name and read its artist_id.
# (.iloc[0] raises IndexError if no artist has that name.)
taylor_swift = df_artist.loc[df_artist['name'].eq('Taylor Swift')]
taylor_swift_id = taylor_swift.iloc[0]['artist_id']

# Keep only the track rows carrying that artist_id and show their titles
tracks_by_taylor_swift = df_track.loc[df_track['artist_id'].eq(taylor_swift_id)]
track_names = tracks_by_taylor_swift.loc[:, 'title']
print(track_names)

1                      Blank Space
8                     Shake It Off
71                           Style
112                      Bad Blood
186            Welcome To New York
                   ...            
11036           Lover Is Childlike
11037                  Just a Game
14815                     Umbrella
14816        A Place in This World
14817    Mary's Song (Oh My My My)
Name: title, Length: 95, dtype: object


In [84]:
# Album rows whose artist_id is Taylor Swift's (taylor_swift_id is set in an earlier cell)
albums_by_taylor_swift = df_album.loc[df_album['artist_id'].eq(taylor_swift_id)]
# Only the album name column is shown
album_names = albums_by_taylor_swift.loc[:, 'name']
print(album_names)

0                                                   1989
4                                           Shake It Off
250              We Are Never Ever Getting Back Together
264                                          Red Karaoke
276                                                Ronan
303    The Hunger Games: Songs from District 12 and B...
552                                Today Was a Fairytale
775                                         Taylor Swift
790                                iTunes Live from SoHo
Name: name, dtype: object


In [85]:
# Names of the 20 artists with the most rows in df_track
track_counts = df_track['artist_id'].value_counts()  # sorted by count, largest first
top_counts = track_counts.iloc[:20]
artist_ids = top_counts.index
artist_counts = top_counts.values
# Resolve each id to a name using the first matching row of df_artist
# (.iloc[0] raises IndexError if an id has no row in df_artist).
artist_names = [
    df_artist.loc[df_artist['artist_id'] == top_id, 'name'].iloc[0]
    for top_id in artist_ids
]
print(artist_names)

['Glee Cast', 'Kanye West', 'Justin Bieber', 'The Beach Boys', 'Cloud Nothings', 'T.I.', 'The Beatles', 'Coldplay', 'Beastie Boys', 'Weezer', 'Calvin Harris', 'Aphex Twin', 'George Strait', 'Nine Inch Nails', 'The Smashing Pumpkins', 'Drake', 'Rihanna', 'Brad Paisley', 'Taylor Swift', 'Zac Efron']


In [86]:
# Compare every Taylor Swift track with every track by a second artist, using the
# pairwise scores stored in track_similarity.

# Artist to compare against. The name is set in exactly one place so that the
# lookup, the variable names and the comments cannot drift apart.
# The recorded output of this cell ("Take Me To Church") is for Hozier.
other_artist_name = 'Hozier'

# Track ids of all Taylor Swift tracks (taylor_swift_id is set in an earlier cell)
taylor_tracks_ids = df_track[df_track['artist_id'] == taylor_swift_id]['track_id']

# Track ids of all tracks by the other artist
other_artist = df_artist[df_artist['name'] == other_artist_name]
if other_artist.empty:
    # Fail with a readable message instead of an opaque IndexError from .iloc[0]
    raise ValueError(f'No artist named {other_artist_name!r} in df_artist')
other_artist_id = other_artist.iloc[0]['artist_id']
other_tracks_ids = df_track[df_track['artist_id'] == other_artist_id]['track_id']

# Filter df_similar ONCE down to the rows linking the two track sets, instead of
# re-scanning the whole table for every (track, track) pair.
# NOTE(review): only rows stored as (track_id_1 = Taylor track, track_id_2 = other
# track) are considered. If track_similarity stores each pair in one direction
# only, reversed pairs are not picked up here - confirm against the cleaning step.
candidate_rows = df_similar[
    df_similar['track_id_1'].isin(taylor_tracks_ids)
    & df_similar['track_id_2'].isin(other_tracks_ids)
].drop_duplicates(subset=['track_id_1', 'track_id_2'])  # first row per pair wins

# (track_id_1, track_id_2) -> sim_degree
sim_by_pair = dict(
    zip(
        zip(candidate_rows['track_id_1'], candidate_rows['track_id_2']),
        candidate_rows['sim_degree'],
    )
)

# track_id -> title; if a track_id occurs more than once the first row is used
title_by_id = (
    df_track.drop_duplicates(subset='track_id')
    .set_index('track_id')['title']
    .to_dict()
)

# Print "<Taylor track> - <other track> - <similarity>" for every pair that has a
# score, in df_track order for both artists.
for taylor_track_id in taylor_tracks_ids:
    for other_track_id in other_tracks_ids:
        pair = (taylor_track_id, other_track_id)
        if pair in sim_by_pair:
            print(f'{title_by_id[taylor_track_id]} - {title_by_id[other_track_id]} - {sim_by_pair[pair]}')
            
    


Blank Space - Take Me To Church - 0.3324
Shake It Off - Take Me To Church - 0.373951
Style - Take Me To Church - 0.252599
Bad Blood - Take Me To Church - 0.244916
Welcome To New York - Take Me To Church - 0.264725
Wildest Dreams - Take Me To Church - 0.223395
Out Of The Woods - Take Me To Church - 0.302672
Red - Take Me To Church - 0.137104
Sweeter Than Fiction - Take Me To Church - 0.178207
22 - Take Me To Church - 0.161293
I Knew You Were Trouble. - Take Me To Church - 0.238344
Begin Again - Take Me To Church - 0.147872
We Are Never Ever Getting Back Together - Take Me To Church - 0.122391
The Moment I Knew - Take Me To Church - 0.0811865
I Almost Do - Take Me To Church - 0.0916304
All Too Well - Take Me To Church - 0.0909596
Stay Stay Stay - Take Me To Church - 0.0782035
State Of Grace - Take Me To Church - 0.122132
Ronan - Take Me To Church - 0.0904206
Eyes Open - Take Me To Church - 1.0
Ours - Take Me To Church - 0.0812652
Sparks Fly - Take Me To Church - 0.0899633
If This Was A M

There seems to be a problem with the albums: only 1989, Red and Taylor Swift are actually albums that exist. The rest are either special versions or individual songs (e.g. Shake It Off and Ronan). I might have made a mistake during cleaning? Too tired to find the hypothetical bug right now. **TODO:** re-check the album cleaning step.