In [1]:
import pandas as pd

df = pd.read_csv('../data/4_spotify_million_tracks.csv')
hot100_df = pd.read_csv('../data/1_hot100_cleaned.csv')


In [2]:
# Create isHot column
df['isHot'] = df['original_title'].isin(hot100_df['title']).astype(int)

In [3]:
display(df.head())

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,album_cover,genres,isHot
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008-03-14,14,150040,False,https://i.scdn.co/image/ab67616d0000b2739e6b95...,"['chanson', 'french pop', 'french rock', 'nouv...",0
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004-03-21,1,253000,False,https://i.scdn.co/image/ab67616d0000b27398d445...,"['chanson', 'french pop']",0
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011-02-01,3,240400,False,https://i.scdn.co/image/ab67616d0000b27353a906...,['medieval'],0
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007-06-12,1,138760,True,https://i.scdn.co/image/ab67616d0000b273e6d949...,"['canadian metal', 'canadian post-hardcore', '...",0
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022-10-07,0,199986,False,https://i.scdn.co/image/ab67616d0000b27349ea4d...,['pops orchestra'],0


In [4]:
# Clean up columns
df = df.drop(columns=['original_title', 'original_artist'])
df = df.rename(columns={
    'spotify_title': 'title',
    'spotify_artist': 'artist'
})

In [5]:
import ast

# Combine all genres into one list
all_genres = []
for genre_list in df['genres'].dropna():  # skip NaN values
    genres = ast.literal_eval(genre_list)  # convert string to list
    all_genres.extend(genres)

# Create a list of unique genres
unique_genres = list(set(all_genres))

print(f"Total number of unique genres: {len(unique_genres)}")
print("\nFirst 10 genres as example:")
print(unique_genres[:10])

Total number of unique genres: 1532

First 10 genres as example:
['polish black metal', 'australian alternative rock', 'deep house', 'canadian hardcore', 'ebm', 'funeral doom', 'contemporary country', 'classic girl group', 'icelandic electronic', 'lafayette indie']


In [6]:
# 1. Get all unique genres (no threshold filtering)
all_genres = []
for genre_list in df['genres'].dropna():
    genres = genre_list.split(', ')
    genres = [genre.strip() for genre in genres]
    all_genres.extend(genres)

unique_genres = sorted(list(set(all_genres)))  # Get all unique genres and sort them

# 2. Create one-hot encoding for all genres
def encode_all_genres(genre_string):
    if pd.isna(genre_string):
        return [0] * len(unique_genres)
    
    genre_list = genre_string.split(', ')
    return [1 if genre in genre_list else 0 for genre in unique_genres]

# Create encoded columns
genre_encoded = pd.DataFrame(
    df['genres'].apply(encode_all_genres).tolist(),
    columns=unique_genres
)

# Add encoded columns to original dataframe
df_spotify_songs_encoded_full = pd.concat([df, genre_encoded], axis=1)

# Verify the number of columns
print(f"Total number of columns: {len(df_spotify_songs_encoded_full.columns)}")
print(f"Number of genre columns: {len(unique_genres)}")

Total number of columns: 2377
Number of genre columns: 2367


In [7]:
df = df.dropna(subset=['genres'])  # only drops rows where 'genres' is null

In [8]:
df.isnull().sum()

title           0
artist          0
album           0
release_date    0
popularity      0
duration_ms     0
explicit        0
album_cover     1
genres          0
isHot           0
dtype: int64

In [9]:
# drop non-numerical columns
drop_columns = ['title', 'artist', 'album', 'genres', 'album_cover']
df_spotify_numerical = df_spotify_songs_encoded_full.drop(columns= drop_columns).copy()

# Convert explicit boolean to int (0 or 1)
df_spotify_numerical['explicit'] = df_spotify_numerical['explicit'].astype(int)

# Convert release_date to year
df_spotify_numerical['release_date'] = df_spotify_numerical['release_date'].str.split('-').str[0]

# Display the first few rows of the new dataframe
print("Shape of numerical dataframe:", df_spotify_numerical.shape)
df_spotify_numerical.head()

Shape of numerical dataframe: (5060, 2372)


Unnamed: 0,release_date,popularity,duration_ms,explicit,isHot,"""children's music""]","""man's orchestra""]","""punk 'n' roll""]","""women's music""]",'album rock',...,['western swing'],['workout product'],['world chill'],['world fusion'],['yacht rock'],['zarzuela'],['zilizopendwa'],['zouk',['zouk'],['zydeco']
0,2008,14,150040,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2004,1,253000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2011,3,240400,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2007,1,138760,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2022,0,199986,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Clean up column names by removing quotes and brackets
df_spotify_numerical.columns = df_spotify_numerical.columns.str.replace('"', '').str.replace("'", '').str.replace('[', '').str.replace(']', '')

# Verify the cleaned column names
print("Cleaned column names:")
print(df_spotify_numerical.columns.tolist()[:20])  # Show first 10 columns as example

Cleaned column names:
['release_date', 'popularity', 'duration_ms', 'explicit', 'isHot', 'childrens music', 'mans orchestra', 'punk n roll', 'womens music', 'album rock', 'alternative hip hop', 'alternative hip hop', 'alternative metal', 'alternative rock', 'ambient', 'american contemporary classical', 'american modern classical', 'anti-folk', 'appalachian folk', 'argentine rock']
