In [17]:
import pandas as pd
from tabulate import tabulate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

data = pd.read_csv("/Users/michaeljeon/Desktop/INST414/Module 1 Assignment/dataset.csv")

clean_data = data.drop(['index', 'track_id', 'album_name', 'danceability', 
                        'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 
                        'liveness', 'valence', 'tempo', 'time_signature'], axis=1).dropna().drop_duplicates(subset="track_name", keep="first")

sorted_data = clean_data.sort_values(by='popularity', ascending=False)

print(tabulate(sorted_data.head(20), headers='keys'))
print("Total Number of Entries: ", len(sorted_data))


       artists                      track_name                                 popularity    duration_ms  explicit      energy  track_genre
-----  ---------------------------  ---------------------------------------  ------------  -------------  ----------  --------  -------------
20001  Sam Smith;Kim Petras         Unholy (feat. Kim Petras)                         100         156943  False          0.472  dance
51664  Bizarrap;Quevedo             Quevedo: Bzrp Music Sessions, Vol. 52              99         198937  False          0.782  hip-hop
20008  David Guetta;Bebe Rexha      I'm Good (Blue)                                    98         175238  True           0.965  dance
67356  Manuel Turizo                La Bachata                                         98         162637  False          0.679  latin
67359  Bad Bunny                    Tití Me Preguntó                                   97         243716  False          0.715  latin
67358  Bad Bunny;Chencho Corleone   Me Porto B

In [18]:
print(data.isnull().sum())
print(data.dtypes)
print("Duplicates: ", data.duplicated().sum())

index               0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64
index                 int64
track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               f

In [25]:
selected_genres = ['hip-hop', 'pop', 'dance']

possible_genre_cols = ['genre', 'genres', 'track_genre']
genre_column = None
for col in possible_genre_cols:
    if col in data.columns:
        genre_column = col
        break

if genre_column is None:
    raise ValueError("No genre column found.")


top_genre_data = data[data[genre_column].isin(selected_genres)].copy()

print("\nSelected Genres Used in Analysis:")
print(selected_genres)


genre_features = ['popularity', 'energy', 'danceability', 'tempo', 'valence']

top_genre_centroids = (
    top_genre_data.groupby(genre_column)[genre_features]
    .mean()
    .reset_index()
)

print("\nTOP GENRE CENTROIDS (RAW):")
print(tabulate(top_genre_centroids, headers="keys"))

scaler_top = StandardScaler()
scaled_centroids = scaler_top.fit_transform(top_genre_centroids[genre_features])

similarity_matrix = cosine_similarity(scaled_centroids)

similarity_df = pd.DataFrame(
    similarity_matrix,
    index=top_genre_centroids[genre_column],
    columns=top_genre_centroids[genre_column]
)

print("\nCosine Similarity Among SELECTED Genres:")
print(tabulate(similarity_df, headers="keys"))




pop_values = sorted_data[['popularity']].values

kmeans = KMeans(n_clusters=3, random_state=42)
sorted_data['popularity_cluster'] = kmeans.fit_predict(pop_values)

cluster_means = (
    sorted_data.groupby('popularity_cluster')['popularity']
    .mean()
    .sort_values()
)

cluster_map = {old:new for new, old in enumerate(cluster_means.index)}
sorted_data['popularity_cluster'] = sorted_data['popularity_cluster'].map(cluster_map)


cluster_summary = (
    sorted_data.groupby('popularity_cluster')['popularity']
    .agg(['mean', 'count'])
    .reset_index()
)

print("\n Popularity Clusters for SELECTED Genres:")
print(tabulate(cluster_summary, headers='keys'))

genre_cluster_distribution = (
    sorted_data.groupby([genre_column, 'popularity_cluster'])['popularity']
    .count()
    .reset_index()
    .rename(columns={'popularity': 'count'})
)

print("\nGenre Distribution Across Popularity Clusters:")
print(tabulate(genre_cluster_distribution, headers='keys'))



Selected Genres Used in Analysis:
['hip-hop', 'pop', 'dance']

TOP GENRE CENTROIDS (RAW):
    track_genre      popularity    energy    danceability    tempo    valence
--  -------------  ------------  --------  --------------  -------  ---------
 0  dance                22.69   0.708583        0.687856  120.128   0.552541
 1  hip-hop              37.759  0.68253         0.736154  116.77    0.551248
 2  pop                  47.576  0.606437        0.630441  120.927   0.506223

Cosine Similarity Among SELECTED Genres:
track_genre         dance     hip-hop        pop
-------------  ----------  ----------  ---------
dance           1           0.0217416  -0.689214
hip-hop         0.0217416   1          -0.739372
pop            -0.689214   -0.739372    1

 Popularity Clusters for SELECTED Genres:
      popularity_cluster     mean    count
--  --------------------  -------  -------
 0                     0  14.6696    27915
 1                     1  38.261     28580
 2                     2