In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Notebook-wide configuration: every tunable value lives in this cell.
SEED = 42                           # passed as random_state to PCA and t-SNE
SAMPLE_SIZE = 200                   # number of CSV rows read into `df`
PATH_CSV = './top_9824_songs.csv'   # input dataset, relative to the notebook

# Setting these display options to None lifts pandas' limits on how many
# rows/columns are rendered and how wide a cell's text may be.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [None]:
# Load only the first SAMPLE_SIZE rows of the CSV (nrows reads from the top
# of the file; it is not a random sample).
df = pd.read_csv(
    PATH_CSV,
    nrows=SAMPLE_SIZE,
)

# Cell output: the column labels available in the loaded frame.
df.columns

In [None]:
# Columns of `df` fed into scaling, PCA and t-SNE. The order matters: it is
# reused as the row index of the PCA loadings table.
selected_cols = [
    'duration',
    'explicit',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

In [None]:
# Standardize the selected feature columns; the fitted scaler is kept in
# `scaler` so the same transformation can be reapplied later.
feature_frame = df[selected_cols]
scaler = StandardScaler()
X_selected = scaler.fit_transform(feature_frame)

In [None]:
# Two-component PCA on the standardized features. `fit` returns the estimator
# itself, so `pca_music` is the fitted model.
pca_model = PCA(n_components=2, random_state=SEED)
pca_music = pca_model.fit(X_selected)

In [None]:
# Cell output: variance captured by each fitted principal component
# (scikit-learn's `PCA.explained_variance_` attribute).
pca_music.explained_variance_

In [None]:
# Cell output: share of the total variance captured by each fitted component
# (scikit-learn's `PCA.explained_variance_ratio_` attribute).
pca_music.explained_variance_ratio_

In [None]:
# Loadings table: one row per input feature, one column per principal
# component, rounded to three decimals for display.
pd.DataFrame(
    data=pca_music.components_.T,
    index=selected_cols,
    columns=['PCA1', 'PCA2'],
).round(3)

In [None]:
# Add a `name_cleaned` column: the song name with every character that is not
# an ASCII letter, a digit or whitespace removed.
# The pattern is a raw string so that `\s` reaches the regex engine verbatim;
# in a normal string literal `\s` is an invalid escape sequence and triggers a
# SyntaxWarning (DeprecationWarning on older Python versions).
df['name_cleaned'] = df['name'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [None]:
# Project the standardized features onto the two fitted principal components.
X_music_pca = pca_music.transform(X_selected)

# Shift applied to each label so the text does not sit on top of its marker.
label_offset = 0.3
# Take the labels as a plain array so they are paired with the projected rows
# by position; indexing the Series with the loop counter would be a *label*
# lookup and breaks as soon as `df` is filtered or re-indexed.
song_labels = df['name_cleaned'].to_numpy()

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X_music_pca[:, 0], X_music_pca[:, 1])
for (pc1, pc2), label in zip(X_music_pca, song_labels):
    ax.text(
        x=pc1 + label_offset,
        y=pc2 + label_offset,
        s=label,
        fontdict=dict(color='red', size=5),
        bbox=dict(facecolor='yellow', alpha=0.5),
    )
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.set_title('Songs projected onto the first two principal components')
plt.show()

In [None]:
# Cleaned song name to look up. ('Blame on Me' was assigned here first and
# immediately overwritten, so that dead assignment has been dropped.)
target_name = 'Shine On You Crazy Diamond Pts 15'

# Rows whose cleaned name matches the target exactly (may be empty).
row = df.loc[df['name_cleaned'] == target_name]
row

In [None]:
# Embed the standardized features with t-SNE (default two output dimensions);
# random_state pins the embedding so re-runs give the same picture.
tsne = TSNE(verbose=1, random_state=SEED)
tsne_results = tsne.fit_transform(X_selected)

# Shift applied to each label so the text does not sit on top of its marker.
label_offset = 0.3
# Take the labels as a plain array so they are paired with the embedded rows
# by position; indexing the Series with the loop counter would be a *label*
# lookup and breaks as soon as `df` is filtered or re-indexed.
song_labels = df['name_cleaned'].to_numpy()

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(tsne_results[:, 0], tsne_results[:, 1])
for (dim1, dim2), label in zip(tsne_results, song_labels):
    ax.text(
        x=dim1 + label_offset,
        y=dim2 + label_offset,
        s=label,
        fontdict=dict(color='red', size=5),
        bbox=dict(facecolor='yellow', alpha=0.5),
    )
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
ax.set_title('t-SNE embedding of the songs')
plt.show()