In [None]:
import pandas as pd
# Lift pandas' display limits (rows, columns, cell width) by setting each to None.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

In [None]:
# Location of the songs CSV that the notebook loads.
PATH_CSV = './top_3827348_songs.csv'
# Number of rows read from the top of the CSV (passed to read_csv as nrows).
SAMPLE_SIZE = 2_500
# Random state handed to PCA and t-SNE.
SEED = 42

In [None]:
def to_str(x):
    """Return ``x`` without its first two characters and its last character.

    NOTE(review): presumably the CSV stores text as a bytes repr such as
    ``b'...'`` and this strips the ``b'`` prefix and closing quote -- confirm
    against the data file.
    """
    inner = slice(2, -1)
    return x[inner]

def to_list(x):
    """Unwrap ``x`` with ``to_str`` and split the result on commas.

    Returns a new list of the comma-separated pieces; the pieces are not
    stripped or otherwise modified.
    """
    unwrapped = to_str(x)
    return list(unwrapped.split(','))

def _parse_bool(x):
    """Convert a raw CSV cell into a real boolean.

    ``read_csv`` converters receive each cell as a string, and ``bool()`` of
    any non-empty string (including ``'False'`` and ``'0'``) is ``True``.
    Using ``bool`` directly as a converter therefore turns every non-empty
    cell into ``True``; this helper parses the text instead.

    Raises:
        ValueError: if the cell is not a recognised true/false spelling.
    """
    token = str(x).strip().lower()
    if token in ('true', '1', '1.0'):
        return True
    # An empty cell maps to False, matching what bool('') used to give.
    if token in ('false', '0', '0.0', ''):
        return False
    raise ValueError(f'cannot interpret {x!r} as a boolean')


# Load the first SAMPLE_SIZE rows, keeping only the identifying columns and
# the numeric/boolean track features, and convert each column on read.
df = pd.read_csv(
    PATH_CSV,
    nrows=SAMPLE_SIZE,
    usecols=[
        'id', 'name', 'duration', 'explicit', 'popularity', 'loudness', 'tempo',
        'time_signature', 'key', 'mode', 'acousticness', 'danceability',
        'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence'
    ],
    converters={
        'id': to_str, 'name': to_str, 'duration': np.int32, 'explicit': _parse_bool,
        'popularity': np.int32, 'loudness': np.float32, 'tempo': np.float32,
        'time_signature': np.int32, 'key': np.int32, 'mode': _parse_bool, 'acousticness': np.float32,
        'danceability': np.float32, 'energy': np.float32, 'instrumentalness': np.float32,
        'liveness': np.float32, 'speechiness': np.float32, 'valence': np.float32
    }
)
df.head()

In [None]:
# Feature columns fed to the scaler (and, downstream, to PCA and t-SNE).
selected_cols = [
    'acousticness',
    'danceability',
    'duration',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

# Standardize the selected features with scikit-learn's StandardScaler.
scaler = StandardScaler()
features = df[selected_cols]
X_selected = scaler.fit_transform(features)

In [None]:
# Two-component PCA fitted on the standardized features; fit() returns the
# estimator itself, so pca_music is the fitted model.
pca_music = PCA(random_state=SEED, n_components=2)
pca_music = pca_music.fit(X_selected)

In [None]:
# Variance explained by each of the two fitted principal components.
pca_music.explained_variance_

In [None]:
# Fraction of the total variance explained by each of the two components.
pca_music.explained_variance_ratio_

In [None]:
# Component weights per feature: one row per selected column, one column per
# principal component, rounded to three decimals for display.
pd.DataFrame(
    data=np.round(np.transpose(pca_music.components_), 3),
    columns=['PCA1', 'PCA2'],
    index=selected_cols,
)

In [None]:
# Index labels of every row whose name is exactly 'Dance (A$$)'.
row_numbers = df[df['name'].eq('Dance (A$$)')].index.tolist()
row_numbers

In [None]:
# Project the standardized features onto the two fitted principal components.
X_music_pca = pca_music.transform(X_selected)

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X_music_pca[:, 0], X_music_pca[:, 1])
for i, (pc1, pc2) in enumerate(X_music_pca):
    # repr() escapes special characters in the name; '$' is backslash-escaped
    # before the surrounding quote characters are sliced off.
    quoted = repr(df['name'][i])
    label = quoted.replace('$', r'\$')[1:-1]
    # Offset the label slightly so it does not sit on top of its marker.
    ax.text(
        pc1 + 0.3,
        pc2 + 0.3,
        label,
        fontdict=dict(color='red', size=5),
        bbox=dict(facecolor='yellow', alpha=0.5),
    )
plt.show()

In [None]:
# Show the full record(s) for the track whose name matches target_name exactly.
target_name = 'Dance (A$$)'
row = df.loc[df['name'].eq(target_name)]
row

In [None]:
# Embed the standardized features with t-SNE (default settings apart from
# verbose output and the fixed random state).
tsne = TSNE(random_state=SEED, verbose=1)
tsne_results = tsne.fit_transform(X_selected)

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(tsne_results[:, 0], tsne_results[:, 1])
for i, (dim1, dim2) in enumerate(tsne_results):
    # repr() escapes special characters in the name; '$' is backslash-escaped
    # before the surrounding quote characters are sliced off.
    quoted = repr(df['name'][i])
    label = quoted.replace('$', r'\$')[1:-1]
    # Offset the label slightly so it does not sit on top of its marker.
    ax.text(
        dim1 + 0.3,
        dim2 + 0.3,
        label,
        fontdict=dict(color='red', size=5),
        bbox=dict(facecolor='yellow', alpha=0.5),
    )
plt.show()