In [None]:
# Tabular data handling.
import pandas as pd
# Turn off pandas' display truncation: None removes the limit on the number
# of rows, the number of columns, and the width of each cell's text.
# NOTE(review): with max_rows unlimited, displaying a whole frame renders
# every row; keep using .head() / .sample() on large frames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
# Plotting.
import matplotlib.pyplot as plt
# Dimensionality reduction and feature scaling (not used in the cells
# visible in this part of the notebook).
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import seaborn as sns

In [None]:
# --- Notebook configuration -------------------------------------------------
# Location of the songs CSV, relative to the notebook's working directory.
PATH_CSV = './top_3827348_songs.csv'

# Number of rows to work with.
SAMPLE_SIZE = 100_000

# Seed value for anything stochastic.
SEED = 42

In [None]:
def to_str(x):
    """Return ``x`` with its first two items and its last item sliced off.

    NOTE(review): presumably this unwraps a bytes-literal style value such
    as ``b'...'`` that was written into the CSV as text -- confirm against
    the data file. Input that is not wrapped this way is still trimmed.
    """
    head_len, tail_len = 2, 1
    return x[head_len:-tail_len]

def to_list(x):
    """Unwrap ``x`` with ``to_str`` and split the result on commas.

    Returns a list of strings. Pieces are not stripped, and an empty
    unwrapped value yields ``['']`` (the behaviour of ``str.split(',')``).
    """
    return to_str(x).split(',')

def _to_bool(raw):
    """Parse one raw CSV cell into a real boolean.

    ``read_csv`` passes converters the cell as text, and ``bool()`` of any
    non-empty string is ``True`` -- so ``bool('False')`` and ``bool('0')``
    both come out ``True``. This parser reads the text instead.

    Args:
        raw: Cell text such as 'True', 'false', '1', '0' or '0.0'.

    Returns:
        bool: The parsed flag. An empty cell maps to ``False`` (the same
        result ``bool('')`` gives).

    Raises:
        ValueError: If the text is neither a true/false word nor a number.
    """
    token = str(raw).strip().lower()
    if token in ('true', 't', 'yes', 'y'):
        return True
    if token in ('', 'false', 'f', 'no', 'n'):
        return False
    try:
        # Numeric flags: zero is False, anything else is True.
        return float(token) != 0.0
    except ValueError:
        raise ValueError(f'cannot interpret {raw!r} as a boolean') from None


# Load the first SAMPLE_SIZE rows of the CSV (nrows reads from the top of the
# file; it is not a random sample).
df = pd.read_csv(
    PATH_CSV,
    nrows=SAMPLE_SIZE,
    usecols=[
        'id', 'name', 'duration', 'explicit', 'popularity', 'loudness', 'tempo',
        'time_signature', 'key', 'mode', 'acousticness', 'danceability',
        'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence'
    ],
    # Numeric columns are cast by the parser itself instead of calling a
    # Python function once per cell.
    dtype={
        'duration': np.int32, 'popularity': np.int32,
        'time_signature': np.int32, 'key': np.int32,
        'loudness': np.float32, 'tempo': np.float32, 'acousticness': np.float32,
        'danceability': np.float32, 'energy': np.float32, 'instrumentalness': np.float32,
        'liveness': np.float32, 'speechiness': np.float32, 'valence': np.float32
    },
    # Converters are kept only where the text needs custom handling:
    # unwrapping the string columns and parsing the boolean flags.
    converters={
        'id': to_str, 'name': to_str,
        'explicit': _to_bool, 'mode': _to_bool
    }
)
df.head()

In [None]:
# Feature columns kept for the analysis below; every other column
# (here: 'id' and 'name') is dropped from df.
selected_cols = [
    'acousticness',
    'danceability',
    'duration',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

# Keep all rows, restrict to the selected columns in the order listed.
df = df.loc[:, selected_cols]

In [None]:
# Pairwise Pearson correlation matrix, restricted to numeric columns.
df.corr(numeric_only=True, method='pearson')

In [None]:
# Pairwise Spearman (rank) correlation matrix, restricted to numeric columns.
df.corr(numeric_only=True, method='spearman')

In [None]:
# Popularity against instrumentalness, drawn from the first 1000 rows of df only.
sns.scatterplot(
    x='instrumentalness',
    y='popularity',
    data=df.iloc[:1000],
)
plt.show()