In [51]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [52]:
# Read the exported chart data; the path is relative, so the CSV must sit in the
# notebook's working directory.
spotify_df = pd.read_csv(
    "Top Spotify Songs in 73 Countries (Daily Updated)_exported.csv"
)
# Uncomment to inspect column dtypes and null counts:
# spotify_df.info()

In [53]:
# Constant helper column added to the loaded frame (dropped again when `data` is built).
spotify_df['Count'] = 1

# One subset per country (PH, US, KR), keeping the first row for each song name.
# NOTE(review): de-duplication is per country, so a song that charts in more than
# one of these countries appears once per country in the combined frame.
PHcountry_spotify = spotify_df[spotify_df['country'] == 'PH'].drop_duplicates(subset='name')
UScountry_spotify = spotify_df[spotify_df['country'] == 'US'].drop_duplicates(subset='name')
KR_spotfy = spotify_df[spotify_df['country'] == 'KR'].drop_duplicates(subset='name')

# Stack the three subsets and order rows from most to least popular.
spotify_df = pd.concat(
    [PHcountry_spotify, UScountry_spotify, KR_spotfy]
).sort_values('popularity', ascending=False)

# Working table for the analysis: fresh 0..n-1 index, minus the columns listed below.
data = spotify_df.reset_index(drop=True).drop(
    columns=[
        'spotify_id', 'is_explicit', 'snapshot_date', 'album_name',
        'album_release_date', 'country', 'Count', 'daily_rank',
        'daily_movement', 'weekly_movement',
    ]
)
# data

In [54]:
# Correlation (pandas default method) between popularity and each audio feature.
# The targets on the left and the column names on the right are in the same order.
(
    danceability, energy, key, loudness, speechiness,
    acousticness, instrumentalness, liveness, valence, tempo,
) = (
    data['popularity'].corr(data[feature_name])
    for feature_name in (
        'danceability', 'energy', 'key', 'loudness', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    )
)




In [55]:

# Feature names in the order the bars appear on the x-axis.
audio_features = [
    'danceability', 'energy', 'key', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Correlation with popularity for each feature, in the same order as audio_features.
correlation_values = [
    danceability, energy, key, loudness, speechiness,
    acousticness, instrumentalness, liveness, valence, tempo,
]

# One bar per feature, coloured individually and annotated with the value
# rounded to two decimals.
fig = px.bar(
    x=audio_features,
    y=correlation_values,
    color=audio_features,
    color_discrete_sequence=px.colors.qualitative.Set3,
    text=[format(value, '.2f') for value in correlation_values],
    title='Correlation between Audio Features and Popularity',
    labels={'x': 'Audio Features', 'y': 'Correlation Coefficient'},
)
fig.show()

### Preprocessing

In [56]:
# data = data.drop(['spotify_id', 'is_explicit', 'snapshot_date', 'album_name', 'album_release_date', 'country', 'Count', 'daily_rank','daily_movement','weekly_movement'], axis=1)
# data = data.drop(['spotify_id', 'is_explicit', 'snapshot_date', 'album_name', 'album_release_date', 'country','Count'], axis=1)

In [57]:
# Turn popularity into a binary target: two equal-frequency bins (split at the
# median), labelled 0 for the lower bin and 1 for the upper bin.
# NOTE: this overwrites the original scores, so the cell is meant to run once per
# freshly built `data`.
data['popularity'] = pd.qcut(x=data['popularity'], q=2, labels=[0, 1])

In [58]:
def one_hot(df, col, prefix):
    """Replace a categorical column with one indicator column per distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col : str
        Name of the column to encode.
    prefix : str
        Prefix for the generated columns, which are named ``<prefix>_<value>``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``df`` without ``col``, followed by
        the indicator columns.
    """
    indicator_cols = pd.get_dummies(df[col], prefix=prefix)
    # concat builds a new frame, so the caller's frame is left untouched.
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(columns=col)

In [59]:
# Encode the two text columns as indicator columns: artists first, then song name.
# NOTE(review): song names were de-duplicated within each country earlier, so the
# 'song_*' indicators are close to one column per row — confirm this is intended.
data = one_hot(one_hot(data, 'artists', 'artist'), 'name', 'song')

In [60]:
# Feature matrix (every column except the target) and the binary popularity target.
X, y = data.drop(columns='popularity'), data['popularity']

In [61]:

# 70/30 shuffled split; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, random_state=1
)

# Fit the scaler on the training rows only, then apply it to both splits.
# Each result is wrapped back into a DataFrame so the index and column labels survive.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [62]:
# Candidate classifiers, all with default hyper-parameters.
# The decision tree, MLP and random forest draw random numbers while training, so
# they are seeded (same seed as the train/test split) to keep the reported
# accuracies reproducible across kernel restarts.
log_model = LogisticRegression()
dec_model = DecisionTreeClassifier(random_state=1)
mlp_model = MLPClassifier(random_state=1)
svm_model = SVC()
rf_model = RandomForestClassifier(random_state=1)

In [63]:
# Train every classifier on the scaled training split, in a fixed order.
for estimator in (log_model, dec_model, mlp_model, svm_model, rf_model):
    estimator.fit(X_train, y_train)

# Show the random forest as the cell's output.
rf_model

In [64]:
# Mean accuracy of each fitted classifier on the held-out test split.
log_acc, dec_acc, mlp_acc, svm_acc, rf_acc = (
    classifier.score(X_test, y_test)
    for classifier in (log_model, dec_model, mlp_model, svm_model, rf_model)
)

In [65]:
# Mean accuracy of each fitted classifier on the training split; compare with the
# matching *_acc test scores to gauge overfitting.
# No linear-regression model is built in this notebook, so there is no `lin_train`.
log_train = log_model.score(X_train, y_train)
dec_train = dec_model.score(X_train, y_train)
mlp_train = mlp_model.score(X_train, y_train)
svm_train = svm_model.score(X_train, y_train)
rf_train = rf_model.score(X_train, y_train)


In [66]:

# Test accuracies and their display labels, kept in matching order.
accuracy_values = [log_acc, dec_acc, mlp_acc, svm_acc, rf_acc]
model_labels = [
    "Logistic Regression",
    "Decision Tree",
    "Neural Network",
    "Support Vector Machine",
    "Random Forest",
]

# One bar per model, coloured by model and annotated with the accuracy rounded
# to two decimals.
fig = px.bar(
    x=model_labels,
    y=accuracy_values,
    color=model_labels,
    text=[format(value, '.2f') for value in accuracy_values],
    title="Model Accuracy Comparison",
    labels={'x': "Model", 'y': "Accuracy"},
)
fig.show()

In [67]:
# print("Linear Regression Test:", linear_acc)
# print("Logistic Regression Accuracy Test:", log_acc)
# print("Decision Tree Accuracy Test:", dec_acc)
# print("Neural Network Accuracy Test:", mlp_acc)
# print("Support Vector Machine Accuracy Test:", svm_acc)

# print("Linear Regression Train:", lin_train)
# print("Logistic Regression Train:", log_train)
# print("Decision Tree Train:", dec_train)
# print("Neural Network Train:", mlp_train)
# print("Support Vector Machine Train:", svm_train)
