<a href="https://colab.research.google.com/github/nywkim/project/blob/main/project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Training data: the user's labelled songs.
# read_csv(usecols=...) returns columns in *file* order, not in the order of the
# usecols list, so renaming positionally (`sp.columns = [...]`) silently mislabels
# features if the CSV layout ever differs. Rename by name instead, then fix the
# column order explicitly.
sp_column_names = {
    'acousticness': 'Acousticness',
    'danceability': 'Danceability',
    'duration_ms': 'Duration (ms)',
    'energy': 'Energy',
    'key': 'Chord',
    'liveness': 'Liveness',
    'loudness': 'Loudness',
    'speechiness': 'Speechiness',
    'tempo': 'Tempo',
    'valence': 'Valence',
    'target': 'target',
    'song_title': 'Song Name',
    'artist': 'Artist',
}
sp = (
    pd.read_csv('data_ind.csv', usecols=list(sp_column_names))
    .rename(columns=sp_column_names)
    [list(sp_column_names.values())]
)

# Candidate songs to score; this file already uses the display-style column names.
test = pd.read_csv('spotify_dataset.csv', usecols=['Acousticness', 'Danceability', 'Duration (ms)', 'Energy',
       'Liveness', 'Loudness', 'Speechiness', 'Tempo', 'Valence', 'Song Name', 'Artist', 'Chord'])

# Encode chord names as integers 0-11 so 'Chord' is numeric like the training column.
# Values not listed here are left untouched by replace().
chord_to_int = {
    chord: number
    for number, chord in enumerate(
        ['C', 'C#/Db', 'D', 'D#/Eb', 'E', 'F', 'F#/Gb', 'G', 'G#/Ab', 'A', 'A#/Bb', 'B']
    )
}
test['Chord'] = test['Chord'].replace(chord_to_int)


In [None]:
# Remove fully duplicated rows, then look at pairwise correlations of the
# feature columns only (label and text columns excluded).
sp = sp.drop_duplicates()
spotify = sp.drop(columns=['target', 'Song Name', 'Artist'])

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(spotify.corr(), annot=True, cmap='summer', ax=ax)
plt.show()

In [None]:
import plotly.express as px

target = 'target'
sp_t = sp[target].value_counts()

# value_counts() orders classes by frequency, so a hard-coded ['liked', 'disliked']
# list can end up attached to the wrong slices. Derive each label from the class
# value instead. NOTE(review): assumes target == 1 means "liked", consistent with
# the last-class probability being reported as "Maybe You Like" further down.
target_labels = sp_t.index.map({1: 'liked', 0: 'disliked'})

# Colours are pinned per label so they do not depend on slice order either.
px.pie(values=sp_t.values, names=target_labels, color=target_labels,
       title="Liked/Unliked Songs Distribution Pie Chart",
       color_discrete_map={'liked': "#1A466C", 'disliked': "#81292b"})

In [None]:
# Summary statistics, one row per numeric column.
sp.describe().transpose()

In [None]:
# Put the columns of both frames in alphabetical order.
sp_s = sp.sort_index(axis='columns')
test_s = test.sort_index(axis='columns')

# Keep label/text columns aside in `sp_`; `sp` now holds only the numeric features.
sp_ = sp_s[['target','Song Name', 'Artist']]
sp = sp_s.drop(columns=['Song Name', 'Artist', 'target'])
X_test = test_s


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def vif_show(X_vif, threshold=5.0):
    """Print VIF tables and drop collinear columns from ``X_vif`` in place.

    On each pass the variance inflation factor of every column is computed,
    rounded to two decimals and printed in descending order. If the largest
    VIF exceeds ``threshold`` that column is dropped from ``X_vif`` and the
    process repeats; otherwise the function stops.

    Parameters
    ----------
    X_vif : pandas.DataFrame
        Numeric feature frame. It is MUTATED: high-VIF columns are removed
        from the caller's object, which is how callers obtain the pruned frame.
    threshold : float, default 5.0
        A column is removed while the highest (rounded) VIF is above this value.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): no intercept column is added before calling
    ``variance_inflation_factor`` -- confirm this is the intended VIF variant.
    """
    # Iterative rather than recursive: same passes, no growing call stack.
    while True:
        vif = pd.DataFrame({
            'Features': X_vif.columns,
            'VIF': [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
        })
        vif['VIF'] = vif['VIF'].round(2)
        vif = vif.sort_values(by="VIF", ascending=False)
        print(vif)
        print()

        # Nothing left to inspect (empty frame) or every column is acceptable.
        if vif.empty or not (vif['VIF'].iloc[0] > threshold):
            return

        col = vif['Features'].iloc[0]
        X_vif.drop(columns=[col], inplace=True)
        # f-string so non-string column labels do not break the message.
        print(f'After removing "{col}" from datafame')

# vif_show drops columns from the frame it is given (in place), so after this
# call `sp` only contains the features whose VIF was not above the cut-off.
vif_show(sp)

In [None]:
# Re-attach the label/text columns to the pruned features, then hold out 20 %
# of the rows for validation (fixed seed for a repeatable split).
sp = pd.concat([sp, sp_], axis='columns')
train, val = train_test_split(sp, test_size=0.2, random_state=17)

# Every column except the label is a model input.
features = sp.columns.drop(target)
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree, ensemble
from xgboost import XGBClassifier
!pip install catboost
from catboost import CatBoostClassifier
import xgboost as xgboost
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score

In [None]:
# Positional indices of the string (object-dtype) columns, passed to CatBoost as
# categorical features. `np.object` was removed from NumPy (1.24); the builtin
# `object` is the equivalent dtype spec.
obj = [position for position, dtype in enumerate(X_train.dtypes) if dtype == object]

model = CatBoostClassifier()
model.fit(X_train, y_train, cat_features=obj)

# Evaluate on the held-out validation rows.
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
# `plot_confusion_matrix` was removed from scikit-learn (1.2);
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# estimator / data / display arguments.
from sklearn.metrics import ConfusionMatrixDisplay
fig, ax = plt.subplots()
pcm = ConfusionMatrixDisplay.from_estimator(model, X_val, y_val,
                                            cmap=plt.cm.Blues,
                                            ax=ax, values_format='')
ax.set_title(f'Confusion matrix, n = {len(y_val)}', fontsize=15);

In [None]:
# Probability assigned to the last class, used as the ranking score for ROC AUC.
y_pred_proba = model.predict_proba(X_val)[:, -1]
auc_score = roc_auc_score(y_val, y_pred_proba)
print('AUC score: ', auc_score)

In [None]:
# Side-by-side view: column 0 = predicted class, column 1 = predicted probability.
a = pd.DataFrame([y_pred, y_pred_proba]).transpose()
a

In [None]:
# ROC curve of the validation predictions: points at each threshold plus the joining line.
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)

plt.title('ROC curve')
plt.xlabel('FPR')
plt.scatter(fpr, tpr, color='blue')
plt.plot(fpr, tpr, color='green')
plt.ylabel('TPR')

In [None]:
# Feature importances reported by `model`, most important first.
imp_df = pd.DataFrame({"ColumnName": X_train.columns, "Imp": model.feature_importances_})
imp_df.sort_values("Imp", ascending=False)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# Integer-encode the two text columns so the scikit-learn / XGBoost models below
# can consume them. This overwrites the string values in `sp`.
cols = ['Song Name', 'Artist']
sp[cols] = sp[cols].apply(LabelEncoder().fit_transform)

# Same split as before (same seed), now on the encoded frame.
train, val = train_test_split(sp, test_size=0.2, random_state=17)
features = sp.drop(columns=[target]).columns
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]

# StandardScaler returns bare NumPy arrays; wrap the result back into DataFrames
# so the column labels and row index survive. Later cells call
# `X_train.drop([...])` and `X_train.columns`, which fail on an ndarray.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=features, index=X_val.index)


In [None]:

! pip install category_encoders
from category_encoders import OrdinalEncoder, TargetEncoder
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV, validation_curve

# Exclude the encoded song title from the inputs of the models below.
# NOTE(review): requires X_train / X_val to still be DataFrames at this point.
X_train = X_train.drop(columns=['Song Name'])
X_val = X_val.drop(columns=['Song Name'])

In [None]:

# Baseline: logistic regression with default settings, scored on the validation set.
reg = LogisticRegression().fit(X_train, y_train)

y_reg = reg.predict(X_val)
print(confusion_matrix(y_val, y_reg))
print(classification_report(y_val, y_reg))

In [None]:
# Random forest with default hyper-parameters. The seed is fixed (same value as
# the train/validation split) so the reported metrics are reproducible across runs.
rf = ensemble.RandomForestClassifier(random_state=17)
rf.fit(X_train, y_train)

y_rf = rf.predict(X_val)
print(confusion_matrix(y_val,y_rf))
print(classification_report(y_val, y_rf))

In [None]:
# XGBoost with default settings, scored on the validation set.
xgb = XGBClassifier().fit(X_train, y_train)

y_xgb = xgb.predict(X_val)
print(confusion_matrix(y_val, y_xgb))
print(classification_report(y_val, y_xgb))

In [None]:
# `eval_metric` and `early_stopping_rounds` are estimator parameters in current
# XGBoost; passing them to fit() is deprecated and no longer accepted by recent
# releases, so they are set on the constructor instead.
modelX = XGBClassifier(
    n_estimators=1000,  # upper bound of 1000 trees; early stopping decides how many are actually used
    max_depth=7,        # raised above the library default to cope with high-cardinality features
    learning_rate=0.2,
#     scale_pos_weight=ratio, # apply the class ratio here when the data is imbalanced
    n_jobs=-1,
    eval_metric='error',        # #(wrong cases) / #(all cases)
    early_stopping_rounds=50,   # stop once the monitored error has not improved for 50 rounds
)

# Both sets are tracked so the learning curves can be plotted afterwards; they
# appear in evals_result() as 'validation_0' and 'validation_1' in this order.
eval_set = [(X_train, y_train),
            (X_val, y_val)]

modelX.fit(X_train, y_train,
           eval_set=eval_set)

In [None]:
# Per-round classification error recorded during fit: 'validation_0' is the
# first eval_set entry (training data), 'validation_1' the second (validation data).
results = modelX.evals_result()
train_error = results['validation_0']['error']
val_error = results['validation_1']['error']

epoch = range(1, len(train_error)+1)
fig, ax = plt.subplots()
ax.plot(epoch, train_error, label='Train')
ax.plot(epoch, val_error, label='Validation')
ax.set_ylabel('Classification Error')
ax.set_xlabel('Model Complexity (n_estimators)')
ax.legend();

# Predict once and reuse the result for both reports (the original predicted twice).
y_xgb1 = modelX.predict(X_val)
print(confusion_matrix(y_val,y_xgb1))
print(classification_report(y_val, y_xgb1))

In [None]:
# Feature importances of the early-stopped XGBoost model, most important first.
# (xgboost.plot_importance(modelX) would draw the same information as a chart.)
imp_df1 = pd.DataFrame({"ColumnName": X_train.columns, "Imp": modelX.feature_importances_})
imp_df1.sort_values("Imp", ascending=False)

In [None]:
# Score the candidate songs with `model` (the CatBoost classifier fitted earlier).
# Take an explicit copy: adding a column to a slice of `test_s` would otherwise
# trigger SettingWithCopyWarning and may not behave as intended.
X_test = test_s[features].copy()

y_test = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, -1]   # probability of the last class

# Rank candidates by that probability, highest first. reset_index() keeps the
# original row number as an 'index' column, which the next cell selects.
X_test['Maybe You Like...(%)'] = y_test_proba * 100
X_test = X_test.sort_values('Maybe You Like...(%)', ascending=False).reset_index()

In [None]:
# Top 30 recommendations: artist, title and score (the helper 'index' column is dropped).
features_t = ['index', 'Artist', 'Song Name', 'Maybe You Like...(%)']
recommends = X_test[features_t].drop(columns='index')
recommends.head(30)