In [None]:
import os
import time
import timeit

import numpy as np
import pandas as pd
import sqlalchemy
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

import xgboost as xgb

# Emit inline figures in SVG format and render them inside the notebook.
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

# Notebook-wide plotting defaults: default figure size and seaborn theme.
plt.rcParams['figure.figsize'] = (20, 10)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [None]:
# Read the connection string from the DATABASE_URL environment variable so that
# real credentials never have to be saved in the notebook. The fallback keeps
# the original local-development URL working unchanged.
engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://user:PASSWORD@localhost:5432/project3')
)

In [None]:
# Preview the raw contents of the allgenres table.
query = 'SELECT * FROM allgenres;'

pd.read_sql(sql=query, con=engine)

In [None]:
# Number of rows per genre, computed in SQL.
query = 'SELECT genre, COUNT(*) FROM allgenres GROUP BY genre;'

pd.read_sql(sql=query, con=engine)

In [None]:
# Mean energy per genre, computed in SQL.
query = 'SELECT AVG(energy) as "Energy Avg", genre FROM allgenres GROUP BY genre;'

pd.read_sql(sql=query, con=engine)

In [None]:
# TODO: do enough SQL-based EDA to satisfy the project requirements

In [None]:
# Load the complete allgenres table into a DataFrame; the cells below work from
# df_main. (The query reads a single table - no joins are performed here.)
query = 'SELECT * FROM allgenres;'

df_main = pd.read_sql(sql=query, con=engine)

In [None]:
# Display the loaded table.
df_main

In [None]:
# Distinct genre labels present in the data.
genres_list = df_main['genre'].unique()

## CLEANING / EDA ## 

In [None]:
# (rows, columns) of the loaded table.
df_main.shape

In [None]:
# Column dtypes as inferred when reading from SQL.
df_main.dtypes

In [None]:
# Cast the genre column to str, overwriting it in place on df_main.
df_main['genre'] = df_main['genre'].astype(str)

In [None]:
# Select the numeric feature columns used for scaling, plotting and modeling.
df_numeric = df_main.loc[:, [
    'length',
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]]

In [None]:
# True if any cell anywhere in df_main is missing.
df_main.isna().to_numpy().any()

In [None]:
# Reshape to long format (one row per feature/value pair) and draw one box per
# feature on the unscaled values.
sns.boxplot(data=df_numeric.melt(), x="variable", y="value")

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Display the numeric feature subset.
df_numeric

In [None]:
# Remember the feature names; they are re-applied after scaling.
numeric_columns = df_numeric.columns

In [None]:
# Standardize every numeric feature.
# NOTE(review): the scaler is fit on all rows, before the train/validation/test
# split further down, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
df_numeric_scaled = scaler.fit(df_numeric).transform(df_numeric)

In [None]:
# Display the scaler output.
df_numeric_scaled

In [None]:
# Wrap the scaler output in a DataFrame (column labels are assigned in a later cell).
df_numeric_scaled = pd.DataFrame(data=df_numeric_scaled)

In [None]:
# Display the scaled values as a DataFrame.
df_numeric_scaled

In [None]:
# Re-apply the original feature names to the scaled frame.
df_numeric_scaled.columns = numeric_columns

In [None]:
# Attach the (unscaled) genre label; assignment aligns on the row index.
df_numeric_scaled['genre'] = df_main['genre']

In [None]:
# Display the scaled features together with the genre label.
df_numeric_scaled

In [None]:
# Box plot of the scaled features, drawn on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(15,  10))
df_numeric_scaled.boxplot(ax=ax)

In [None]:
# One distribution plot per numeric feature, each on its own figure.
# Only numeric columns are iterated: the string-valued 'genre' column that was
# attached to df_numeric_scaled cannot be plotted as a numeric distribution.
for i, col in enumerate(df_numeric_scaled.select_dtypes(include='number').columns):
    plt.figure(i)
    sns.distplot(df_numeric_scaled[col])

In [None]:
# Display the distinct genre labels.
genres_list 

In [None]:
# Split the scaled data into one frame per genre; get_dist_plot reads these
# frames from the notebook namespace.
(
    classical_df,
    country_df,
    electronic_df,
    hiphop_df,
    jazz_df,
    latin_df,
    pop_df,
    rock_df,
) = (
    df_numeric_scaled[df_numeric_scaled['genre'] == genre_name]
    for genre_name in (
        'classical', 'country', 'electronic', 'hiphop',
        'jazz', 'latin', 'pop', 'rock',
    )
)

In [None]:
def get_dist_plot(feature):
    """Overlay shaded KDE curves of one feature, one curve per genre.

    Opens a new 12x10 figure, draws one curve for each per-genre frame
    (classical_df ... rock_df, read from the notebook's global namespace),
    then adds a legend and a title built from the feature name.

    Parameters
    ----------
    feature : str
        Name of a column present in every per-genre frame.
    """
    labelled_frames = (
        ('classical', classical_df),
        ('country', country_df),
        ('electronic', electronic_df),
        ('hiphop', hiphop_df),
        ('jazz', jazz_df),
        ('latin', latin_df),
        ('pop', pop_df),
        ('rock', rock_df),
    )
    plt.figure(figsize=(12,10))
    for genre_label, genre_frame in labelled_frames:
        sns.distplot(genre_frame[feature], label=genre_label, hist=False, kde_kws={"shade": True})
    plt.legend(loc='upper right', fontsize=17)
    plt.title(feature.capitalize(), size=32)

In [None]:
# List the available column names (the valid arguments for get_dist_plot, plus 'genre').
df_numeric_scaled.columns

In [None]:
# Track length distribution per genre.
get_dist_plot(feature='length')

In [None]:
# Popularity distribution per genre.
get_dist_plot(feature='popularity')

In [None]:
# Danceability distribution per genre; the default title is replaced and the
# figure is written to disk.
get_dist_plot(feature='danceability')
plt.title(label='Custom Data Set', size=32)
plt.savefig('new_data_dance')

In [None]:
# Energy distribution per genre.
get_dist_plot(feature='energy')

In [None]:
# Loudness distribution per genre.
get_dist_plot(feature='loudness')

In [None]:
# Speechiness distribution per genre.
get_dist_plot('speechiness')

In [None]:
# Acousticness distribution per genre.
get_dist_plot('acousticness')

In [None]:
# Instrumentalness distribution per genre.
get_dist_plot('instrumentalness')

In [None]:
# Liveness distribution per genre.
get_dist_plot('liveness')

In [None]:
# Valence distribution per genre.
get_dist_plot('valence')

In [None]:
# Tempo distribution per genre.
get_dist_plot('tempo')

In [None]:
# Pairwise scatter plots of every scaled feature, coloured by genre.
sns.pairplot(data=df_numeric_scaled, hue='genre')

## Doing MODELING Things ##

In [None]:
# Features: every scaled numeric column.
# Target: the genre label, kept as a ONE-COLUMN DataFrame (not a Series) because
# the modeling cells index the target as y_train['genre'] / y_val['genre'];
# with a Series those lookups raise KeyError on a freshly restarted kernel.
X = df_numeric_scaled.loc[:, df_numeric_scaled.columns != 'genre']
y = df_numeric_scaled.loc[:, ['genre']]

In [None]:
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=998,
)

In [None]:
# Carve a validation set (25% of the remaining rows) out of the training data.
# NOTE(review): this overwrites X_train / y_train, so re-running the cell
# splits the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=777,
)

In [None]:
#test knn
# Baseline: k-nearest neighbours (k=10), timed and scored on the validation split.
knn = KNeighborsClassifier(n_neighbors=10)
start = time.time()  # same timer name as the other model cells and as the print below
knn.fit(X_train, y_train['genre'])
stop = time.time()
print("Training: {:6.2f}%".format(100*knn.score(X_train, y_train['genre'])))
print("Validation set: {:6.2f}%".format(100*knn.score(X_val, y_val['genre'])))
# roc_auc_score returns a fraction in [0, 1]; scale it so the '%' label is accurate.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], knn.predict_proba(X_val), multi_class="ovr")))
print(f"Training time: {stop - start}s")

In [None]:
#test logistic
# Baseline: logistic regression with default settings, timed and scored on the
# validation split.
logit = LogisticRegression()
start = time.time()
logit.fit(X_train, y_train['genre'])
stop = time.time()
print("Training: {:6.2f}%".format(100*logit.score(X_train, y_train['genre'])))
# This score is computed on X_val, so it is labelled as the validation set.
print("Validation set: {:6.2f}%".format(100*logit.score(X_val, y_val['genre'])))
# roc_auc_score returns a fraction in [0, 1]; scale it so the '%' label is accurate.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], logit.predict_proba(X_val), multi_class="ovr")))
print(f"Training time: {stop - start}s")

In [None]:
#test naive bayes
# Baseline: Gaussian naive Bayes, timed and scored on the validation split.
nb = GaussianNB()
start = time.time()
nb.fit(X_train, y_train['genre'])
stop = time.time()
print("Training: {:6.2f}%".format(100*nb.score(X_train, y_train['genre'])))
print("Validation set: {:6.2f}%".format(100*nb.score(X_val, y_val['genre'])))
# roc_auc_score returns a fraction in [0, 1]; scale it so the '%' label is accurate.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], nb.predict_proba(X_val), multi_class="ovr")))
print(f"Training time: {stop - start}s")

In [None]:
# test randomforest
# Baseline: random forest (500 trees, depth <= 20), timed and scored on the
# validation split.
rf = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=888)
start = time.time()
rf.fit(X_train, y_train['genre'])
stop = time.time()
print("Training: {:6.2f}%".format(100*rf.score(X_train, y_train['genre'])))
print("Validation set: {:6.2f}%".format(100*rf.score(X_val, y_val['genre'])))
# Probabilities must come from rf itself (not the logistic model), and
# roc_auc_score returns a fraction in [0, 1], so scale it to match the '%' label.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], rf.predict_proba(X_val), multi_class="ovr")))
print(f"Training time: {stop - start}s")

In [None]:
#convert target labels to ints for XGB -> i dont think I have to do this??

# Make sure each target is a DataFrame with a 'genre' column (this works whether
# the incoming object is a Series or already a DataFrame).
y_train = pd.DataFrame(data= y_train)
y_val = pd.DataFrame(data= y_val)
y_test = pd.DataFrame(data= y_test)

le = preprocessing.LabelEncoder()

# Fit and apply the encoder on the 'genre' column only. Passing the whole frame
# breaks as soon as 'categorical_label' exists, i.e. whenever this cell is re-run.
le.fit(y_train['genre'])
y_train['categorical_label'] = le.transform(y_train['genre'])
y_val['categorical_label'] = le.transform(y_val['genre'])
y_test['categorical_label'] = le.transform(y_test['genre'])



In [None]:
# Gradient-boosted trees trained on the integer-encoded labels. Training stops
# early once the metric on the last eval_set entry (the validation split) has
# not improved for 30 rounds.
bst = xgb.XGBClassifier(
        base_score=0.5,
        booster='gbtree',
        colsample_bylevel=1,
        colsample_bynode=1,
        colsample_bytree=0.5,
        eval_metric='merror',
        gamma=0,
        gpu_id=-1,
        importance_type='gain',
        interaction_constraints='',
        learning_rate=0.1,
        max_delta_step=0,
        max_depth=5,
        min_child_weight=3,
        monotone_constraints='()',
        n_estimators=10000,
        n_jobs=8,
        num_class=14,
        num_parallel_tree=1,
        # softprob (rather than softmax) so predict_proba is available for the
        # ROC AUC below; predict() still returns the most probable class.
        objective='multi:softprob',
        random_state=0,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=None,
        subsample=0.8,
        tree_method='exact',
        use_label_encoder=False,
        validate_parameters=1,
        verbosity=None
)

eval_set=[(X_train, y_train['categorical_label']),(X_val, y_val['categorical_label'])]

start = time.time()

fit_xgb = bst.fit(
            X_train, y_train['categorical_label'],
            eval_set=eval_set,
            eval_metric='merror',
            early_stopping_rounds=30,
            verbose=False)

stop = time.time()

print(accuracy_score(y_test['categorical_label'], bst.predict(X_test, ntree_limit=bst.best_ntree_limit)))
# ROC AUC must be computed from this model's own probabilities against the
# integer labels it was trained on (not from the logistic model), and scaled
# from a [0, 1] fraction to match the '%' label.
val_proba = bst.predict_proba(X_val, ntree_limit=bst.best_ntree_limit)
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['categorical_label'], val_proba, multi_class="ovr")))
print(f"Training time: {stop - start}s")

## REFINE Random Forest ##

In [None]:
# TODO: decide whether to reduce the feature set or engineer new features

In [None]:
# Attach the target to the training features (mutates X_train in place) and
# display the result; the next cell drops the column again.
X_train['genre'] = y_train['genre']
X_train

In [None]:
# Remove the target column again so X_train holds features only.
# errors='ignore' makes the cell safe to re-run after the column is already gone.
X_train = X_train.drop(columns=['genre'], errors='ignore')

In [None]:
# Display the training features.
X_train

In [None]:
#polynomial feature transform
# interaction_only=True adds products of distinct features but no pure powers.
poly = PolynomialFeatures(interaction_only=True)
X_train_poly = poly.fit_transform(X_train)

# Name the generated columns and keep the result as a DataFrame: later cells
# read X_train_poly.columns and select columns by name. The accessor for the
# names differs between scikit-learn releases, so use whichever one exists.
_poly_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
poly_features = list(_poly_name_getter(list(X_train.columns)))
X_train_poly = pd.DataFrame(X_train_poly, columns=poly_features, index=X_train.index)
X_train_poly

In [None]:
# Apply the transformer that was fitted on the training data; validation data
# must only be transformed, never used for fitting.
X_val_poly = poly.transform(X_val)
X_val_poly

In [None]:
# Label the polynomial columns. The names are derived from the fitted
# transformer here, so this cell does not depend on a variable defined in some
# other cell. The accessor differs between scikit-learn releases, so use
# whichever one exists.
_poly_name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
poly_features = list(_poly_name_getter(list(X_val.columns)))
X_val_poly = pd.DataFrame(X_val_poly, columns=poly_features, index=X_val.index)
X_val_poly

In [None]:
#re-run
# Same random forest configuration, now trained on the polynomial features.
rf = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=888)
rf.fit(X_train_poly, y_train['genre'])
print("Training: {:6.2f}%".format(100*rf.score(X_train_poly, y_train['genre'])))
print("Validation set: {:6.2f}%".format(100*rf.score(X_val_poly, y_val['genre'])))
# roc_auc_score returns a fraction in [0, 1]; scale it so the '%' label is accurate.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], rf.predict_proba(X_val_poly), multi_class="ovr")))

In [None]:
# Pair every polynomial feature with its importance score and rank the pairs
# from most to least important.
importances = sorted(
    zip(rf.feature_importances_, X_train_poly.columns),
    reverse=True,
)
importances

In [None]:
# Names of the 20 highest-ranked features.
top_20_features = [feature_name for _score, feature_name in importances[:20]]

In [None]:
# Keep only the top-ranked columns of the training features.
X_train_slim = X_train_poly.loc[:, top_20_features]

In [None]:
# Same column selection for the validation features.
X_val_slim = X_val_poly.loc[:, top_20_features]

In [None]:
# Show the current random forest hyperparameters before tuning them.
print('Parameters currently in use:\n', rf.get_params(), sep='\n')

In [None]:
# Display the training targets.
y_train

In [None]:
# Candidate values for each random forest hyperparameter.

# Number of trees: 50 evenly spaced values from 500 to 5000, truncated to ints.
n_estimators = np.linspace(start=500, stop=5000, num=50).astype(int).tolist()
# Number of features considered at each split.
# NOTE(review): for classifiers 'auto' appears to be an alias of 'sqrt' - confirm
# against the installed scikit-learn version.
max_features = ['auto', 'sqrt']
# Maximum tree depth: 50 evenly spaced values from 10 to 100, truncated to ints.
max_depth = np.linspace(10, 100, num=50).astype(int).tolist()
# Minimum number of samples needed to split an internal node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required in a leaf.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Search space handed to RandomizedSearchCV.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# Randomized search: 100 sampled configurations, each scored with 5-fold
# cross-validation on the reduced feature set.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=424,
)

rf_random.fit(X_train_slim, y_train['genre'])

In [None]:
# Hyperparameters of the best configuration found by the search.
rf_random.best_params_

In [None]:
# Keep a handle on the best estimator; the cells below refit this same object
# on different feature sets.
random_search_best = rf_random.best_estimator_

In [None]:
#re-run on thinned features
# Refit the tuned forest on the top-20 features and score it on validation.
random_search_best.fit(X_train_slim, y_train['genre'])
print("Training: {:6.2f}%".format(100*random_search_best.score(X_train_slim, y_train['genre'])))
print("Validation set: {:6.2f}%".format(100*random_search_best.score(X_val_slim, y_val['genre'])))
# roc_auc_score returns a fraction in [0, 1]; scale it so the '%' label is accurate.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], random_search_best.predict_proba(X_val_slim), multi_class="ovr")))

In [None]:
#re-run new model on all poly features
# Refit the tuned forest on the full polynomial feature set and score it on validation.
random_search_best.fit(X_train_poly, y_train['genre'])
print("Training: {:6.2f}%".format(100*random_search_best.score(X_train_poly, y_train['genre'])))
print("Validation set: {:6.2f}%".format(100*random_search_best.score(X_val_poly, y_val['genre'])))
# roc_auc_score returns a fraction in [0, 1]; scale it so the '%' label is accurate.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], random_search_best.predict_proba(X_val_poly), multi_class="ovr")))

In [None]:
#re run new model on original features
# Refit the tuned forest on the original (non-polynomial) features and score it
# on validation. The estimator is left in this state for the test-set cells.
random_search_best.fit(X_train, y_train['genre'])
print("Training: {:6.2f}%".format(100*random_search_best.score(X_train, y_train['genre'])))
print("Validation set: {:6.2f}%".format(100*random_search_best.score(X_val, y_val['genre'])))
# roc_auc_score returns a fraction in [0, 1]; scale it so the '%' label is accurate.
print("ROC AUC: {:6.2f}%".format(100*roc_auc_score(y_val['genre'], random_search_best.predict_proba(X_val), multi_class="ovr")))

In [None]:
#re run new model on test set features
# Accuracy of the tuned forest on the held-out test set.
print("Test set: {:6.2f}%".format(100*random_search_best.score(X_test, y_test['genre'])))

In [None]:
# Rank the original features by their importance in the tuned forest,
# most important first.
rf_final_importances = sorted(
    zip(random_search_best.feature_importances_, X_train.columns),
    reverse=True,
)
rf_final_importances

In [None]:
# Confusion matrix of the tuned forest on the test set, saved as a PNG.
plot_confusion_matrix(random_search_best, X_test, y_test['genre'], cmap=plt.cm.Blues)
plt.tight_layout()
plt.title('Actual vs. Predicted')
# Turn off the grid lines that the notebook-wide whitegrid style would draw.
plt.grid(False)
plt.savefig("randomforest_conf_mat.png")