In [350]:
import numpy as np
import pandas as pd
# import pandas_profiling as pp
import os
import re
import string
import pickle
import tarfile
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import Pipeline

pd.set_option('display.max_rows', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
s3_location = 's3://djk-ml-sagemaker/music_lyrics/'

In [351]:
final_df = pd.read_csv(f'{s3_location}cleaned_lemmatized_unstopped_df.csv')

In [352]:
final_df.liked.value_counts(dropna = False)

NaN    38071
0.0     1656
1.0     1639
Name: liked, dtype: int64

In [353]:
# final_df.profile_report()

In [354]:
non_null_df = final_df[final_df['liked'].notnull()].copy()

In [355]:
non_null_df.shape

(3295, 5)

In [356]:
null_df = final_df[final_df['liked'].isnull()].copy()

In [357]:
X, y = non_null_df['cleaned_lyrics'], non_null_df['liked']

In [358]:
train_value_counts = y.value_counts()

train_value_counts

majority_count = train_value_counts[train_value_counts.index == 0].iloc[0]
target_count = train_value_counts[train_value_counts.index == 1].iloc[0]

majority_ratio = (majority_count)/(majority_count+target_count)

0.0    1656
1.0    1639
Name: liked, dtype: int64

In [359]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 42)

In [360]:
# preprocessing steps

tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
variance_filter = VarianceThreshold(.0005)

word_features = ['cleaned_lyrics']

steps = [
#     ('count_vect', count_vect_no_stops),
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('variance_filter', variance_filter) # removes low variance columns from dataset
]

word_transformer = Pipeline(steps)

# gridsearch params/pipeline

In [361]:
word_transformer.fit_transform(X_train)

<2636x397 sparse matrix of type '<class 'numpy.float64'>'
	with 65624 stored elements in Compressed Sparse Row format>

In [362]:
all_columns = word_transformer.named_steps['tfidf_vectorizer'].get_feature_names()
filtered_columns = word_transformer.named_steps['variance_filter'].get_support(indices = True)

vectorized_X_train = pd.DataFrame(
    word_transformer.fit_transform(X_train).toarray(),
    columns = [all_columns[i] for i in filtered_columns]
)

In [363]:
vectorized_X_train.head()
vectorized_X_train.shape

Unnamed: 0,act,ah,air,alive,alright,angel,animal,answer,anybody,arm,...,world,worry,wrong,ya,yeah,year,yes,yesterday,yo,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.052665,0.0,0.0,0.0,0.0,0.088649,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.056204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.260352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(2636, 397)

In [364]:
def model_selection(X_train, y_train, list_of_models:list):
    '''
    this functions takes in a list of classification algorithms with default hyperparam settings
    so we can find the ones that are the most promising
    '''
    
    roc_auc_list = []
    
    importances = []
    
    for model in list_of_models:
        model.fit(X_train, y_train)
        
        feature_importance_dict = {}
        
        try:
            for feature, importance in zip(X_train.columns, model.feature_importances_):
                feature_importance_dict[feature] = importance
        except:
            pass
            
        importance_df = pd.DataFrame.from_dict(feature_importance_dict, orient = 'index')
        
        importances.append(importance_df)
        
        quick_roc_auc = cross_val_score(
            model,
            X_train,
            y_train,
            scoring = 'roc_auc',
            cv = StratifiedKFold(10)
        )
        
        roc_auc_list.append(np.mean(quick_roc_auc))
    
    return roc_auc_list, importances

In [365]:
list_of_models = [
    MultinomialNB(),
    LogisticRegression(random_state = 42),
    SVC(random_state = 42),
    RandomForestClassifier(random_state = 42),
    KNeighborsClassifier(),
    XGBClassifier(random_state = 42)
]

In [366]:
model_selection_roc_aucs, importances = model_selection(vectorized_X_train, y_train, list_of_models)



In [367]:
model_selection_roc_aucs
importances

[0.5550139538605836,
 0.560144619765246,
 0.5564328412653087,
 0.5729444748089285,
 0.5283464785723588,
 0.566533049431667]

[Empty DataFrame
 Columns: []
 Index: [], Empty DataFrame
 Columns: []
 Index: [], Empty DataFrame
 Columns: []
 Index: [],                      0
 act           0.001949
 ah            0.003459
 air           0.002539
 alive         0.002364
 alright       0.002611
 angel         0.000740
 animal        0.000413
 answer        0.001489
 anybody       0.000367
 arm           0.001253
 ask           0.001910
 away          0.007598
 babe          0.000399
 baby          0.008308
 bad           0.004834
 bang          0.000748
 bass          0.001458
 bear          0.002915
 beat          0.002257
 beautiful     0.000745
 begin         0.000918
 believe       0.003660
 belong        0.001347
 better        0.001489
 big           0.002373
 bit           0.000130
 bitch         0.002005
 black         0.002309
 blind         0.001557
 blood         0.002692
 blow          0.004048
 blue          0.001678
 body          0.002070
 boy           0.005189
 break         0.002693
 breathe     

In [368]:
pd.set_option('display.max_rows', None)

random_forest_top_features = importances[3][0].sort_values(ascending = False)[:50]
xgb_top_features = importances[5][0].sort_values(ascending = False)[:50]

In [369]:
random_forest_top_features
xgb_top_features

know       0.018959
yeah       0.012899
want       0.012732
love       0.011587
come       0.011437
oh         0.011412
like       0.009961
feel       0.009281
hear       0.008971
let        0.008537
thing      0.008474
baby       0.008308
day        0.008283
right      0.008229
time       0.008147
need       0.008093
lose       0.007978
say        0.007851
think      0.007795
look       0.007690
tell       0.007649
away       0.007598
way        0.007109
hey        0.006971
hand       0.006713
light      0.006707
night      0.006460
long       0.006427
man        0.006370
eye        0.006341
throw      0.006247
girl       0.006175
face       0.006098
live       0.006054
start      0.005881
leave      0.005861
home       0.005834
mind       0.005726
cause      0.005723
dream      0.005719
try        0.005667
true       0.005600
good       0.005573
walk       0.005356
turn       0.005258
stay       0.005191
boy        0.005189
heart      0.005182
tonight    0.005176
whoa       0.005137


town        0.016324
money       0.012873
yes         0.012717
hear        0.012581
right       0.011584
arm         0.011488
damn        0.011459
kick        0.011148
big         0.010778
hate        0.010303
scream      0.010198
room        0.010052
run         0.010035
walk        0.009701
whoa        0.009527
hey         0.009510
rock        0.009500
music       0.009449
away        0.009367
soon        0.009252
home        0.009207
hide        0.009176
sweet       0.009169
feeling     0.009120
boy         0.009044
ride        0.008982
remember    0.008881
nice        0.008874
happy       0.008803
til         0.008636
yeah        0.008512
come        0.008330
leave       0.008306
late        0.008288
let         0.008269
man         0.008242
say         0.008121
old         0.008057
free        0.008046
good        0.007717
black       0.007675
sit         0.007651
lie         0.007615
longer      0.007560
sky         0.007490
true        0.007417
die         0.007343
wave        0

In [370]:
top_feature_overlap = set(random_forest_top_features.index) & set(xgb_top_features.index)

top_feature_overlap

len(top_feature_overlap)/50

{'away',
 'boy',
 'come',
 'good',
 'hear',
 'hey',
 'home',
 'leave',
 'let',
 'man',
 'right',
 'say',
 'true',
 'walk',
 'whoa',
 'yeah'}

0.32

#### there is a 32% overlap in the top 50 features for random forest and xgboost

In [371]:
vectorized_X_train.to_csv('data/vectorized_train.csv')

#### taking the top 2 performing models and optimizing hyperparameters for them

In [372]:
def find_best_model(model_hyperparam_dict, X_train, y_train):
    '''
    this function finds the best model.
    takes in a dictionary with multiple algorithms and hyperparameters
    returns the best model based on roc_auc cross val scores
    also leaves a file with best params
    '''
    
    keys = [k for k, v in model_hyperparam_dict.items()]
    
    cv_results = []
    best_scores = []
    best_estimators = []
    classification_reports = []
    
    for key in keys:
        
        model = model_hyperparam_dict[key]['model']
        hyperparams = model_hyperparam_dict[key]['hyperparams']
        
        print(f'randomsearching {key}...')
        print(f'using hyperparams: \n{hyperparams}')
        
        grid = RandomizedSearchCV(
            estimator = model,
            param_distributions = hyperparams,
            n_iter = 100,
            scoring = 'roc_auc',
            n_jobs = -1,
            cv = StratifiedKFold(10, random_state = 42),
            verbose = 2
        )
        
        grid.fit(X_train, y_train)
        
        cv_results.append(grid.cv_results_)
        best_scores.append(grid.best_score_)
        best_estimators.append(grid.best_estimator_)
        
    return cv_results, best_scores, best_estimators

In [373]:
random_forest_hyperparams = {
    'bootstrap':[True, False],
    'max_depth':[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features':['auto', 'sqrt'],
    'min_samples_leaf':[1, 2, 4],
    'min_samples_split':[2, 5, 10],
    'n_estimators':[100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

xgb_hyperparams = {
    'max_depth':range(1, 15),
    'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators':[100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'gamma':[0, .01, .1, 1],
    'min_child_weight':[1, 3, 5, 7],
    'colsample_bytree':[.3, .4, .5, .6]
}

In [374]:
model_hyperparam_dict = {
    'random_forest' : {
        'model': RandomForestClassifier(random_state = 42),
        'hyperparams' : random_forest_hyperparams
    },
    
    'xgboost' : {
        'model':XGBClassifier(random_state = 42),
        'hyperparams' : xgb_hyperparams
    }
    
}

In [375]:
cv_results, best_scores, best_estimators = find_best_model(model_hyperparam_dict, vectorized_X_train, y_train)

randomsearching random_forest...
using hyperparams: 
{'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 21.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 68.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 107.5min finished


randomsearching xgboost...
using hyperparams: 
{'max_depth': range(1, 15), 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3], 'n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'gamma': [0, 0.01, 0.1, 1], 'min_child_weight': [1, 3, 5, 7], 'colsample_bytree': [0.3, 0.4, 0.5, 0.6]}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 47.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 121.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 210.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 345.9min finished


In [376]:
best_scores
best_estimators

[0.5928416427745494, 0.584827035044296]

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=2, min_samples_split=10,
                        min_weight_fraction_leaf=0.0, n_estimators=1200,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=0.4, gamma=0,
               learning_rate=0.001, max_delta_step=0, max_depth=13,
               min_child_weight=1, missing=None, n_estimators=1200, n_jobs=1,
               nthread=None, objective='binary:logistic', random_state=42,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
               silent=None, subsample=1, verbosity=1)]

In [377]:
best_random_forest = best_estimators[0]

In [379]:
best_xgb = best_estimators[1]

In [380]:
second_pipeline_steps = [
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('variance_filter', variance_filter), # removes low variance columns from dataset
    ('classifier', best_random_forest)
]

In [381]:
third_pipeline_steps = [
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('variance_filter', variance_filter), # removes low variance columns from dataset
    ('classifier', best_xgb)
]

In [382]:
final_random_forest_model = Pipeline(second_pipeline_steps)

In [383]:
final_random_forest_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf_vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0

In [384]:
final_xgb_model = Pipeline(third_pipeline_steps)
final_xgb_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf_vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.4, gamma=0,
                               learning_rate=0.001, max_delta_step=0,
                               max_depth=13, min_child_weight=1, mi

In [385]:
random_forest_preds = final_random_forest_model.predict(X_test)
random_forest_probas = final_random_forest_model.predict_proba(X_test)

In [386]:
xgb_preds = final_xgb_model.predict(X_test)
xgb_probas = final_xgb_model.predict_proba(X_test)

In [387]:
roc_auc_score(y_test, random_forest_preds)

0.536724961720811

In [388]:
print(classification_report(y_test, random_forest_preds))

              precision    recall  f1-score   support

         0.0       0.55      0.63      0.59       342
         1.0       0.53      0.44      0.48       317

    accuracy                           0.54       659
   macro avg       0.54      0.54      0.53       659
weighted avg       0.54      0.54      0.54       659



In [389]:
pd.Series([l[1] for l in probas]).value_counts(bins = 20).sort_index()

(0.129, 0.167]      2
(0.167, 0.203]      1
(0.203, 0.239]      1
(0.239, 0.275]      5
(0.275, 0.311]      8
(0.311, 0.347]     18
(0.347, 0.383]     44
(0.383, 0.419]     66
(0.419, 0.455]     76
(0.455, 0.491]    111
(0.491, 0.527]     89
(0.527, 0.563]     63
(0.563, 0.599]     43
(0.599, 0.635]     34
(0.635, 0.671]     46
(0.671, 0.707]     25
(0.707, 0.743]     12
(0.743, 0.779]      5
(0.779, 0.815]      7
(0.815, 0.851]      3
dtype: int64

In [390]:
roc_auc_score(y_test, xgb_preds)

0.5618047484642205

In [391]:
print(classification_report(y_test, xgb_preds))

              precision    recall  f1-score   support

         0.0       0.58      0.55      0.57       342
         1.0       0.54      0.57      0.56       317

    accuracy                           0.56       659
   macro avg       0.56      0.56      0.56       659
weighted avg       0.56      0.56      0.56       659



In [392]:
pd.Series([l[1] for l in xgb_probas]).value_counts(bins = 20).sort_index()

(0.291, 0.311]     1
(0.311, 0.329]     3
(0.329, 0.347]     1
(0.347, 0.366]     5
(0.366, 0.384]    14
(0.384, 0.402]    14
(0.402, 0.42]     24
(0.42, 0.439]     31
(0.439, 0.457]    52
(0.457, 0.475]    66
(0.475, 0.493]    90
(0.493, 0.512]    62
(0.512, 0.53]     86
(0.53, 0.548]     51
(0.548, 0.566]    36
(0.566, 0.585]    88
(0.585, 0.603]    16
(0.603, 0.621]    12
(0.621, 0.639]     3
(0.639, 0.658]     4
dtype: int64

In [393]:
pickle.dump(final_xgb_model, open('xgb_model.sav', 'wb'))

In [394]:
tar = tarfile.open("xgb_model.tar.gz", "w:gz")
tar.add('xgb_model.sav')

In [43]:
vectorized_recommendations = word_transformer.transform(null_df['cleaned_lyrics'])

In [44]:
null_df.shape

(38071, 6)

In [50]:
null_df['recommendation'] = [l[1] for l in nb.predict_proba(vectorized_recommendations)]

In [51]:
null_df.to_csv('first_recommendations.csv', index = False)

In [137]:
null_df.head()

Unnamed: 0,lyrics,song_title,artist_name,liked,cleaned_lyrics,playlist
3,\n\n[Verse 1]\nI can't remember\nThe words wer...,If You Want It,Jay Som,,remember word form mouth have find bring joy p...,1.0
4,\n\n[Verse 1]\nI'm not that kind of fool\nWho ...,Superbike,Jay Som,,kind fool need read room somebody tell fall li...,1.0
5,\n\n[Verse 1]\nPoint me to my chair\nMake me s...,Peace Out,Jay Som,,point chair sing awful song bear go hard hard ...,1.0
6,\n\n[Verse 1]\nUsed to be the one to cry\nAnd ...,Devotion,Jay Som,,cry feel emotion need path find strange devoti...,1.0
7,\n\n[Verse 1]\nI'm sinking in my bed\nWe’re le...,Nighttime Drive,Jay Som,,sink bed be leave town tomorrow memory feel nu...,0.0


In [141]:
null_df_count_vect = count_vect_no_stops.transform(null_df['cleaned_lyrics'])
null_df_tfidf = tfidf_transformer.transform(null_df_count_vect)
music_to_listen = sgd_ns.predict_proba(null_df_tfidf)

In [142]:
null_df['playlist'] = music_to_listen

In [147]:
len(null_df[null_df['playlist'] > .5])
len(null_df[null_df['playlist'] < .5])

21744

16327