In [37]:
import numpy as np
import pandas as pd
# import pandas_profiling as pp
import os
import re
import boto3
import string
import pickle
import tarfile
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import Pipeline

pd.set_option('display.max_rows', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### download data from s3

In [38]:
s3_location = 's3://djk-ml-sagemaker/music_lyrics/'

In [39]:
final_df = pd.read_csv(f'{s3_location}cleaned_lemmatized_unstopped_df.csv')

In [40]:
final_df.head()

Unnamed: 0,lyrics,song_title,artist_name,liked,cleaned_lyrics
0,\n\n[Verse 1]\nI can't remember\nThe words wer...,If You Want It,Jay Som,,remember word form mouth have find bring joy p...
1,\n\n[Verse 1]\nI'm not that kind of fool\nWho ...,Superbike,Jay Som,,kind fool need read room somebody tell fall li...
2,\n\n[Verse 1]\nPoint me to my chair\nMake me s...,Peace Out,Jay Som,,point chair sing awful song bear go hard hard ...
3,\n\n[Verse 1]\nUsed to be the one to cry\nAnd ...,Devotion,Jay Som,,cry feel emotion need path find strange devoti...
4,\n\n[Verse 1]\nI'm sinking in my bed\nWe’re le...,Nighttime Drive,Jay Som,,sink bed be leave town tomorrow memory feel nu...


In [41]:
final_df.liked.value_counts(dropna = False)

NaN    38071
0.0     1656
1.0     1639
Name: liked, dtype: int64

In [42]:
# final_df.profile_report()

### creating training/testing datasets, only choosing liked/not liked songs, we will apply the finalized model to unrated music to get recommendations

In [43]:
non_null_df = final_df[final_df['liked'].notnull()].copy()

In [44]:
non_null_df.shape

(3295, 5)

In [45]:
# this will be the pool of songs that will be recommended to us

null_df = final_df[final_df['liked'].isnull()].copy()

In [46]:
X, y = non_null_df['cleaned_lyrics'], non_null_df['liked']

In [47]:
train_value_counts = y.value_counts()

train_value_counts

majority_count = train_value_counts[train_value_counts.index == 0].iloc[0]
target_count = train_value_counts[train_value_counts.index == 1].iloc[0]

majority_ratio = (majority_count)/(majority_count+target_count)

0.0    1656
1.0    1639
Name: liked, dtype: int64

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 42)

### saving training/testing to s3

In [49]:
def write_to_s3(filename, bucket, key):
    '''
    this function uploads files from local directory to s3
    '''
    
    with open(filename,'rb') as f: # Read in binary mode
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [50]:
# defining s3 bucket/key paths

bucket_name = 'djk-ml-sagemaker'

data_folder = 'music_lyrics'

s3_training_location = f's3://{data_folder}'

In [51]:
train = pd.concat([y_train, X_train], axis = 1)
test = pd.concat([y_test, X_test], axis = 1)

In [52]:
train.to_csv('train.csv', index = None)
test.to_csv('test.csv', index = None)

In [53]:
null_df.to_csv('eligible_song_pool.csv', index = None)

In [54]:
write_to_s3('train.csv', bucket_name, 'music_lyrics/train.csv')

In [55]:
write_to_s3('test.csv', bucket_name, 'music_lyrics/test.csv')

In [56]:
write_to_s3('eligible_song_pool.csv', bucket_name, 'music_lyrics/eligible_song_pool.csv')

### building preprocessing pipeline

In [57]:
# preprocessing steps

tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
variance_filter = VarianceThreshold(.0005)

word_features = ['cleaned_lyrics']

steps = [
#     ('count_vect', count_vect_no_stops),
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('variance_filter', variance_filter) # removes low variance columns from dataset
]

word_transformer = Pipeline(steps)

# gridsearch params/pipeline

In [58]:
word_transformer.fit_transform(X_train)

<2636x397 sparse matrix of type '<class 'numpy.float64'>'
	with 65624 stored elements in Compressed Sparse Row format>

In [59]:
all_columns = word_transformer.named_steps['tfidf_vectorizer'].get_feature_names()
filtered_columns = word_transformer.named_steps['variance_filter'].get_support(indices = True)

vectorized_X_train = pd.DataFrame(
    word_transformer.fit_transform(X_train).toarray(),
    columns = [all_columns[i] for i in filtered_columns]
)

In [60]:
vectorized_X_train.head()
vectorized_X_train.shape

Unnamed: 0,act,ah,air,alive,alright,angel,animal,answer,anybody,arm,...,world,worry,wrong,ya,yeah,year,yes,yesterday,yo,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.052665,0.0,0.0,0.0,0.0,0.088649,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.056204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.260352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(2636, 397)

### quick model selection. Choosing a handful of classificaiton algorithms with default params to shortlist the promising ones

In [61]:
def model_selection(X_train, y_train, list_of_models:list):
    '''
    this functions takes in a list of classification algorithms with default hyperparam settings
    so we can find the ones that are the most promising
    '''
    
    roc_auc_list = []
    
    importances = []
    
    for model in list_of_models:
        model.fit(X_train, y_train)
        
        feature_importance_dict = {}
        
        try:
            for feature, importance in zip(X_train.columns, model.feature_importances_):
                feature_importance_dict[feature] = importance
        except:
            pass
            
        importance_df = pd.DataFrame.from_dict(feature_importance_dict, orient = 'index')
        
        importances.append(importance_df)
        
        quick_roc_auc = cross_val_score(
            model,
            X_train,
            y_train,
            scoring = 'roc_auc',
            cv = StratifiedKFold(10)
        )
        
        roc_auc_list.append(np.mean(quick_roc_auc))
    
    return roc_auc_list, importances

In [62]:
list_of_models = [
    MultinomialNB(),
    LogisticRegression(random_state = 42),
    SVC(random_state = 42),
    RandomForestClassifier(random_state = 42),
    KNeighborsClassifier(),
    XGBClassifier(random_state = 42)
]

In [63]:
model_selection_roc_aucs, importances = model_selection(vectorized_X_train, y_train, list_of_models)

### evaluating promising models

In [64]:
model_selection_roc_aucs

[0.555458191070225,
 0.5602956414033106,
 0.5624070205440737,
 0.5780081443356673,
 0.5256359218262046,
 0.5659890646616331]

In [65]:
pd.set_option('display.max_rows', None)

random_forest_top_features = importances[3][0].sort_values(ascending = False)[:50]
xgb_top_features = importances[5][0].sort_values(ascending = False)[:50]

### random forest and xgboost are coming out ahead, looking at feature importances

In [None]:
X_train.head()

In [66]:
random_forest_top_features[:20]
xgb_top_features[:20]

know     0.018697
come     0.012762
love     0.012172
want     0.011419
yeah     0.011078
like     0.010559
right    0.010087
oh       0.009537
time     0.009240
feel     0.008529
let      0.008474
look     0.008187
good     0.008077
think    0.008065
hear     0.007639
tell     0.007439
say      0.007327
away     0.007022
leave    0.006989
way      0.006974
Name: 0, dtype: float64

town      0.016324
money     0.012873
yes       0.012717
hear      0.012581
right     0.011584
arm       0.011488
damn      0.011459
kick      0.011148
big       0.010778
hate      0.010303
scream    0.010198
room      0.010052
run       0.010035
walk      0.009701
whoa      0.009527
hey       0.009510
rock      0.009500
music     0.009449
away      0.009367
soon      0.009252
Name: 0, dtype: float64

In [67]:
top_feature_overlap = set(random_forest_top_features.index) & set(xgb_top_features.index)

top_feature_overlap

len(top_feature_overlap)/50

{'away',
 'boy',
 'come',
 'feeling',
 'good',
 'hear',
 'hey',
 'home',
 'leave',
 'let',
 'make',
 'man',
 'right',
 'say',
 'true',
 'walk',
 'yeah'}

0.34

#### there is a 32% overlap in the top 50 features for random forest and xgboost

In [68]:
vectorized_X_train.to_csv('vectorized_train.csv')

### taking the top 2 performing models (random forest and xgboost) and optimizing hyperparameters (randomizedsearch) for them

In [69]:
def find_best_model(model_hyperparam_dict, X_train, y_train):
    '''
    this function finds the best model.
    takes in a dictionary with multiple algorithms and hyperparameters
    returns the best model based on roc_auc cross val scores
    also leaves a file with best params
    '''
    
    keys = [k for k, v in model_hyperparam_dict.items()]
    
    cv_results = []
    best_scores = []
    best_estimators = []
    classification_reports = []
    
    for key in keys:
        
        model = model_hyperparam_dict[key]['model']
        hyperparams = model_hyperparam_dict[key]['hyperparams']
        
        print(f'randomsearching {key}...')
        print(f'using hyperparams: \n{hyperparams}')
        
        grid = RandomizedSearchCV(
            estimator = model,
            param_distributions = hyperparams,
            n_iter = 100,
            scoring = 'roc_auc',
            cv = StratifiedKFold(10),
            verbose = 1
        )
        
        grid.fit(X_train, y_train)
        
        cv_results.append(grid.cv_results_)
        best_scores.append(grid.best_score_)
        best_estimators.append(grid.best_estimator_)
        
    return cv_results, best_scores, best_estimators

In [70]:
random_forest_hyperparams = {
    'bootstrap':[True, False],
    'max_depth':[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features':['auto', 'sqrt'],
    'min_samples_leaf':[1, 2, 4],
    'min_samples_split':[2, 5, 10],
    'n_estimators':[100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

xgb_hyperparams = {
    'max_depth':range(1, 15),
    'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators':[100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'gamma':[0, .01, .1, 1],
    'min_child_weight':[1, 3, 5, 7],
    'colsample_bytree':[.3, .4, .5, .6]
}

In [71]:
model_hyperparam_dict = {
    'random_forest' : {
        'model': RandomForestClassifier(random_state = 42),
        'hyperparams' : random_forest_hyperparams
    },
    
    'xgboost' : {
        'model':XGBClassifier(random_state = 42),
        'hyperparams' : xgb_hyperparams
    }
    
}

In [72]:
cv_results, best_scores, best_estimators = find_best_model(model_hyperparam_dict, vectorized_X_train, y_train)

randomsearching random_forest...
using hyperparams: 
{'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


  raise TypeError("shuffle must be True or False;"
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed: 173.5min finished


randomsearching xgboost...
using hyperparams: 
{'max_depth': range(1, 15), 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3], 'n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'gamma': [0, 0.01, 0.1, 1], 'min_child_weight': [1, 3, 5, 7], 'colsample_bytree': [0.3, 0.4, 0.5, 0.6]}
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


  raise TypeError("shuffle must be True or False;"
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed: 382.1min finished


In [73]:
best_scores
best_estimators

[0.5920927138441018, 0.5845631744848526]

[RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=80, max_features='sqrt',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=2, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=600,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=0.3, gamma=0.1,
               learning_rate=0.01, max_delta_step=0, max_depth=12,
               min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
               nthread=None, objective='binary:logistic', random_state=42,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
               si

#### appears to perform better than chance, since the training dataset contains about 50% labeled as liked and unliked

### building a pipeline for both using the optimized hyperparams to evaluate on the test set, since our roc_auc crossval score is close

In [74]:
best_random_forest = best_estimators[0]

In [75]:
best_xgb = best_estimators[1]

In [76]:
second_pipeline_steps = [
    ('tfidf_vectorizer', tfidf_vectorizer), # term frequency document infrequency word vectorizer
    ('variance_filter', variance_filter), # removes low variance columns from dataset
    ('classifier', best_random_forest)
]

In [77]:
third_pipeline_steps = [
    ('tfidf_vectorizer', tfidf_vectorizer), # term frequency document infrequency word vectorizer
    ('variance_filter', variance_filter), # removes low variance columns from dataset
    ('classifier', best_xgb)
]

In [78]:
final_random_forest_model = Pipeline(second_pipeline_steps)

In [79]:
final_random_forest_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf_vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False...
                 RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=80, max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                 

In [80]:
final_xgb_model = Pipeline(third_pipeline_steps)
final_xgb_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf_vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.3, gamma=0.1,
                               learning_rate=0.01, max_delta_step=0,
                               max_depth=12, min_child_weight=1, m

In [81]:
random_forest_preds = final_random_forest_model.predict(X_test)
random_forest_probas = final_random_forest_model.predict_proba(X_test)

In [82]:
xgb_preds = final_xgb_model.predict(X_test)
xgb_probas = final_xgb_model.predict_proba(X_test)

### evaluating both models on test set

In [83]:
roc_auc_score(y_test, random_forest_preds)

0.5475722692641172

In [84]:
print(classification_report(y_test, random_forest_preds))

              precision    recall  f1-score   support

         0.0       0.56      0.59      0.58       342
         1.0       0.53      0.50      0.52       317

    accuracy                           0.55       659
   macro avg       0.55      0.55      0.55       659
weighted avg       0.55      0.55      0.55       659



In [89]:
pd.Series([l[1] for l in random_forest_probas]).value_counts(bins = 20).sort_index()

(0.0527, 0.0968]      2
(0.0968, 0.139]       4
(0.139, 0.181]        2
(0.181, 0.224]        3
(0.224, 0.266]        1
(0.266, 0.308]        9
(0.308, 0.35]        30
(0.35, 0.392]        42
(0.392, 0.435]       79
(0.435, 0.477]      119
(0.477, 0.519]      116
(0.519, 0.561]       66
(0.561, 0.604]       50
(0.604, 0.646]       28
(0.646, 0.688]       47
(0.688, 0.73]        28
(0.73, 0.773]        12
(0.773, 0.815]        9
(0.815, 0.857]        5
(0.857, 0.899]        7
dtype: int64

In [90]:
roc_auc_score(y_test, xgb_preds)

0.5701523788440608

In [91]:
print(classification_report(y_test, xgb_preds))

              precision    recall  f1-score   support

         0.0       0.59      0.54      0.57       342
         1.0       0.55      0.60      0.57       317

    accuracy                           0.57       659
   macro avg       0.57      0.57      0.57       659
weighted avg       0.57      0.57      0.57       659



In [92]:
pd.Series([l[1] for l in xgb_probas]).value_counts(bins = 20).sort_index()

(0.231, 0.255]     4
(0.255, 0.278]     0
(0.278, 0.3]       3
(0.3, 0.323]       3
(0.323, 0.346]     7
(0.346, 0.368]    10
(0.368, 0.391]    33
(0.391, 0.414]    31
(0.414, 0.437]    47
(0.437, 0.459]    57
(0.459, 0.482]    59
(0.482, 0.505]    71
(0.505, 0.527]    73
(0.527, 0.55]     63
(0.55, 0.573]     58
(0.573, 0.596]    88
(0.596, 0.618]    28
(0.618, 0.641]    13
(0.641, 0.664]     4
(0.664, 0.686]     7
dtype: int64

### looks like we're losing some % on the eval set for both models, but the xgboost is performing better than randomforest

### saving both models as backups

In [93]:
pickle.dump(final_xgb_model, open('xgb_model.sav', 'wb'))

In [94]:
tar = tarfile.open("xgb_model.tar.gz", "w:gz")
tar.add('xgb_model.sav')

In [95]:
pickle.dump(final_random_forest_model, open('random_forest.sav', 'wb'))

In [96]:
tar = tarfile.open("random_forest.tar.gz", "w:gz")
tar.add('random_forest.sav')