In [54]:
import numpy as np
import pandas as pd
import pandas_profiling as pp
import os
import re
import string
import spacy
import pickle
import tarfile
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

import nltk
from alphabet_detector import AlphabetDetector
from IPython.core.interactiveshell import InteractiveShell

# Echo every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Make sure the NLTK 'words' corpus is available, then keep it as a set for fast lookups.
nltk.download('words')
words = set(nltk.corpus.words.words())

# spaCy small English model.
nlp = spacy.load('en_core_web_sm')
# nlp = English()

# Helper for detecting which alphabet a string is written in.
ad = AlphabetDetector()

[nltk_data] Downloading package words to /Users/lucid75/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# Load the lyrics table from the working directory. Later cells rely on its
# 'liked' (label, may be NaN) and 'cleaned_lyrics' (model input text) columns.
final_df = pd.read_csv('cleaned_lemmatized_unstopped_df.csv')

In [5]:
# Label distribution, with missing labels counted as their own bucket.
final_df['liked'].value_counts(dropna=False)

NaN    38071
0.0     1656
1.0     1639
Name: liked, dtype: int64

In [6]:
# Exploratory overview of every column; `profile_report` is made available on
# DataFrames by the `pandas_profiling` import at the top of the notebook.
final_df.profile_report()



In [7]:
# Labelled songs only (rows where 'liked' is present); copied so later edits don't touch final_df.
non_null_df = final_df.dropna(subset=['liked']).copy()

In [8]:
# Sanity check: the row count should equal the number of non-NaN 'liked' labels.
non_null_df.shape

(3295, 5)

In [9]:
# Unlabelled songs (no 'liked' value) — these are the ones scored for recommendations later.
null_df = final_df.loc[final_df['liked'].isna()].copy()

In [10]:
# Model input is the cleaned lyric text; the target is the 'liked' label.
X = non_null_df['cleaned_lyrics']
y = non_null_df['liked']

In [11]:
# Class balance of the labelled songs (computed on all of y, before any split).
train_value_counts = y.value_counts()
train_value_counts

# Select each label's count by matching on the index value.
majority_count = train_value_counts.loc[train_value_counts.index == 0].iloc[0]
target_count = train_value_counts.loc[train_value_counts.index == 1].iloc[0]

# Share of label-0 songs among all labelled songs.
majority_ratio = majority_count / (majority_count + target_count)

0.0    1656
1.0    1639
Name: liked, dtype: int64

In [15]:
## Experimenting with spaCy word vectors (disabled)

# import en_core_web_sm
# nlp = en_core_web_sm.load()
# mango = nlp(u'mango')
# print(mango.vector.shape)
# print(mango.vector)

# human = nlp('dog')
# human.vector.shape

In [16]:
# Hold out 20% of the labelled songs as the final test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [17]:
# Carve a validation set out of the current training data (25% of what is left).
# NOTE(review): this re-binds X_train / y_train, so the cell is not idempotent —
# running it a second time without re-running the previous split shrinks the
# training set again.
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size = .25, random_state = 42)

In [18]:
# count_vect_no_stops = CountVectorizer(stop_words = 'english')

In [19]:
# X_train_ns_counts = count_vect_no_stops.fit_transform(X_train)
# X_train_ns_counts.shape

In [35]:
# term_importance = pd.DataFrame(data = {
#     'feature_name':count_vect_no_stops.get_feature_names(),
#     'tfidf':tfidf_transformer.idf_
# }).sort_values(by = 'tfidf', ascending = False)

In [21]:
# Text-feature pipeline: a single TF-IDF step with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Name of the text column fed to the pipeline.
word_features = ['cleaned_lyrics']

steps = [
    # ('count_vect', count_vect_no_stops),
    ('tfidf_vectorizer', tfidf_vectorizer),
]
word_transformer = Pipeline(steps=steps)

# TODO: gridsearch params/pipeline

In [22]:
# Fit the TF-IDF vocabulary on the training lyrics only and display the
# resulting sparse matrix (rows = songs, columns = vocabulary terms).
word_transformer.fit_transform(X_train)

<1977x8671 sparse matrix of type '<class 'numpy.float64'>'
	with 93631 stored elements in Compressed Sparse Row format>

In [23]:
# Dense, labelled view of the training TF-IDF matrix (one column per vocabulary term).
# fit_transform is kept so this cell does not depend on the previous one having run.
_train_tfidf = word_transformer.fit_transform(X_train)

# scikit-learn >= 1.0 exposes get_feature_names_out (get_feature_names was removed
# in 1.2); fall back to the old name on older installs.
_tfidf_step = word_transformer.named_steps['tfidf_vectorizer']
_feature_names = (
    _tfidf_step.get_feature_names_out()
    if hasattr(_tfidf_step, 'get_feature_names_out')
    else _tfidf_step.get_feature_names()
)

# Pass the names as a flat sequence: wrapping them in an extra list (as before)
# made pandas build a one-level MultiIndex instead of plain column labels.
vectorized_X_train = pd.DataFrame(
    _train_tfidf.toarray(),
    columns=list(_feature_names),
)

In [24]:
# Peek at the first rows of the dense TF-IDF frame (mostly zeros: each song uses few terms).
vectorized_X_train.head()

Unnamed: 0,aa,abandon,abandonment,abigail,ability,able,ably,abnormal,aboard,abolish,...,zig,zigzag,zip,zipper,zircon,zodiac,zombie,zone,zoo,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Project the validation lyrics onto the vocabulary learned from the training
# set (transform only — no refit).
vectorized_X_val = word_transformer.transform(X_validate)

In [37]:
# Mean 10-fold accuracy of a default MultinomialNB.
# NOTE(review): folds are drawn from the validation split only, so the training
# split is not used for this estimate — confirm this is intended.
np.mean(
    cross_val_score(
        MultinomialNB(),
        vectorized_X_val,
        y_validate,
        scoring='accuracy',
        cv=10,
    )
)

0.5158518595832028

In [38]:
# Mean 10-fold accuracy of a default random forest.
# random_state is pinned because the forest is randomised; without it the score
# changes on every run and cannot be compared with the other models.
# NOTE(review): folds are drawn from the validation split only — confirm this is intended.
cross_val_score(
    RandomForestClassifier(random_state=42),
    vectorized_X_val,
    y_validate,
    cv=10,
    scoring='accuracy',
).mean()



0.5132101729116654

In [39]:
# Mean 10-fold accuracy of a linear model trained with SGD (modified Huber loss).
# random_state is pinned because SGD shuffles the data each epoch; without it the
# score is not reproducible.
# NOTE(review): folds are drawn from the validation split only — confirm this is intended.
cross_val_score(
    SGDClassifier(loss='modified_huber', random_state=42),
    vectorized_X_val,
    y_validate,
    cv=10,
    scoring='accuracy',
).mean()

0.5142667779981214

In [40]:
# Vectorise the held-out test lyrics with the vocabulary fitted on the training set.
# (Previously this transformed X_train, so the "test" matrix was just the training data.)
vectorized_X_test = word_transformer.transform(X_test)

In [41]:
# Final classifier: multinomial naive Bayes with default hyper-parameters.
nb = MultinomialNB()

In [42]:
# Train on the dense TF-IDF training frame and the matching 'liked' labels.
nb.fit(vectorized_X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# sgd = SGDClassifier(loss = 'modified_huber')

In [31]:
# sgd.fit(vectorized_X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [63]:
# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed (the bare open() call left the handle dangling).
# NOTE: only `nb` is saved — the fitted TF-IDF pipeline (`word_transformer`)
# is not part of this file.
with open('nlp_model.sav', 'wb') as model_file:
    pickle.dump(nb, model_file)

In [64]:
# Bundle the pickled model into a gzip-compressed tarball. The archive must be
# closed for the tar trailer and gzip stream to be written out; the `with`
# block does that even if add() raises.
with tarfile.open("nlp_model.tar.gz", "w:gz") as tar:
    tar.add('nlp_model.sav')


In [43]:
# TF-IDF features for the unlabelled songs, using the vocabulary fitted on the training set.
vectorized_recommendations = word_transformer.transform(null_df['cleaned_lyrics'])

In [44]:
# Number of unlabelled songs (rows) and columns currently on the frame.
null_df.shape

(38071, 6)

In [50]:
# Keep the second probability column from predict_proba as the recommendation score.
null_df['recommendation'] = nb.predict_proba(vectorized_recommendations)[:, 1]

In [51]:
# Write the scored, unlabelled songs to disk (row index omitted).
null_df.to_csv('first_recommendations.csv', index = False)

In [55]:
# largest = 0
# word = None

# for k, v in count_vect_no_stops.vocabulary_.items():
#     if v > largest:
#         largest = v
#         word = k
#     else:
#         continue

In [57]:
# largest
# word

In [60]:
# def word_counts(vocab_dict):

#     words = []
#     counts = []

#     for k, v in vocab_dict.items():
#         words.append(k)
#         counts.append(v)
        
#     return pd.DataFrame(data = {
#         'words':words,
#         'counts':counts
#     }).sort_values(by = 'counts',ascending = False)

In [61]:
# word_freq = word_counts(count_vect_no_stops.vocabulary_)

NameError: name 'count_vect_no_stops' is not defined

In [62]:
# word_freq.head(30)

In [136]:
# Size of the CountVectorizer vocabulary.
# NOTE(review): `count_vect_no_stops` is only defined in a commented-out cell
# above, so this raises NameError on Restart & Run All — restore that
# definition or drop this cell.
len(count_vect_no_stops.vocabulary_)

9690

In [137]:
# Preview the unlabelled songs, including any score columns added so far.
null_df.head()

Unnamed: 0,lyrics,song_title,artist_name,liked,cleaned_lyrics,playlist
3,\n\n[Verse 1]\nI can't remember\nThe words wer...,If You Want It,Jay Som,,remember word form mouth have find bring joy p...,1.0
4,\n\n[Verse 1]\nI'm not that kind of fool\nWho ...,Superbike,Jay Som,,kind fool need read room somebody tell fall li...,1.0
5,\n\n[Verse 1]\nPoint me to my chair\nMake me s...,Peace Out,Jay Som,,point chair sing awful song bear go hard hard ...,1.0
6,\n\n[Verse 1]\nUsed to be the one to cry\nAnd ...,Devotion,Jay Som,,cry feel emotion need path find strange devoti...,1.0
7,\n\n[Verse 1]\nI'm sinking in my bed\nWe’re le...,Nighttime Drive,Jay Som,,sink bed be leave town tomorrow memory feel nu...,0.0


In [141]:
# Score the unlabelled songs: raw counts -> TF-IDF weights -> class probabilities.
# NOTE(review): `count_vect_no_stops`, `tfidf_transformer` and `sgd_ns` are not
# defined anywhere in the visible notebook (their cells are commented out or
# missing), so this only runs with leftover kernel state — confirm where they
# come from before relying on the result.
null_df_count_vect = count_vect_no_stops.transform(null_df['cleaned_lyrics'])
null_df_tfidf = tfidf_transformer.transform(null_df_count_vect)
music_to_listen = sgd_ns.predict_proba(null_df_tfidf)

In [142]:
# predict_proba yields one column per class, which cannot be stored in a single
# DataFrame column. Keep the second column so that 'playlist' is one score per
# song, matching how 'recommendation' is built. An already one-dimensional
# input is stored unchanged.
playlist_scores = np.asarray(music_to_listen)
if playlist_scores.ndim == 2:
    playlist_scores = playlist_scores[:, 1]
null_df['playlist'] = playlist_scores

In [147]:
# How many unlabelled songs score above / below the 0.5 cut-off
# (a score of exactly 0.5 is counted in neither group).
int((null_df['playlist'] > .5).sum())
int((null_df['playlist'] < .5).sum())

21744

16327