### Part 2 of the project: building and pickling the model

In [35]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [36]:
# Read the lyrics CSV and pull its "lyrics" column out as a plain Python list,
# which is the form the text vectorizer is fed further down.
lyrics_df = pd.read_csv("/data/lyrics.csv")
lyrics = lyrics_df.loc[:, "lyrics"].tolist()

In [None]:
# TF-IDF vectorizer kept under the name `vectorizer` because later cells refer to it.
# Pipeline does not copy its steps, so the pipeline below is given its own clone()
# of this object: re-fitting `vectorizer` elsewhere can then never change the
# vectorizer that lives inside `m`.
vectorizer = TfidfVectorizer()

# Labels are hard-coded: 100 "David Bowie" entries followed by 100 "Tina Turner".
# NOTE(review): this assumes the CSV holds exactly 200 songs in that order — confirm.
y = ["David Bowie"] * 100 + ["Tina Turner"] * 100
if len(lyrics) != len(y):
    # train_test_split would raise ValueError here as well; this message says why.
    raise ValueError(
        f"Expected {len(y)} lyrics to match the hard-coded labels, got {len(lyrics)}"
    )

# Hold out 20% of the songs for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(lyrics, y, test_size = 0.2, random_state = 314)

# TF-IDF features feeding a logistic-regression classifier.
m = Pipeline([
    ("vectorizer", clone(vectorizer)),
    ("model", LogisticRegression())
    ])
m.fit(X_train, y_train)

In [38]:
# Serialize the fitted logistic-regression pipeline to disk.
with open("artist_predictor.pkl", mode="wb") as model_file:
    pickle.dump(m, model_file)

## Doing the same but with naive Bayes

Unsmoothed: `MultinomialNB` with `alpha = 0`, i.e. no additive (Laplace) smoothing.

In [None]:
# Naive Bayes without smoothing: alpha = 0 requests no additive smoothing, and
# force_alpha = True keeps scikit-learn from replacing it with its small minimum alpha.
# The pipeline gets its own clone of `vectorizer` (same hyperparameters, unfitted)
# instead of the shared instance, because Pipeline fits its steps in place and
# would otherwise re-fit an object that other pipelines also hold.
mb = Pipeline([
    ("vectorizer", clone(vectorizer)),
    ("model", MultinomialNB(alpha = 0, force_alpha = True))
    ])
mb.fit(X_train, y_train)

In [40]:
# Serialize the fitted unsmoothed naive Bayes pipeline to disk.
with open("artist_predictor_bayes.pkl", mode="wb") as model_file:
    pickle.dump(mb, model_file)

Smoothed: `MultinomialNB` with its default additive (Laplace) smoothing, `alpha = 1.0`.

In [None]:
# Naive Bayes with scikit-learn's default smoothing (no alpha override).
# As with the other pipelines, a private clone of `vectorizer` is used so that
# fitting this pipeline cannot re-fit a vectorizer object held by another one.
mbs = Pipeline([
    ("vectorizer", clone(vectorizer)),
    ("model", MultinomialNB())
    ])
mbs.fit(X_train, y_train)

In [42]:
# Serialize the fitted smoothed naive Bayes pipeline to disk.
with open("artist_predictor_bayes_smooth.pkl", mode="wb") as model_file:
    pickle.dump(mbs, model_file)

In [45]:
# Mean accuracy of each pipeline on the training split and on the held-out split,
# stored as strings (train first, then test) for the report below.
lr_train, lr_test = str(m.score(X_train, y_train)), str(m.score(X_test, y_test))
nb_train, nb_test = str(mb.score(X_train, y_train)), str(mb.score(X_test, y_test))
nbs_train, nbs_test = str(mbs.score(X_train, y_train)), str(mbs.score(X_test, y_test))

# One (label, train accuracy, test accuracy) row per model, in print order.
accuracy_report = [
    ("Logistic regression", lr_train, lr_test),
    ("Naive Bayes", nb_train, nb_test),
    ("Naive Bayes (smoothed)", nbs_train, nbs_test),
]

for model_label, train_accuracy, test_accuracy in accuracy_report:
    print(f"{model_label}: trained data accuracy: {train_accuracy}")
    print(f"{model_label}: test data accuracy: {test_accuracy}")

Logistic regression: trained data accuracy: 0.9625
Logistic regression: test data accuracy: 0.675
Naive Bayes: trained data accuracy: 0.9875
Naive Bayes: test data accuracy: 0.725
Naive Bayes (smoothed): trained data accuracy: 0.8625
Naive Bayes (smoothed): test data accuracy: 0.4
