### Imports

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [5]:
import pickle

In [6]:
np.random.seed(42)

### Importing Data

In this notebook, I run through all of the datasets.

In [7]:
fiction_df = pd.read_csv('./data/fiction_sample.csv')

In [8]:
jvf_df = pd.read_csv('./data/jvf_sample.csv')

In [9]:
bio_df = pd.read_csv('./data/bio_sample.csv')

In [10]:
overall_df = pd.read_csv('./data/overall_sample.csv')

In [11]:
review_df = pd.read_csv('./data/review_sample.csv')

### Functions

* **best_params(pipeline, params, X_train, y_train)**: Takes a pipeline, a parameter grid, and the training data (X_train, y_train), runs a GridSearchCV for hyperparameter tuning, and reports the best cross-validated score and the parameters that produced it.
* **return_gs(pipeline, params, X_train, y_train)**: Returns a GridSearchCV object (not yet fitted) for a given pipeline and parameter grid.
* **scores(gs, X_train, y_train, X_test, y_test)**: Using the returned grid search, the function fits the model on the training data and reports its Train and Test scores (accuracy for these classifiers). The train-test split itself is done beforehand.
* **predictions(pipeline, X_train, X_test, y_train)**: Fits the pipeline on the training data and returns its predictions for X_test.
* **classification_scores(model, y_test, y_pred)**: Using the predictions, it returns the weighted recall, precision, and F1 scores plus the accuracy score as a one-row DataFrame labelled with the model name.

Note: Functions are reused from [my previous Subreddit project](https://git.generalassemb.ly/lisaliang/project-3.git)

In [12]:
def best_params(pipeline, params, X_train, y_train):
    """Grid-search a pipeline and report the best score and parameters.

    Parameters
    ----------
    pipeline : estimator
        The pipeline (or any estimator) handed to GridSearchCV.
    params : dict
        Parameter grid passed to GridSearchCV as ``param_grid``.
    X_train, y_train :
        Training data the grid search is fitted on.

    Returns
    -------
    str
        A message of the form ``'Best Score: <score>, Params: <params>'``
        built from the fitted search's ``best_score_`` and ``best_params_``.
    """
    # Fix: the original passed the misspelled name `pipeliane`, which raised
    # a NameError on every call instead of searching over `pipeline`.
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)

    gs.fit(X_train, y_train)
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [13]:
def return_gs(pipeline, params, X_train, y_train):
    """Build a GridSearchCV for `pipeline` over the grid `params`.

    The search is returned without being fitted. `X_train` and `y_train`
    are accepted but not used here.
    """
    search_options = {'param_grid': params, 'n_jobs': -1}
    return GridSearchCV(pipeline, **search_options)

In [14]:
def scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

In [15]:
def predictions(pipeline, X_train, X_test, y_train):
    """Fit `pipeline` on the training data and predict labels for `X_test`.

    The pipeline object passed in is fitted in place; the return value is
    whatever ``pipeline.predict(X_test)`` produces.
    """
    pipeline.fit(X_train, y_train)
    return pipeline.predict(X_test)

In [16]:
def classification_scores(model, y_test, y_pred):
    """Summarise classification metrics for one model as a one-row DataFrame.

    `model` is used as the row label. The columns are Recall, Precision and
    F1 (each computed with ``average='weighted'``) followed by Accuracy.
    """
    metric_columns = ['Recall', 'Precision', 'F1', 'Accuracy']
    summary = pd.DataFrame(columns = metric_columns)

    # Same order as metric_columns: recall, precision, F1, then accuracy.
    weighted = dict(average = 'weighted')
    row = [
        recall_score(y_test, y_pred, **weighted),
        precision_score(y_test, y_pred, **weighted),
        f1_score(y_test, y_pred, **weighted),
        accuracy_score(y_test, y_pred),
    ]

    summary.loc[model] = row
    return summary

* **my_lemmatizer(text)**: This function splits the input text on whitespace and lemmatizes each word to its dictionary form. Words that contain an apostrophe and tokens made up only of digits are filtered out so that the remaining words are lemmatized as accurately as possible.

Additional: We created a list of English stopwords, contractions, and numbers for the model to remove while it's iterating through the text. These attributes were seen as not adding significance in helping the model distinguish book titles.

In [17]:
def my_lemmatizer(text):
    """Split `text` on whitespace and return the lemma of each kept word.

    Words containing an apostrophe and words made up only of digits are
    skipped; every other word is passed through WordNetLemmatizer.lemmatize.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in text.split():
        # exclude words with apostrophes and numbers
        if "'" in word or word.isdigit():
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [18]:
# Build the stop-word list that the pipelines pass to TfidfVectorizer as
# stop_words, alongside my_lemmatizer as the tokenizer. Every entry is
# lemmatized with the same WordNetLemmatizer.
wnet = WordNetLemmatizer()
lem_stopwords = [wnet.lemmatize(w) for w in stopwords.words('english')]

# Fragments left over from contractions.
# Fix: the original entry "'s'" contained literal quote characters, so it
# could never match a token (my_lemmatizer drops every word containing an
# apostrophe). The intended fragment is the bare letter 's'.
contractions = ['ve', 't', 's', 'd', 'll', 'm', 're']
lem_contractions = [wnet.lemmatize(contraction) for contraction in contractions]

# Single digits, kept as strings because tokens are strings.
numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
lem_numbers = [wnet.lemmatize(num) for num in numbers]

lem_stopwords = lem_stopwords + lem_contractions + lem_numbers

### Creating Models

To move forward with the project, I decided to go with a Multinomial Naive Bayes model. The code below creates and pickles a model for each of the datasets to be used in a Streamlit app. Ultimately, if a Random Forest Classifier model did run and its score was better than the MNB one, I will use that pickled model. Which models could be run comes down to the memory usage and capacity of my computer.

## Genre Datasets

### Fiction (MNB)

In [19]:
# Baseline Accuracy
fiction_df['Title'].value_counts(normalize = True)

Pride and Prejudice                                         0.03694
Brave New World                                             0.01184
Great Expectations                                          0.01102
To kill a mockingbird                                       0.00634
Alice's Adventures in Wonderland                            0.00580
                                                             ...   
Chocolate Dipped Death (A Candy Shop Mystery)               0.00002
Predator: Concrete Jungle                                   0.00002
The Gates of Damascus                                       0.00002
His Love Saved Her                                          0.00002
Miss Billings Treads the Boards (Signet Regency Romance)    0.00002
Name: Title, Length: 6722, dtype: float64

In [16]:
X_fiction = fiction_df['description']
y_fiction = fiction_df['Title']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_fiction, y_fiction, random_state=42)

In [18]:
# TF-IDF features feeding a Multinomial Naive Bayes classifier for the
# fiction dataset.
fiction_pipe = Pipeline([
    # my_lemmatizer replaces the default tokenizer; token_pattern is set to
    # None because it is not used when a custom tokenizer is supplied.
    # The vocabulary is capped at 500 terms.
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 500)),
    # alpha = 0.5 applies when this pipeline is fitted directly; the grid in
    # fiction_params tries other alpha values during the grid search.
    ('mnb', MultinomialNB(alpha = 0.5))
])

In [19]:
fiction_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
    'mnb__alpha': [0.05, 0.1],
    'mnb__fit_prior': [False]
}

In [20]:
fiction_gs = return_gs(fiction_pipe, fiction_params, X_train, y_train)

In [21]:
scores(fiction_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.8729866666666667, Test Score: 0.81104'

In [22]:
fiction_pred = predictions(fiction_pipe, X_train, X_test, y_train)

In [23]:
classification_scores('Fiction', y_test, fiction_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
Fiction,0.67144,0.515462,0.57021,0.67144


In [38]:
# Persist fiction_pipe for later use. This is the pipeline that predictions()
# fitted in place on the training split, with its constructor settings;
# it is not the tuned estimator held by fiction_gs.
with open('fiction_pipe.pkl', 'wb') as f:
    pickle.dump(fiction_pipe, f)

### Juvenile Fiction (Random Forest)

In [20]:
# Baseline Accuracy
jvf_df['Title'].value_counts(normalize = True)

The Hobbit                               0.15596
The Giver                                0.03244
Harry Potter and The Sorcerer's Stone    0.03112
The Hobbit or There and Back Again       0.03058
Night                                    0.02210
                                          ...   
The Girl Who Wanted a Boy                0.00002
New Kind of Dreaming                     0.00002
The 13 Nights of Halloween               0.00002
Find Waldo Now                           0.00002
Chili-Chili-Chin-Chin                    0.00002
Name: Title, Length: 2874, dtype: float64

In [36]:
X_jvf = jvf_df['description']
y_jvf = jvf_df['Title']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_jvf, y_jvf, random_state=42)

In [38]:
# TF-IDF features feeding a Random Forest classifier for the juvenile
# fiction dataset.
jvf_pipe = Pipeline([
    # my_lemmatizer replaces the default tokenizer; token_pattern is set to
    # None because it is not used when a custom tokenizer is supplied.
    # The vocabulary is capped at 1,000 terms.
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    # Here max_features is the number of features considered at each split,
    # set equal to the vectorizer's vocabulary cap.
    # NOTE(review): when min_df shrinks the vocabulary below 1,000 terms,
    # confirm how the installed scikit-learn treats max_features > n_features.
    ('rfc', RandomForestClassifier(max_features = 1_000))
])

In [39]:
jvf_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)]
}

In [40]:
jvf_gs = return_gs(jvf_pipe, jvf_params, X_train, y_train)

In [41]:
scores(jvf_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.7850666666666667, Test Score: 0.75784'

In [42]:
jvf_pred = predictions(jvf_pipe, X_train, X_test, y_train)

In [43]:
classification_scores('Juvenile Fiction', y_test, jvf_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
Juvenile Fiction,0.84768,0.774386,0.801599,0.84768


In [44]:
# Persist jvf_pipe for later use. This is the pipeline that predictions()
# fitted in place on the training split, with its constructor settings;
# it is not the tuned estimator held by jvf_gs.
with open('jvf_pipe.pkl', 'wb') as f:
    pickle.dump(jvf_pipe, f)

### Biography & Autobiography (MNB)

In [21]:
# Baseline Accuracy
bio_df['Title'].value_counts(normalize = True)

It's Not About the Bike: My Journey Back to Life                             0.02324
The Princess Bride                                                           0.01960
John Adams                                                                   0.01570
Confessions of an Economic Hitman                                            0.01496
All Creatures Great and Small                                                0.01106
                                                                              ...   
THE BOY IN THE GREEN SUIT                                                    0.00002
The God I Love: A Lifetime of Walking with Jesus                             0.00002
MARY QUEEN OF SCOTS                                                          0.00002
Ronald Reagan: Our Fortieth President (Spirit of America: Our Presidents)    0.00002
Bruce Lee: The Incomparable Fighter                                          0.00002
Name: Title, Length: 2309, dtype: float64

In [17]:
X_bio = bio_df['description']
y_bio = bio_df['Title']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_bio, y_bio, random_state=42)

In [20]:
# TF-IDF features feeding a Multinomial Naive Bayes classifier for the
# biography & autobiography dataset.
bio_pipe = Pipeline([
    # my_lemmatizer replaces the default tokenizer; token_pattern is set to
    # None because it is not used when a custom tokenizer is supplied.
    # The vocabulary is capped at 1,000 terms.
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    # alpha = 0.5 applies when this pipeline is fitted directly; the grid in
    # bio_params tries other alpha values during the grid search.
    ('mnb', MultinomialNB(alpha = 0.5))
])

In [21]:
bio_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
    'mnb__alpha': [0.05, 0.1],
    'mnb__fit_prior': [False]
}

In [22]:
bio_gs = return_gs(bio_pipe, bio_params, X_train, y_train)

In [23]:
scores(bio_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.8974933333333334, Test Score: 0.8844'

In [24]:
bio_pred = predictions(bio_pipe, X_train, X_test, y_train)

In [25]:
classification_scores('Biography & Autobiography', y_test, bio_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
Biography & Autobiography,0.83808,0.740321,0.779426,0.83808


In [26]:
# Persist bio_pipe for later use. This is the pipeline that predictions()
# fitted in place on the training split, with its constructor settings;
# it is not the tuned estimator held by bio_gs.
with open('bio_pipe.pkl', 'wb') as f:
    pickle.dump(bio_pipe, f)

### Overall, Classified by Authors (MNB)

For this dataset, we are classifying by authors instead of titles.

In [23]:
# Baseline Accuracy
overall_df['authors'].value_counts(normalize = True)

J. R. R. Tolkien            0.02234
Jane Austen                 0.02020
Kurt Vonnegut               0.00886
C. S. Lewis                 0.00870
Charles Dickens             0.00848
                             ...   
William F. Buckley (Jr.)    0.00002
Ernest W. Maglischo         0.00002
Randall                     0.00002
James D. Mauseth            0.00002
Tony Miano                  0.00002
Name: authors, Length: 10441, dtype: float64

In [27]:
X_overall = overall_df['description']
y_authors = overall_df['authors']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_overall, y_authors, random_state=42)

In [31]:
# TF-IDF features feeding a Multinomial Naive Bayes classifier that predicts
# authors from descriptions in the overall dataset.
overall_pipe = Pipeline([
    # my_lemmatizer replaces the default tokenizer; token_pattern is set to
    # None because it is not used when a custom tokenizer is supplied.
    # The vocabulary is capped at 1,000 terms.
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    # alpha = 0.5 applies when this pipeline is fitted directly; the grid in
    # overall_params tries other alpha values during the grid search.
    ('mnb', MultinomialNB(alpha = 0.5))
])

In [32]:
overall_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
    'mnb__alpha': [0.05, 0.1],
    'mnb__fit_prior': [False]
}

In [None]:
best_params(overall_pipe, overall_params, X_train, y_train)

In [33]:
overall_gs = return_gs(overall_pipe, overall_params, X_train, y_train)

In [34]:
scores(overall_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.7266133333333333, Test Score: 0.60736'

In [35]:
overall_pred = predictions(overall_pipe, X_train, X_test, y_train)

In [36]:
classification_scores('Authors, by Description', y_test, overall_pred)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
"Authors, by Description",0.56216,0.44,0.467249,0.56216


In [37]:
# Persist the author-classification overall_pipe for later use. This is the
# pipeline that predictions() fitted in place on the training split, with its
# constructor settings; it is not the tuned estimator held by overall_gs.
with open('overall_pipe.pkl', 'wb') as f:
    pickle.dump(overall_pipe, f)

### Overall Dataset, Classified by Title (MNB)

In [24]:
# Baseline Accuracy
overall_df['Title'].value_counts(normalize = True)

Pride and Prejudice                                                                                                                                                0.01592
The Hobbit                                                                                                                                                         0.01508
Great Expectations                                                                                                                                                 0.00472
Brave New World                                                                                                                                                    0.00454
Mere Christianity                                                                                                                                                  0.00444
                                                                                                                                                 

In [38]:
X_overall = overall_df['description']
y_title = overall_df['Title']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_overall, y_title, random_state=42)

In [40]:
# TF-IDF features feeding a Multinomial Naive Bayes classifier that predicts
# titles from descriptions in the overall dataset. This rebinds the name
# overall_pipe, which the authors section used earlier.
overall_pipe = Pipeline([
    # my_lemmatizer replaces the default tokenizer; token_pattern is set to
    # None because it is not used when a custom tokenizer is supplied.
    # The vocabulary is capped at 1,000 terms.
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    # alpha = 0.5 applies when this pipeline is fitted directly; the grid in
    # overall_params tries other alpha values during the grid search.
    ('mnb', MultinomialNB(alpha = 0.5))
])

In [41]:
overall_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
    'mnb__alpha': [0.05, 0.1],
    'mnb__fit_prior': [False]
}

In [None]:
overall_gs = return_gs(overall_pipe, overall_params, X_train, y_train)

In [42]:
scores(overall_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.7699466666666667, Test Score: 0.63784'

In [43]:
overall_pred = predictions(overall_pipe, X_train, X_test, y_train)

In [44]:
# Fix: this section classifies titles, so label the row accordingly. The
# original reused the 'Authors, by Description' label from the authors section.
classification_scores('Titles, by Description', y_test, overall_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
"Authors, by Description",0.48008,0.328248,0.368829,0.48008


In [45]:
# Persist the title-classification overall_pipe for later use. This is the
# pipeline that predictions() fitted in place on the training split, with its
# constructor settings; it is not the tuned estimator held by overall_gs.
with open('overall_title.pkl', 'wb') as f:
    pickle.dump(overall_pipe, f)

### Most Reviewed Datasets, Classified by Authors (Random Forest)

In [26]:
# Baseline Accuracy
review_df['authors'].value_counts(normalize = True)

J. R. R. Tolkien         0.24494
Aldous Huxley            0.07648
Harper Lee               0.05576
Malcolm Gladwell         0.04654
William Golding          0.04572
                          ...   
Ransom Riggs             0.00002
Rebecca Skloot           0.00002
Michel Faber             0.00002
Christopher McDougall    0.00002
Ray Bradbury             0.00002
Name: authors, Length: 95, dtype: float64

In [58]:
X_reviews = review_df['description']
y_authors = review_df['authors']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X_reviews, y_authors, random_state=42)

In [None]:
# TF-IDF features feeding a Random Forest classifier that predicts authors
# for the most-reviewed dataset.
review_pipe = Pipeline([
    # my_lemmatizer replaces the default tokenizer; token_pattern is set to
    # None because it is not used when a custom tokenizer is supplied.
    # The vocabulary is capped at 1,000 terms.
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    # Here max_features is the number of features considered at each split,
    # set equal to the vectorizer's vocabulary cap.
    # NOTE(review): when min_df shrinks the vocabulary below 1,000 terms,
    # confirm how the installed scikit-learn treats max_features > n_features.
    ('rfc', RandomForestClassifier(max_features = 1_000))
])

In [None]:
review_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
}

In [62]:
review_gs = return_gs(review_pipe, review_params, X_train, y_train)

In [63]:
scores(review_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.9822933333333334, Test Score: 0.98208'

In [64]:
review_pred = predictions(review_pipe, X_train, X_test, y_train)

In [65]:
classification_scores('Authors, by Reviews', y_test, review_pred)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
"Authors, by Reviews",0.99576,0.991808,0.993752,0.99576


In [66]:
# Persist review_pipe for later use. This is the pipeline that predictions()
# fitted in place on the training split, with its constructor settings;
# it is not the tuned estimator held by review_gs.
with open('review_desc.pkl', 'wb') as f:
    pickle.dump(review_pipe, f)