In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

In [2]:
fiction_df = pd.read_csv('./data/fiction_sample.csv')

### Functions

In [3]:
def best_params(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)

    gs.fit(X_train, y_train)
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [4]:
def return_gs(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)
    return gs

In [5]:
def scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

In [6]:
def predictions(pipeline, X_train, X_test, y_train):
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    
    return prediction

In [7]:
def classification_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['Recall', 'Precision', 'F1', 'Accuracy'])
    
    recall = recall_score(y_test, y_pred, average = 'weighted')
    precision = precision_score(y_test, y_pred, average = 'weighted')
    f1 = f1_score(y_test, y_pred, average = 'weighted')
    accuracy = accuracy_score(y_test, y_pred)
    
    dataframe.loc[model] = [recall, precision, f1, accuracy]
    
    return dataframe

## Fiction Genre

### Baseline Accuracy 

In [8]:
fiction_df['Title'].value_counts(normalize = True)

Pride and Prejudice                                         0.03694
Brave New World                                             0.01184
Great Expectations                                          0.01102
To kill a mockingbird                                       0.00634
Alice's Adventures in Wonderland                            0.00580
                                                             ...   
Chocolate Dipped Death (A Candy Shop Mystery)               0.00002
Predator: Concrete Jungle                                   0.00002
The Gates of Damascus                                       0.00002
His Love Saved Her                                          0.00002
Miss Billings Treads the Boards (Signet Regency Romance)    0.00002
Name: Title, Length: 6722, dtype: float64

In [9]:
X = fiction_df['description']
y = fiction_df['Title']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
def my_lemmatizer(text):
    wnet = WordNetLemmatizer()
    # exclude words with apostrophes and numbers
    return [wnet.lemmatize(w) for w in text.split() if "'" not in w and not w.isdigit()]

In [12]:
wnet = WordNetLemmatizer()
lem_stopwords = [wnet.lemmatize(w) for w in stopwords.words('english')]

contractions = ['ve', 't', "'s'", 'd', 'll', 'm', 're']
lem_contractions = [wnet.lemmatize(contraction) for contraction in contractions]

numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
lem_numbers = [wnet.lemmatize(num) for num in numbers]

lem_stopwords = lem_stopwords + lem_contractions + lem_numbers

### MultinomialNB (and hyptertuning)

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
mnb_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 5_000)),
    ('mnb', MultinomialNB(alpha = 0.5))
])

In [15]:
mnb_params = {
    'tf__min_df': [0.1, 0.25, 0.5, 1.0],
    'tf__max_df': [0.25, 0.5, 0.8, 1.0],
    'tf__ngram_range': [(1,1), (2,2), (3,3)],
    'mnb__alpha': [0.1, 0.25, 0.5, 1],
    'mnb__fit_prior': [True, False]
}

In [22]:
best_params(mnb_pipe, mnb_params, X_train, y_train)

1320 fits failed out of a total of 1920.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
840 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py

"Best Score: 0.7210666666666666, Params: {'mnb__alpha': 0.1, 'mnb__fit_prior': False, 'tf__max_df': 0.5, 'tf__min_df': 0.1, 'tf__ngram_range': (1, 1)}"

Note: The scores ran a long warning, so below is the output that it got before being cleared.

"Best Score: 0.7210666666666666, Params: {'mnb__alpha': 0.1, 'mnb__fit_prior': False, 'tf__max_df': 0.5, 'tf__min_df': 0.1, 'tf__ngram_range': (1, 1)}"

In [23]:
mnb_gs = return_gs(mnb_pipe, mnb_params, X_train, y_train)

In [24]:
scores(mnb_gs, X_train, y_train, X_test, y_test)

1320 fits failed out of a total of 1920.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
840 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py

'Train Score: 0.7585866666666666, Test Score: 0.70728'

Note: The scores ran a long warning, so below is the output that it got before being cleared.

'Train Score: 0.7585866666666666, Test Score: 0.70728'

In [25]:
mnb_pred = predictions(mnb_pipe, X_train, X_test, y_train)

In [26]:
classification_scores('Multinomial Naive Bayes', y_test, mnb_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
Multinomial Naive Bayes,0.71232,0.580423,0.625477,0.71232


### Random Forest Classification (and hypertuning)

In [13]:
rfc_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    ('rfc', RandomForestClassifier(max_features = 1_000))
])

In [15]:
rfc_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
    'rfc__n_estimators': [100, 200, 300],
    'rfc__max_depth': [None, 5, 10, 20]
}

In [None]:
best_params(rfc_pipe, rfc_params, X_train, y_train)



In [None]:
rfc_gs = return_gs(rfc_pipe, rfc_params, X_train, y_train)

In [None]:
scores(rfc_gs, X_train, y_train, X_test, y_test)

In [None]:
rfc_pred = predictions(rfc_pipe, X_train, X_test, y_train)

In [None]:
classification_scores('Random Forest Classifier', y_test, rfc_pred)

### Logistic Regression

In [18]:
lr_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    ('lr', LogisticRegression(solver = 'saga'))
])

In [19]:
lr_params = {
    'tf__min_df': [0.05, 0.1, 0.25, 0.5],
    'tf__max_df': [0.25, 0.5, 0.8],
    'tf__ngram_range': [(1,1), (2,2), (3,3)],
    'lr__penalty': ['l1', 'l2', 'elasticnet', None],
    'lr__C': [0.05, 1.0, 10],
    'lr__class_weight': [None, 'balanced']
}

In [None]:
best_params(lr_pipe, lr_params, X_train, y_train)

In [None]:
lr_gs = return_gs(lr_pipe, lr_params, X_train, y_train)

In [None]:
scores(lr_gs, X_train, y_train, X_test, y_test)

In [None]:
lr_pred = predictions(lr_pipe, X_train, X_test, y_train)

In [None]:
classification_scores('Logistic Regression', y_test, lr_pred)

### Support Vector

In [None]:
# lol you can try this if your computer has capacity, but it crashed my computer EVERY SINGLE TIMe

In [35]:
sv_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    ('sv', SVC())
])

In [36]:
sv_params = {
    'tf__min_df': [0.05, 0.1, 0.25, 0.5],
    'tf__max_df': [0.25, 0.5, 0.8],
    'tf__ngram_range': [(1,1), (2,2), (3,3)],
    'sv__C': [0.5, 1, 10],
    'sv__kernel': ['linear', 'poly', 'rbf'],
    'sv__class_weight': [None, 'balanced']
}

In [None]:
best_params(sv_pipe, sv_params, X_train, y_train)



In [None]:
sv_gs = return_gs(sv_pipe, sv_params, X_train, y_train)

In [None]:
scores(sv_gs, X_train, y_train, X_test, y_test)

In [None]:
sv_pred = predictions(sv_pipe, X_train, X_test, y_train)

In [None]:
classification_scores('Support Vector', y_test, sv_pred)

### DataFrame to Review

In [30]:
table = classification_scores('Multinomial Naive Bayes', y_test, mnb_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
table = pd.concat([table, classification_scores('Random Forest Classifier', y_test, rfc_pred)])

In [None]:
table = pd.concat([table, classification_scores('Logistic Regression', y_test, lr_pred)])

In [None]:
table = pd.concat([table, classification_scores('Support Vector', y_test, sv_pred)])

In [31]:
table

Unnamed: 0,Recall,Precision,F1,Accuracy
Multinomial Naive Bayes,0.71232,0.580423,0.625477,0.71232


In [None]:
table