In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

In [3]:
import pickle

In [4]:
np.random.seed(42)

### Reading All Datasets

In [5]:
fiction_df = pd.read_csv('./data/fiction_sample.csv')

In [6]:
jvf_df = pd.read_csv('./data/jvf_sample.csv')

In [7]:
bio_df = pd.read_csv('./data/bio_sample.csv')

In [8]:
overall_df = pd.read_csv('./data/overall_sample.csv')

In [9]:
review_df = pd.read_csv('./data/review_sample.csv')

### Functions

In [10]:
def best_params(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)

    gs.fit(X_train, y_train)
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [11]:
def return_gs(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)
    return gs

In [12]:
def scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

In [13]:
def predictions(pipeline, X_train, X_test, y_train):
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    
    return prediction

In [14]:
def classification_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['Recall', 'Precision', 'F1', 'Accuracy'])
    
    recall = recall_score(y_test, y_pred, average = 'weighted')
    precision = precision_score(y_test, y_pred, average = 'weighted')
    f1 = f1_score(y_test, y_pred, average = 'weighted')
    accuracy = accuracy_score(y_test, y_pred)
    
    dataframe.loc[model] = [recall, precision, f1, accuracy]
    
    return dataframe

In [15]:
def my_lemmatizer(text):
    wnet = WordNetLemmatizer()
    # exclude words with apostrophes and numbers
    return [wnet.lemmatize(w) for w in text.split() if "'" not in w and not w.isdigit()]

In [16]:
wnet = WordNetLemmatizer()
lem_stopwords = [wnet.lemmatize(w) for w in stopwords.words('english')]

contractions = ['ve', 't', "'s'", 'd', 'll', 'm', 're']
lem_contractions = [wnet.lemmatize(contraction) for contraction in contractions]

numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
lem_numbers = [wnet.lemmatize(num) for num in numbers]

lem_stopwords = lem_stopwords + lem_contractions + lem_numbers

### Fiction 

In [16]:
X_fiction = fiction_df['description']
y_fiction = fiction_df['Title']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_fiction, y_fiction, random_state=42)

In [18]:
fiction_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 500)),
    ('mnb', MultinomialNB(alpha = 0.5))
])

In [18]:
fiction_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 500)),
    ('rfc', RandomForestClassifier(max_features = 500))
])

In [19]:
fiction_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
    'mnb__alpha': [0.05, 0.1],
    'mnb__fit_prior': [False]
}

In [None]:
best_params(fiction_pipe, fiction_params, X_train, y_train)

In [20]:
fiction_gs = return_gs(fiction_pipe, fiction_params, X_train, y_train)

In [21]:
scores(fiction_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.8729866666666667, Test Score: 0.81104'

In [22]:
fiction_pred = predictions(fiction_pipe, X_train, X_test, y_train)

In [23]:
classification_scores('Fiction', y_test, fiction_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
Fiction,0.67144,0.515462,0.57021,0.67144


In [38]:
with open('fiction_pipe.pkl', 'wb') as f:
    pickle.dump(fiction_pipe, f)

### Juvenile Fiction Model (finished)

In [36]:
X_jvf = jvf_df['description']
y_jvf = jvf_df['Title']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_jvf, y_jvf, random_state=42)

In [38]:
jvf_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    ('rfc', RandomForestClassifier(max_features = 1_000))
])

In [39]:
jvf_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)]
}

In [None]:
best_params(jvf_pipe, jvf_params, X_train, y_train)

In [40]:
jvf_gs = return_gs(jvf_pipe, jvf_params, X_train, y_train)

In [41]:
scores(jvf_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.7850666666666667, Test Score: 0.75784'

In [42]:
jvf_pred = predictions(jvf_pipe, X_train, X_test, y_train)

In [43]:
classification_scores('Juvenile Fiction', y_test, jvf_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
Juvenile Fiction,0.84768,0.774386,0.801599,0.84768


In [44]:
with open('jvf_pipe.pkl', 'wb') as f:
    pickle.dump(jvf_pipe, f)

### Biography & Autobiography 

In [23]:
X_bio = bio_df['description']
y_bio = bio_df['Title']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_bio, y_bio, random_state=42)

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
bio_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    ('rfc', RandomForestClassifier(max_features = 1_000))
])

In [27]:
bio_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
}

In [None]:
best_params(bio_pipe, bio_params, X_train, y_train)

In [28]:
bio_gs = return_gs(bio_pipe, bio_params, X_train, y_train)

In [29]:
scores(bio_gs, X_train, y_train, X_test, y_test)



KeyboardInterrupt: 

In [32]:
scores(bio_gs, X_train, y_train, X_test, y_test)



'Train Score: 0.90608, Test Score: 0.89288'

In [33]:
bio_pred = predictions(bio_pipe, X_train, X_test, y_train)

In [48]:
classification_scores('Biography & Autobiography', y_test, bio_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Recall,Precision,F1,Accuracy
Biography & Autobiography,0.0,0.0,0.0,0.0


In [35]:
with open('bio_pipe.pkl', 'wb') as f:
    pickle.dump(bio_pipe, f)

### Overall, by Authors & Desc

In [17]:
X_overall = overall_df['description']
y_authors = overall_df['authors']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_overall, y_authors, random_state=42)

In [19]:
overall_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    ('rfc', RandomForestClassifier(max_features = 1_000))
])

In [20]:
overall_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
}

In [None]:
best_params(overall_pipe, overall_params, X_train, y_train)

In [21]:
overall_gs = return_gs(overall_pipe, overall_params, X_train, y_train)

In [22]:
scores(overall_gs, X_train, y_train, X_test, y_test)


KeyboardInterrupt



In [None]:
overall_pred = predictions(overall_pipe, X_train, X_test, y_train)

In [None]:
classification_scores('Authors, by Description', y_test, overall_pred)

In [None]:
with open('overall_pipe.pkl', 'wb') as f:
    pickle.dump(overall_pipe, f)

### By Title & Desc

### Most Reviewed, by Authors

In [None]:
X_reviews = review_df['reviews']
y_authors = review_df['authors']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_reviews, y_authors, random_state=42)

In [None]:
review_pipe = Pipeline([
    ('tf', TfidfVectorizer(stop_words = lem_stopwords, 
                           tokenizer = my_lemmatizer,
                           token_pattern = None,
                           max_features = 1_000)),
    ('rfc', RandomForestClassifier(max_features = 1_000))
])

In [None]:
review_params = {
    'tf__min_df': [0.05, 0.1],
    'tf__max_df': [0.5],
    'tf__ngram_range': [(1,1)],
}

In [None]:
review_gs = review_gs(review_pipe, review_params, X_train, y_train)

In [None]:
scores(review_gs, X_train, y_train, X_test, y_test)

In [None]:
review_pred = predictions(review_pipe, X_train, X_test, y_train)

In [None]:
classification_scores('Authors, by Reviews', y_test, review_pred)

### Table to look at classification scores

In [45]:
table = classification_scores('Juvenile Fiction', y_test, jvf_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
table = pd.concat([table, classification_scores('Biography & Autobiography', y_test, bio_pred)])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
table

Unnamed: 0,Recall,Precision,F1,Accuracy
Juvenile Fiction,0.84768,0.774386,0.801599,0.84768
Biography & Autobiography,0.0,0.0,0.0,0.0
