In [None]:
from google.colab import drive
import sys

# Mount Google Drive
drive.mount('/content/drive')
abs_path = '/content/drive/My Drive/Statistical Deep Learning/Final Project/'
data_path = '/content/drive/My Drive/Statistical Deep Learning/Final Project/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Autorship Identification
By Nick Blackmore and Samuel Homan

In [None]:
import math
import matplotlib.pyplot as plt
import keras
import pandas as pd
import numpy as np
import os
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

from scipy.stats import logistic

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

tfds.disable_progress_bar()

## 1. Pre-Processing

### Creating the Training and Validation sets.

The training data will consist of 14 novels written by five authors. Each novel will be broken up into random selections (of about 5 lines each) of the 15 novels and lablels correpsonding to their respective authors. Ten percent of the data will be held out for validation during training. The testing data will be created from different novels entirely to see how each of the models will generalize. This is a scenario that is somewhat unique to authorship identification. In fact, we expect our model to overfit the training data compared to the testing data, bu our model should do well compared to the validation data. 

In [None]:
titles = os.listdir(data_path)
titles.sort()
titles.remove("test")
print(titles)
titles = [i.split(".")[0] for i in titles]
authors = [0,1,2,3,4,4,2,3,0,2,1,3,0,0]
titles_dict = dict(zip(titles, authors))
titles_dict

['callofwild.txt', 'gatsby.txt', 'greenhills.txt', 'huckfinn.txt', 'littlewomen.txt', 'oldfashionedgirl.txt', 'oldmanandthesea.txt', 'pickles', 'princeandpauper.txt', 'seawolf.txt', 'sunalsorises.txt', 'tenderisthenight.txt', 'tomsawyer.txt', 'whitefang.txt']


{'callofwild': 0,
 'gatsby': 1,
 'greenhills': 2,
 'huckfinn': 3,
 'littlewomen': 4,
 'oldfashionedgirl': 4,
 'oldmanandthesea': 2,
 'pickles': 3,
 'princeandpauper': 0,
 'seawolf': 2,
 'sunalsorises': 1,
 'tenderisthenight': 3,
 'tomsawyer': 0,
 'whitefang': 0}

In [None]:
titles = os.listdir(data_path)
titles.sort()

### Reading in the Text Files

In [None]:
master_label_list = []
word_vectors = []

max_len = 0

for file in titles:
  
  if os.path.isfile(os.path.join(data_path, file)):
    try:
      with open(os.path.join(data_path, file), encoding = "utf-8") as f:
        lines = f.readlines()
        
    except:
      try:
        with open(os.path.join(data_path, file), encoding = "ISO-8859-1") as f:
          lines = f.readlines()
      except:
        pass 

    
    num_lines = len(lines)
    print(num_lines)
    indices_sample = np.random.randint(0, num_lines - 5, size=2000)
    author = titles_dict[file.split(".")[0]]

    for i in indices_sample:
      lines_sample = lines[i:i+5]
      length = len(lines_sample)
      if length > max_len:
        max_len = length
      lines_together = " ".join(lines_sample)
      lines_together = lines_together.replace("\n", "")
      lines_together = lines_together.replace("“", "")
      lines_together = lines_together.replace("”", "")
      lines_together = lines_together.replace("’", "'")

      word_vectors.append(lines_together)

      length = len(lines_together.split(" "))
      if length > max_len:
        max_len = length

      master_label_list.append(author)

print(max_len)

3031
4632
5988
9096
16449
8809
2202
6172
9109
6769
10364
6384
6235
129


### Training and Validation Data

In [None]:
X_train, X_val, y_train, y_val = train_test_split(word_vectors, master_label_list, test_size=0.1, random_state=41)

In [None]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)) 
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))

In [None]:
for i in train_ds.take(10):
  print(i)

(<tf.Tensor: shape=(), dtype=string, numpy=b'silent except for the dip of the oars.  They spread apart after they were out of the mouth of the harbour and each one headed for the part of the ocean where he hoped to find fish.  The old man knew he was going far out and he left the smell of the land behind and rowed out into the clean early morning smell of the ocean.  He saw the'>, <tf.Tensor: shape=(), dtype=int32, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'that he must submit to the will of this new master, obey his every whim and fancy. CHAPTER III THE REIGN OF HATE Under the tutelage of the mad god, White Fang became a fiend. He was'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'rum and lighted cigarettes for him.  Then the negro, after the rum, would try for a tremendous effort and once he had the old man, who was not an old man then but was Santiago El Campeon, nearly three inches off balance.  But the old man had raised hi

### Creating the testing dataset.
Will be contructed in a similar way as the training and validation data, but with different novels. 

In [None]:
data_path_test = os.path.join(data_path, "test")

titles = os.listdir(os.path.join(data_path, "test"))
titles.sort()
print(titles)
titles = [i.split(".")[0] for i in titles]
authors = [2,4,0,3,1]
titles_dict = dict(zip(titles, authors))
titles_dict

master_label_list = []
word_vectors = []

titles = os.listdir(os.path.join(data_path,"test"))
titles.sort()

for file in titles:
  try:
    with open(os.path.join(data_path_test, file), encoding = "utf-8") as f:
      lines = f.readlines()

  except:
    with open(os.path.join(data_path_test, file), encoding = "ISO-8859-1") as f:
      lines = f.readlines()
  
  num_lines = len(lines)
  print(num_lines)
  indices_sample = np.random.randint(0, num_lines - 5, size=1000)
  author = titles_dict[file.split(".")[0]]

  spacings = num_lines // (num_lines // 5)
  for i in range(0, 5000 , 5):
    lines_sample = lines[i:i+5]
    lines_together = " ".join(lines_sample)
    lines_together = lines_together.replace("\n", "")
    lines_together = lines_together.replace("“", "")
    lines_together = lines_together.replace("”", "")
    lines_together = lines_together.replace("’", "'")

    length = len(lines_together)
    if length > max_len:
      max_len = length

    word_vectors.append(lines_together)
    master_label_list.append(author)

print("Max is",max_len)

X_test = word_vectors
y_test = master_label_list

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)) 

for i in test_ds.take(5):
  print(i)

['belltolls.txt', 'littlemen.txt', 'martineden.txt', 'puddnhead.txt', 'thissideparadise.txt']
15905
9080
12155
4552
8728
Max is 363
(<tf.Tensor: shape=(), dtype=string, numpy=b'He lay flat on the brown, pine-needled floor of the forest, his chin on his folded arms, and high overhead the wind blew in the tops of the pine trees. The mountainside sloped gently where he lay; but below it was steep and he could see the dark of the oiled road winding through the pass. There was a stream alongside the road and far down the pass he saw'>, <tf.Tensor: shape=(), dtype=int32, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'a mill beside the stream and the falling water of the dam, white in the summer sunlight. Is that the mill? he asked. Yes. I do not remember it.'>, <tf.Tensor: shape=(), dtype=int32, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"It was built since you were here. The old mill is farther down; much below the pass. He spread the photostated military map out on the f

### Convert Our Data to Tensorflow Datasets and create batches

In [None]:
train_ds = train_ds.batch(64).shuffle(100).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(64).prefetch(tf.data.AUTOTUNE)


In [None]:
test_ds = test_ds.batch(64).prefetch(tf.data.AUTOTUNE)

## 2. Model 1

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)

# creating the word embedding
VOCAB_SIZE = 10000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_ds.map(lambda text, label: text))


model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(.2),
    tf.keras.layers.Dense(5)
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['sparse_categorical_accuracy'])

In [None]:
history_1 = model.fit(train_ds, epochs=25,
                    validation_data=val_ds,
                    callbacks=[callback])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25


### Saving the model so we don't have to run it every time

In [None]:
models_path = os.path.join(abs_path, "models")
model_1_path = os.path.join(models_path, "model1")
# Save the entire model as a SavedModel.
# model.save(model_1_path)

model1 = tf.keras.models.load_model(model_1_path)

In [None]:
test_loss, test_acc = model1.evaluate(test_ds)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 3.2074942588806152
Test Accuracy: 0.5162000060081482


### Model Results and Visualizations

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

predict_x = model1.predict(test_ds) 
predictions = np.argmax(predict_x, axis=1)

y_test_labels = {0:"London", 1:"Fitzgerald", 2:"Hemingway", 3:"Twain", 4:"Alcott"}
labels = list(y_test_labels.values())

print(classification_report(np.array(y_test), predictions))
conf_matrix = pd.DataFrame(confusion_matrix(np.array(y_test), predictions), index=labels, columns=labels)
conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix

              precision    recall  f1-score   support

           0       0.51      0.34      0.41      1000
           1       0.37      0.59      0.46      1000
           2       0.53      0.54      0.53      1000
           3       0.54      0.51      0.52      1000
           4       0.78      0.61      0.68      1000

    accuracy                           0.52      5000
   macro avg       0.54      0.52      0.52      5000
weighted avg       0.54      0.52      0.52      5000



  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,London,Fitzgerald,Hemingway,Twain,Alcott
London,0.34,0.384,0.147,0.079,0.05
Fitzgerald,0.111,0.59,0.105,0.139,0.055
Hemingway,0.108,0.248,0.535,0.076,0.033
Twain,0.063,0.248,0.143,0.509,0.037
Alcott,0.041,0.117,0.088,0.147,0.607


In [None]:
import plotly.express as px
fig = px.imshow(conf_matrix,
                labels=dict(x="Predictions", y="Ground Truth", color="Accuracy"),
                x=labels,
                y=labels,
                text_auto=True
               )
fig.update_xaxes(side="top")
fig.show()

In [None]:
# Validation Predictions

predict_x_val = model.predict(val_ds) 
predictions_val = np.argmax(predict_x_val, axis=1)

y_test_labels = {0:"London", 1:"Fitzgerald", 2:"Hemingway", 3:"Twain", 4:"Alcott"}
labels = list(y_test_labels.values())

print(classification_report(np.array(y_val), predictions_val))
conf_matrix = pd.DataFrame(confusion_matrix(np.array(y_val), predictions_val), index=labels, columns=labels)
conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix

import plotly.express as px
fig = px.imshow(conf_matrix,
                labels=dict(x="Validation Predictions", y="Ground Truth", color="Accuracy"),
                x=labels,
                y=labels,
                text_auto=True
               )
fig.update_xaxes(side="top")
fig.show()

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       796
           1       0.90      0.82      0.86       404
           2       0.90      0.93      0.92       604
           3       0.89      0.92      0.91       398
           4       0.95      0.94      0.95       398

    accuracy                           0.92      2600
   macro avg       0.92      0.92      0.92      2600
weighted avg       0.92      0.92      0.92      2600




Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.



In [None]:
import plotly.graph_objects as go
epochs = list(range(1, 26))

fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y= history_1.history['sparse_categorical_accuracy'],
                    mode='lines+markers',
                    name='training accuracy'))
fig.add_trace(go.Scatter(x=epochs, y= history_1.history['val_sparse_categorical_accuracy'],
                    mode='lines+markers',
                    name='validation accuracy'))

fig.update_layout(title="Training Accuracy", xaxis_title="Epochs", yaxis_title="Accuracy")
fig.show()

In [None]:
import plotly.graph_objects as go
epochs = list(range(1, 26))

fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y= history_1.history['loss'],
                    mode='lines+markers',
                    name='training loss'))
fig.add_trace(go.Scatter(x=epochs, y= history_1.history['val_loss'],
                    mode='lines+markers',
                    name='validation loss'))

fig.update_layout(title="Training Loss", xaxis_title="Epochs", yaxis_title="Loss")
fig.show()

## 3. Model 2
### Binary Classification of Authors
We created a model that makes predictions in a binary fashion. We adjusted the testing set for this model so that there was no data imbalance, and we calculated class weights to handle the imbalannce during model training.

In [None]:
y_train_hem = [1 if i == 2 else 0 for i in y_train]
y_val_hem = [1 if i == 2 else 0 for i in y_val]
y_test_hem_temp = [1 if i == 2 else 0 for i in y_test]


# creating a balanced test dataset
init = y_test_hem_temp.count(1)
count = 0
X_test_hem = []
y_test_hem = []
for i in range(len(y_test_hem_temp)):
  if y_test_hem_temp[i] == 1:
    X_test_hem.append(X_test[i])
    y_test_hem.append(y_test_hem_temp[i])
  else:
    if count < init:
      X_test_hem.append(X_test[i])
      y_test_hem.append(y_test_hem_temp[i])
      count += 1
    else:
      continue

print(init)
print(len(X_test_hem), len(y_test_hem))

1000
2000 2000


In [None]:
train_ds_hem = tf.data.Dataset.from_tensor_slices((X_train, y_train_hem)) 
val_ds_hem = tf.data.Dataset.from_tensor_slices((X_val, y_val_hem))
test_ds_hem = tf.data.Dataset.from_tensor_slices((X_test_hem, y_test_hem))

In [None]:
train_ds_hem = train_ds_hem.batch(64).shuffle(100).prefetch(tf.data.AUTOTUNE)
val_ds_hem = val_ds_hem.batch(64).prefetch(tf.data.AUTOTUNE)
test_ds_hem = test_ds_hem.batch(64).prefetch(tf.data.AUTOTUNE)

# Creating weights to handle class imbalance of the training data. 
class_weights = {1: len(y_train_hem)/(2*y_train_hem.count(1)), 0: len(y_train_hem)/(2*y_train_hem.count(0))}
class_weights

{0: 0.6498555876471895, 1: 2.168272794662713}

### Model 2

In [None]:
VOCAB_SIZE = 10000
encoder_hem = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder_hem.adapt(train_ds_hem.map(lambda text, label: text))

model_hem = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder_hem.get_vocabulary()),
        output_dim=64,
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(.2),
    tf.keras.layers.Dense(1)
])

model_hem.compile(loss="binary_crossentropy",
              optimizer=tf.keras.optimizers.Adam(5e-5),
              metrics=['accuracy'])

In [None]:
history_hem = model_hem.fit(train_ds_hem, epochs=5,
                    validation_data=val_ds_hem,
                    class_weight=class_weights)

# This model trains much faster than the previous one. The number of 5 epochs was 
# chosen because we had a hard time finding a learning rate that didn't shoot past
#  the optimal loss and result in a poor model. 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_hem_path = os.path.join(models_path, "modelhem")
# Save the entire model
# model_hem.save(model_hem_path)

model_hem = tf.keras.models.load_model(model_hem_path)

In [None]:
test_loss, test_acc = model_hem.evaluate(test_ds_hem)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 1.0150622129440308
Test Accuracy: 0.6980000138282776


### Sample Predictions on Unseen Novels
In this section we wanted to show what some sample predictions looked like for our model.

In [None]:
# passage from *A Farewell to Arms*
output1 = model_hem.predict(np.array(["If people bring so much courage to this world the world has to kill them to break them, \
so of course it kills them. The world breaks every one and afterward many are strong at the broken places. \
But those that will not break it kills. It kills the very good and the very gentle and the very brave impartially. \
If you are none of these you can be sure it will kill you too but there will be no special hurry."]))
print(logistic.cdf(output1[0])) # 94 percent Hemingway -- correct

# passage from *A Moveable Feast*
output2 = model_hem.predict(np.array(["You expected to be sad in the fall. Part of you died each year when the\
 leaves fell from the trees and their branches were bare against the wind and the cold, wintery light. \
 But you knew there would always be the spring, as you knew the river would flow again after it was frozen.”"]))
print(logistic.cdf(output2[0])) # 61 percent Hemingway -- correct (but not as good)

# passage from *A Farewell to Arms*
output3 = model_hem.predict(np.array(["“The world breaks everyone and afterward many are strong at the broken places.\
 But those that will not break it kills. It kills the very good and the very gentle and the very brave impartially. \
 If you are none of these you can be sure it will kill you too but there will be no special hurry."]))
print(logistic.cdf(output3[0])) # 87 percent Hemingway -- correct

# beginning from *A Tale of Two Cities* by Charles Dickens
output4 = model_hem.predict(np.array(["It was the best of times, it was the worst of times, it was the age of wisdom, \
it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, \
it was the season of Darkness, it was the spring of hope, it was the winter of despair."]))
print(logistic.cdf(output4[0])) # 60 percent Hemingway -- wrong

# passage from *The Innocents Abroad* by Mark Twain
output5 = model_hem.predict(np.array(["Travel is fatal to prejudice, bigotry, and narrow-mindedness, and many of our \
people need it sorely on these accounts. Broad, wholesome, charitable views of men and things cannot be acquired by\
 vegetating in one little corner of the earth all one's lifetime."]))
print(logistic.cdf(output5[0])) # 49 percent Hemingway -- correct

# passage from *The Prisoner of Azkaban* by JK Rowling
output6 = model_hem.predict(np.array(["Mr. Moony presents his compliments to Professor Snape, and begs him to keep his abnormally large nose out of other people's business. \
Mr. Prongs agrees with Mr. Moony, and would like to add that Professor Snape is an ugly git. \
Mr. Padfoot would like to register his astonishment that an idiot like that ever became a professor. \
Mr. Wormtail bids Professor Snape good day, and advises him to wash his hair, the slimeball."]))
print(logistic.cdf(output6[0])) # 48.8 percent Hemingway -- correct

[0.94089621]
[0.61032006]
[0.87944988]
[0.60728842]
[0.49633349]
[0.4881808]


### Results (Accuracy)

Exploration of the Validation results set told us that the optimal decision boundary was at the output of the model transformed by the sigmoid at 0.55. For brevity we are not showing the validattion results for the changed decision boundary, but rather the updated decision boundary results for the test set (completely unseen novels).

In [None]:
predict_x = model_hem.predict(test_ds_hem).flatten()
predictions_hem = [1 if logistic.cdf(i) > 0.55 else 0 for i in predict_x]

y_test_labels = {0:"Not Hemingway", 1:"Hemingway"}
labels = list(y_test_labels.values())


print(classification_report(np.array(y_test_hem), predictions_hem))
conf_matrix = pd.DataFrame(confusion_matrix(np.array(y_test_hem), predictions_hem), index=labels, columns=labels)
conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix

import plotly.express as px
fig = px.imshow(conf_matrix,
                labels=dict(x="Predictions", y="Ground Truth", color="Accuracy"),
                x=labels,
                y=labels,
                text_auto=True
               )
fig.update_xaxes(side="top")
fig.show()
conf_matrix

              precision    recall  f1-score   support

           0       0.75      0.82      0.78      1000
           1       0.80      0.73      0.77      1000

    accuracy                           0.78      2000
   macro avg       0.78      0.78      0.78      2000
weighted avg       0.78      0.78      0.78      2000




Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.



Unnamed: 0,Not Hemingway,Hemingway
Not Hemingway,0.818,0.182
Hemingway,0.267,0.733


# Model 3: Part of speech analysis. 
Training a model on only the part of speech rather than the vocabulary choices of each author. Our idea is that some authors may repeat certain words that make them easier to identify. While this is also of interest, we are also curious about the whether neural networks can pick up on sentence structure as well as on vocabulary choice. To test this choice. We will be encoding every word in our text corpus as it's part of speech (POS). We will then train this data on neural networks to see. 

For this model we chose a convolutional neural network instead. The reasons for this were two-fold. First, the CNN did far better in terms of predictions. And second, we really felt like a CNN would better be able to find complex patterns between the positions of each part of speech.

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Tag each word in the dataset with its part of speech (POS)

In [None]:
def tag_pos(data):
  tag_vecs = []
  for text in data:
    if len(tag_vecs) % 100 == 0:
      print(len(tag_vecs))
    tags = nltk.pos_tag(text)
    tag_list = [j for i,j in tags]
    tag_list = " ".join(tag_list)
    tag_vecs.append(tag_list)
  return tag_vecs

X_train_pos = tag_pos(X_train)
X_val_pos = tag_pos(X_val)
X_test_pos = tag_pos(X_test_hem)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [None]:
pickle_path = os.path.join(data_path, "pickles")
import pickle

# The below code just saves files that take a long time to generate for ease of
# access. The chunk of code only needs to be run one time.
"""with open(os.path.join(pickle_path, "X_train_pos.pkl"), 'wb') as f:
  pickle.dump(X_train_pos, f)

with open(os.path.join(pickle_path, "X_val_pos.pkl"), 'wb') as f:
  pickle.dump(X_val_pos, f)

with open(os.path.join(pickle_path, "X_test_pos.pkl"), 'wb') as f:
  pickle.dump(X_test_pos, f)

with open(os.path.join(pickle_path, "y_train_pos.pkl"), 'wb') as f:
  pickle.dump(y_train_hem, f)

with open(os.path.join(pickle_path, "y_val_pos.pkl"), 'wb') as f:
  pickle.dump(y_val_hem, f)

with open(os.path.join(pickle_path, "y_test_pos.pkl"), 'wb') as f:
  pickle.dump(y_test_hem, f)"""

'with open(os.path.join(pickle_path, "X_train_pos.pkl"), \'wb\') as f:\n  pickle.dump(X_train_pos, f)\n\nwith open(os.path.join(pickle_path, "X_val_pos.pkl"), \'wb\') as f:\n  pickle.dump(X_val_pos, f)\n\nwith open(os.path.join(pickle_path, "X_test_pos.pkl"), \'wb\') as f:\n  pickle.dump(X_test_pos, f)\n\nwith open(os.path.join(pickle_path, "y_train_pos.pkl"), \'wb\') as f:\n  pickle.dump(y_train_hem, f)\n\nwith open(os.path.join(pickle_path, "y_val_pos.pkl"), \'wb\') as f:\n  pickle.dump(y_val_hem, f)\n\nwith open(os.path.join(pickle_path, "y_test_pos.pkl"), \'wb\') as f:\n  pickle.dump(y_test_hem, f)'

In [None]:
with open(os.path.join(pickle_path, "X_train_pos.pkl"), 'rb') as f:
  X_train_pos = pickle.load(f)

with open(os.path.join(pickle_path, "X_test_pos.pkl"), 'rb') as f:
  X_test_pos = pickle.load(f)

with open(os.path.join(pickle_path, "X_val_pos.pkl"), 'rb') as f:
  X_val_pos = pickle.load(f)

with open(os.path.join(pickle_path, "y_train_pos.pkl"), 'rb') as f:
  y_train_pos = pickle.load(f)

with open(os.path.join(pickle_path, "y_test_pos.pkl"), 'rb') as f:
  y_test_pos = pickle.load(f)

with open(os.path.join(pickle_path, "y_val_pos.pkl"), 'rb') as f:
  y_val_pos = pickle.load(f)

### One hot encoding each part of speech so that we can use a CNN. Otherwises the Data set would not be two dimensional!

In [None]:
temp = [i.split(" ") for i in X_train_pos] + [i.split(" ") for i in X_test_pos] + [i.split(" ") for i in X_val_pos]
pos = np.concatenate(np.array(temp).flatten()).tolist()
final_pos = list(set(pos))

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(final_pos)

X_train_pos1 = [le.transform(i.split(" ")).tolist() for i in X_train_pos]
X_val_pos1 = [le.transform(i.split(" ")).tolist() for i in X_val_pos]
X_test_pos1 = [le.transform(i.split(" ")).tolist() for i in X_test_pos]


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



### PAdding sequences so they are all the same length
In our previous LSTM's, the model Text Vectorization handles this for us. Here, we had to do it ourselves. 

In [None]:
from keras.preprocessing import sequence


X_train_pos1 = sequence.pad_sequences(X_train_pos1, maxlen=max_len)
X_val_pos1 = sequence.pad_sequences(X_val_pos1, maxlen=max_len)
X_test_pos1 = sequence.pad_sequences(X_test_pos1, maxlen=max_len)

In [None]:
X_train_pos1.shape, X_test_pos1.shape

((23400, 363), (2000, 363))

### Model 3

In [None]:
callback2 = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=4)

# Building the CNN Model
model2 = Sequential()      # initilaizing the Sequential nature for CNN model

model2.add(Embedding(len(final_pos), 10, input_length=max_len))
model2.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Flatten())
model2.add(Dense(256, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1))
model2.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(1e-5), metrics=['accuracy'])
model2.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 363, 10)           330       
                                                                 
 conv1d_3 (Conv1D)           (None, 363, 128)          5248      
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 181, 128)         0         
 1D)                                                             
                                                                 
 conv1d_4 (Conv1D)           (None, 181, 64)           32832     
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 90, 64)           0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 90, 32)           

In [None]:
history_pos = model2.fit(X_train_pos1, np.array(y_train_pos), validation_data=(X_val_pos1, np.array(y_val_pos)),
           epochs=70, batch_size=60, class_weight=class_weights, callbacks=[callback2])

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70


In [None]:
model_pos_path = os.path.join(models_path, "modelpos")
# Save the entire model as a SavedModel.
# model2.save(model_pos_path)

model_pos = tf.keras.models.load_model(model_pos_path)

In [None]:
scores = model_pos.evaluate(X_test_pos1, np.array(y_test_pos), verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 58.85%


### Results

In [None]:
# manually tuning the decision boundary like we did in the previous part. I 
# oringinally did this on the validation data only which is how I got .57.

from sklearn.metrics import confusion_matrix, classification_report

predict_x = model_pos.predict(X_test_pos1) 
predictions = [1 if logistic.cdf(i) > .57 else 0 for i in predict_x]

y_test_labels = {0:"Not Hemingway", 1:"Hemingway"}
labels = list(y_test_labels.values())

print(classification_report(np.array(y_test_pos), predictions))
conf_matrix = pd.DataFrame(confusion_matrix(np.array(y_test_pos), predictions), index=labels, columns=labels)
conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]


import plotly.express as px
fig = px.imshow(conf_matrix,
                labels=dict(x="Predictions", y="Ground Truth", color="Accuracy"),
                x=labels,
                y=labels,
                text_auto=True
               )
fig.update_xaxes(side="top")
fig.show()

conf_matrix

              precision    recall  f1-score   support

           0       0.64      0.78      0.70      1000
           1       0.72      0.56      0.63      1000

    accuracy                           0.67      2000
   macro avg       0.68      0.67      0.66      2000
weighted avg       0.68      0.67      0.66      2000




Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.



Unnamed: 0,Not Hemingway,Hemingway
Not Hemingway,0.782,0.218
Hemingway,0.445,0.555


In [None]:
import plotly.graph_objects as go
epochs = list(range(1, 49))

fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y= history_pos.history['accuracy'],
                    mode='lines+markers',
                    name='training accuracy'))
fig.add_trace(go.Scatter(x=epochs, y= history_pos.history['val_accuracy'],
                    mode='lines+markers',
                    name='validation accuracy'))

fig.update_layout(title="Training Accuracy", xaxis_title="Epochs", yaxis_title="Accuracy")
fig.show()

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y= history_pos.history['loss'],
                    mode='lines+markers',
                    name='training loss'))
fig.add_trace(go.Scatter(x=epochs, y= history_pos.history['val_loss'],
                    mode='lines+markers',
                    name='validation loss'))

fig.update_layout(title="Training Loss", xaxis_title="Epochs", yaxis_title="Loss")
fig.show()

### Please see our poster and powerpoint for a more in depth treatment of the methods we used and our conclusions about the effectiveness of the project.