In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import sklearn
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.vocab import GloVe
import spacy
import nltk
from nltk.stem import WordNetLemmatizer 
import plotly.graph_objects as go
nltk.download('wordnet')
nltk.download('punkt') # Download this as this allows you to tokenize words in a string.
lemmatizer = WordNetLemmatizer()

import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
# Load in data, and split it into training and testing
# This will create train/test csvs
!python extract_sentences.py
!python sample_data.py

In [32]:
# Combine Torchtext with Spacy 
# https://towardsdatascience.com/how-to-use-torchtext-for-neural-machine-translation-plus-hack-to-make-it-5x-faster-77f3884d95
text = Field(sequential = True, tokenize = 'spacy', lower=True, include_lengths=True)
score = Field(sequential = False, use_vocab = False)

data_fields = [('text', text), ('Author_num', score)]

train = TabularDataset(path = 'train.csv', format = 'csv', fields = data_fields, skip_header = True)
test = TabularDataset(path = 'test.csv', format = 'csv', fields = data_fields, skip_header = True)

text.build_vocab(train, test, min_freq = 3, vectors = GloVe(name = '6B', dim = 100))


Spacy model "en" could not be loaded, trying "en_core_web_sm" instead



In [33]:
# Iterators
# Based on https://github.com/arthtalati/Deep-Learning-based-Authorship-Identification/blob/master/Article_level_lstm.ipynb
# And torch documentation
device = torch.device('cuda:0')
BATCH_SIZE = 64

train_iterator = BucketIterator(train,
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    repeat = False,
    shuffle = True,
    device = device
)

test_iterator = BucketIterator(test,
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    repeat = False,
    shuffle = True,
    device = device
)

In [34]:
# NETWORK ARCHITECTURE
class AuthorClassifier(nn.Module):
    def __init__(self, mode, output_size, hidden_size, vocab_size, embedding_length, word_embeddings):
        super(AuthorClassifier, self).__init__()
        
        self.mode = mode
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        self.embedding = nn.Embedding(self.vocab_size,self.embedding_length)
        self.embedding.weight = nn.Parameter(word_embeddings,requires_grad = False)

        if self.mode == 'rnn':
            self.network = nn.RNN(self.embedding_length,self.hidden_size)
        elif self.mode == 'lstm':
            self.network = nn.LSTM(self.embedding_length,self.hidden_size)
        elif self.mode == 'gru':
            self.network = nn.GRU(self.embedding_length,self.hidden_size)
        elif self.mode == 'bilstm':
            self.network = nn.LSTM(self.embedding_length,self.hidden_size,bidirectional = True)

        self.fclayer = nn.Linear(self.hidden_size, self.output_size)
    
    def forward(self, text, text_lengths):
      text_embeddings = self.embedding(text)
      pack_sequence = nn.utils.rnn.pack_padded_sequence(text_embeddings,text_lengths.to('cpu'))

      if self.mode in ('lstm','bilstm'):
        a,(hidden,cell) = self.network(pack_sequence)
        if self.mode == 'bilstm':
          hidden = hidden[0,:,:]+ hidden[1,:,:]
      else:
        a,hidden = self.network(pack_sequence) 
      hidden = hidden.squeeze(0)
      pred = self.fclayer(hidden)
      return pred



In [35]:
def train_classifier(model, dataset_iterator, loss_function, optimizer, num_epochs, log = "runs", verbose = False, recurrent = True):
 
  model.train()
  step = 0
  f1score_train = []
  accuracy_train = []
  loss_train = []
  for epoch in range(num_epochs):
    correct = 0
    total = 0
    total_loss = 0
    f1 = 0
    f1_step = 0
    
    for batch in dataset_iterator:
      comment, comment_lengths = batch.text
      labels = batch.Author_num

      batch_size = len(labels)
      optimizer.zero_grad()
      output = model(comment, comment_lengths).squeeze(0)

      loss = loss_function(output, labels.long())
      loss.backward() 
      nn.utils.clip_grad_norm_(model.parameters(),0.5)
      optimizer.step()

      pred = torch.max(output.data,1).indices
      f1 += sklearn.metrics.f1_score((labels.cpu()).numpy(), (pred.cpu()).numpy(),average= 'macro')
      correct += (torch.sum(pred == labels)).item()
      total += len(labels)
      total_loss += loss.item()
      f1_step += 1

      step = step + 1
    f1score_train.append(f1/f1_step)
    loss_train.append(total_loss/total)
    accuracy_train.append(correct/total)
    print('---Training statistics---',"Epoch: %s Acc: %s Loss: %s"%(epoch+1, correct/total, total_loss/total),'F1 Score:',f1/f1_step,)
    return loss_train,f1score_train,accuracy_train
  
def evaluate_classifier(model, dataset_iterator, loss_function, recurrent = True):
  model.eval()

  correct = 0
  total = 0
  total_loss = 0
  overall_pred = []
  overall_label = []
  accuracy_test = []
  loss_test = []
  f1_step = 0
  f1 = 0

  for batch in dataset_iterator:
    comment, comment_lengths = batch.text
    labels = batch.Author_num
    output = model(comment, comment_lengths).squeeze(0)
    loss = loss_function(output, labels.long())
    pred = torch.max(output.data,1).indices 
    correct += (torch.sum(pred == labels)).item()
    total += len(labels)
    total_loss += loss.item()
    ap = pred.cpu()
    a = np.asarray(ap)
    labels = labels.cpu()
    b = np.asarray(labels)
    f1_step += 1
    overall_pred.append(a)
    overall_label.append(b)

  overall_p= [val for sublist in overall_pred for val in sublist]
  overall_l = [val for sublist in overall_label for val in sublist]
  f1ss = sklearn.metrics.f1_score(overall_l,overall_p,average= 'macro')
  accuracy_test.append(correct/total)
  loss_test.append(total_loss/total)
  print("Validation statistics: Acc: %s Loss: %s"%(correct/total, total_loss/total),'F1 Score:',f1ss)
  return overall_pred,overall_label,accuracy_test,f1ss,loss_test

In [36]:
output_size = 50
hidden_size = 300
vocab_size = len(text.vocab)
embedding_length = 100
word_embeddings = text.vocab.vectors
num_epochs = 1
mode = 'lstm'

model = AuthorClassifier(mode, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = model.to(device)


loss_function = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
log_dir = 'runs/lstm1'
final_acc_train_lstm  = []
final_loss_train_lstm = []
final_loss_test_lstm = []
final_acc_test_lstm = []
final_f1score_train_lstm = []
final_f1score_test_lstm = []


for multi in range(20):
  loss_train,f1score,accs = train_classifier(model, train_iterator, loss_function, optimizer, log = log_dir, num_epochs = num_epochs)
  overall_pred,overall_label,accs_test,f1ss,loss_test = evaluate_classifier(model, test_iterator, loss_function)
  final_acc_train_lstm.append(accs[0])
  final_acc_test_lstm.append(accs_test[0])
  final_f1score_train_lstm.append(f1score[0])
  final_f1score_test_lstm.append(f1ss)
  final_loss_train_lstm.append(loss_train[0])
  final_loss_test_lstm.append(loss_test[0])


---Training statistics--- Epoch: 1 Acc: 0.15893181228934405 Loss: 0.034079162775311166 F1 Score: 0.032956453969076836
Validation statistics: Acc: 0.14186327888687236 Loss: 0.03475298195698733 F1 Score: 0.0248476821192053
---Training statistics--- Epoch: 1 Acc: 0.1590614467202489 Loss: 0.032712576084567194 F1 Score: 0.03300337510190789
Validation statistics: Acc: 0.1512401693889897 Loss: 0.03236046327076316 F1 Score: 0.026274303730951125
---Training statistics--- Epoch: 1 Acc: 0.16385792066372828 Loss: 0.03267246260678858 F1 Score: 0.03402608766803963
Validation statistics: Acc: 0.1572897761645493 Loss: 0.0326228265825792 F1 Score: 0.027182435964453737
---Training statistics--- Epoch: 1 Acc: 0.16191340420015557 Loss: 0.03243865780117044 F1 Score: 0.03391773680061555
Validation statistics: Acc: 0.1793708408953418 Loss: 0.03219214476604427 F1 Score: 0.03041805591177225
---Training statistics--- Epoch: 1 Acc: 0.16398755509463314 Loss: 0.03240022489002773 F1 Score: 0.035623429564714176
Vali

In [37]:
mode = 'gru'

model = AuthorClassifier(mode, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = model.to(device)

loss_function = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
log_dir = 'runs/gru'
final_acc_train_gru  = []
final_acc_test_gru = []
final_f1score_train_gru = []
final_f1score_test_gru = []
final_loss_train_gru = []
final_loss_test_gru = []

for multi in range(20):
  loss_train,f1score,accs = train_classifier(model, train_iterator, loss_function, optimizer, log = log_dir, num_epochs = num_epochs)
  overall_pred,overall_label,accs_test,f1ss,loss_test = evaluate_classifier(model, test_iterator, loss_function)
  final_acc_train_gru.append(accs[0])
  final_loss_train_gru.append(loss_train[0])
  final_acc_test_gru.append(accs_test[0])
  final_f1score_train_gru.append(f1score[0])
  final_f1score_test_gru.append(f1ss)
  final_loss_test_gru.append(loss_test[0])


---Training statistics--- Epoch: 1 Acc: 0.19834067928441795 Loss: 0.03346436781558095 F1 Score: 0.08133171252159806
Validation statistics: Acc: 0.3617664851784634 Loss: 0.028223931753339728 F1 Score: 0.20920330016592964
---Training statistics--- Epoch: 1 Acc: 0.4401088929219601 Loss: 0.02469852037432282 F1 Score: 0.29595910484378296
Validation statistics: Acc: 0.5520266182698125 Loss: 0.020707394055575944 F1 Score: 0.34977897151731774
---Training statistics--- Epoch: 1 Acc: 0.574410163339383 Loss: 0.018779635533694317 F1 Score: 0.41614701406587606
Validation statistics: Acc: 0.6240169388989716 Loss: 0.016168103956195418 F1 Score: 0.4710765808515732
---Training statistics--- Epoch: 1 Acc: 0.6586725434275343 Loss: 0.014667016806335439 F1 Score: 0.5208693706743649
Validation statistics: Acc: 0.677858439201452 Loss: 0.01351010215260805 F1 Score: 0.5344308371579829
---Training statistics--- Epoch: 1 Acc: 0.6966554316826549 Loss: 0.01326124872244452 F1 Score: 0.5649004323614463
Validation st

In [38]:
mode = 'bilstm'

model = AuthorClassifier(mode, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = model.to(device)


loss_function = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
log_dir = 'runs/bilstm'
final_acc_train_bilstm  = []
final_acc_test_bilstm = []
final_f1score_train_bilstm = []
final_f1score_test_bilstm = []
final_loss_train_bilstm = []
final_loss_test_bilstm = []

for multi in range(20):
  loss_train,f1score,accs = train_classifier(model, train_iterator, loss_function, optimizer, log = log_dir, num_epochs = num_epochs)
  overall_pred,overall_label,accs_test,f1ss,loss_test = evaluate_classifier(model, test_iterator, loss_function)
  final_acc_train_bilstm.append(accs[0])
  final_loss_train_bilstm.append(loss_train[0])
  final_acc_test_bilstm.append(accs_test[0])
  final_f1score_train_bilstm.append(f1score[0])
  final_f1score_test_bilstm.append(f1ss)
  final_loss_test_bilstm.append(loss_test[0])


---Training statistics--- Epoch: 1 Acc: 0.15011667098781437 Loss: 0.03356402486539548 F1 Score: 0.031618358187567766
Validation statistics: Acc: 0.1793708408953418 Loss: 0.032409961520724055 F1 Score: 0.03041805591177225
---Training statistics--- Epoch: 1 Acc: 0.15659839253305677 Loss: 0.03236506494920675 F1 Score: 0.03498657641136685
Validation statistics: Acc: 0.22171808832425893 Loss: 0.03194044167102645 F1 Score: 0.09580730757278753
---Training statistics--- Epoch: 1 Acc: 0.2958257713248639 Loss: 0.028253316353849096 F1 Score: 0.1738965373046768
Validation statistics: Acc: 0.39473684210526316 Loss: 0.024455179320056728 F1 Score: 0.2377467019401684
---Training statistics--- Epoch: 1 Acc: 0.5035001296344309 Loss: 0.021311712768802935 F1 Score: 0.36267771339575366
Validation statistics: Acc: 0.631578947368421 Loss: 0.01676143713958034 F1 Score: 0.44256198935752344
---Training statistics--- Epoch: 1 Acc: 0.6585429089966295 Loss: 0.014890254629471156 F1 Score: 0.5171177174097854
Validat

In [39]:
fig_accuracy = go.Figure()

fig_accuracy.add_trace(go.Scatter(
    y=final_acc_train_lstm,
    connectgaps=True, marker_color='rgba(128, 0, 0, 0.9)', name = 'Training accuracy lstm'))

fig_accuracy.add_trace(go.Scatter(
    y=final_acc_test_lstm,
    connectgaps=True, marker_color='rgba(255, 0, 0, 0.9)', name = 'Testing accuracy lstm'))

fig_accuracy.add_trace(go.Scatter(
    y=final_acc_train_gru,
    connectgaps=True, marker_color='rgba(0, 128, 0, 0.9)', name = 'Training accuracy gru'))
fig_accuracy.add_trace(go.Scatter(
    y=final_acc_test_gru,
    connectgaps=True, marker_color = 'rgba(0, 255, 0, 0.9)', name = 'Testing accuracy gru'))

fig_accuracy.add_trace(go.Scatter(
    y=final_acc_train_bilstm,
    connectgaps=True, marker_color='rgba(0, 0, 128, 0.9)', name = 'Training accuracy bilstm'))
fig_accuracy.add_trace(go.Scatter(
    y=final_acc_test_bilstm, connectgaps=True, marker_color='rgba(0, 0, 255, 0.9)',
    name='Test accuracy bilstm'))

fig_accuracy.show()

fig_loss = go.Figure()

fig_loss.add_trace(go.Scatter(
    y=final_loss_train_lstm,
    connectgaps=True, marker_color='rgba(128, 0, 0, 0.9)', name = 'Training Loss lstm'))

fig_loss.add_trace(go.Scatter(
    y=final_loss_test_lstm,
    connectgaps=True, marker_color='rgba(255, 0, 0, 0.9)', name = 'Testing Loss lstm'))

fig_loss.add_trace(go.Scatter(
    y=final_loss_train_gru,
    connectgaps=True, marker_color='rgba(0, 128, 0, 0.9)', name = 'Training Loss gru'))

fig_loss.add_trace(go.Scatter(
    y=final_loss_test_gru,
    connectgaps=True, marker_color='rgba(0, 255, 0, 0.9)', name = 'Testing Loss gru'))

fig_loss.add_trace(go.Scatter(
    y=final_loss_train_bilstm,
    connectgaps=True, marker_color='rgba(0, 0, 128, 0.9)', name = 'Training Loss bilstm'))

fig_loss.add_trace(go.Scatter(
    y=final_loss_test_bilstm,
    connectgaps=True, marker_color='rgba(0, 0, 255, 0.9)', name = 'Testing Loss bilstm'))

fig_loss.show()

fig_f1score = go.Figure()

fig_f1score.add_trace(go.Scatter(
    y=final_f1score_train_lstm,
    connectgaps=True, marker_color='rgba(128, 0, 0, 0.9)', name = 'Training f1score lstm'))

fig_f1score.add_trace(go.Scatter(
    y=final_f1score_test_lstm,
    connectgaps=True, marker_color='rgba(255, 0, 0, 0.9)', name = 'Testing f1score lstm'))

fig_f1score.add_trace(go.Scatter(
    y=final_f1score_train_gru,
    connectgaps=True, marker_color='rgba(0, 128, 0, 0.9)', name = 'Training f1score gru'))
fig_f1score.add_trace(go.Scatter(
    y=final_f1score_test_gru,
    connectgaps=True, marker_color = 'rgba(0, 255, 0, 0.9)', name = 'Testing f1score gru'))

fig_f1score.add_trace(go.Scatter(
    y=final_f1score_train_bilstm,
    connectgaps=True, marker_color='rgba(0, 0, 128, 0.9)', name = 'Training f1score bilstm'))
fig_f1score.add_trace(go.Scatter(
    y=final_f1score_test_bilstm, connectgaps=True, marker_color='rgba(0, 0, 255, 0.9)',
    name='Test f1score bilstm'))

fig_f1score.show()