In [1]:
import re
import pickle
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import spacy
from collections import Counter
from copy import deepcopy
import math

import torchtext.vocab

import string
import re

import nltk
from nltk.corpus import stopwords

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# the paths; change when necessary
TRAIN_RAW = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_train.tsv"
TEST_RAW = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_test.tsv"
TRAIN = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_train_cleaned.tsv"
TEST = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/lcp_single_test_cleaned.tsv"

In [4]:
# read the datasets
# train
with open(TRAIN_RAW, 'r') as f:
  data = f.read()

# need to remove " from the string, otherwise parsing will have problems because some quotas are not closed 
data = data.replace('"', '')

with open(TRAIN, 'w') as f:
  f.write(data)

df = pd.read_csv(TRAIN, sep='\t')

# test
with open(TEST_RAW, 'r') as f:
  data = f.read()

data = data.replace('"', '')

with open(TEST, 'w') as f:
  f.write(data)

test = pd.read_csv(TEST, sep='\t')

In [5]:
# take a look
pd.set_option('display.max_colwidth', None) # show the whole sentence
df.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass.",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,"I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book.",brothers,0.0
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,"The man, the lord of the land, said to us, 'By this I will know that you are honest men: leave one of your brothers with me, and take grain for the famine of your houses, and go your way.",brothers,0.05
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,"Shimei had sixteen sons and six daughters; but his brothers didn't have many children, neither did all their family multiply like the children of Judah.",brothers,0.15
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,He has put my brothers far from me.,brothers,0.263889


In [6]:
test.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3K8CQCU3KE19US5SN890DFPK3SANWR,bible,"But he, beckoning to them with his hand to be silent, declared to them how the Lord had brought him out of the prison.",hand,0.0
1,3Q2T3FD0ON86LCI41NJYV3PN0BW3MV,bible,"If I forget you, Jerusalem, let my right hand forget its skill.",hand,0.197368
2,3ULIZ0H1VA5C32JJMKOTQ8Z4GUS51B,bible,"the ten sons of Haman the son of Hammedatha, the Jew's enemy, but they didn't lay their hand on the plunder.",hand,0.2
3,3BFF0DJK8XCEIOT30ZLBPPSRMZQTSD,bible,"Let your hand be lifted up above your adversaries, and let all of your enemies be cut off.",hand,0.267857
4,3QREJ3J433XSBS8QMHAICCR0BQ1LKR,bible,"Abimelech chased him, and he fled before him, and many fell wounded, even to the entrance of the gate.",entrance,0.0


Try linear regression first.

In [7]:
def create_weights_matrix(vocab, dimension=100):
  """ create a matrix containing vectors for each word in Glove """
  matrix_len = len(vocab)
  weights_matrix = np.zeros((matrix_len, dimension))

  for i, word in enumerate(vocab):
      try: 
          weights_matrix[i] = glove[word]
      except KeyError:
          weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, )) # initialize a random vector
  #return torch.from_numpy(weights_matrix) # a tensor
  return weights_matrix

In [8]:
# use the Glove 6B 100d
cache_dir = "/content/gdrive/My Drive/Colab Notebooks/data"
# glove = vocab.pretrained_aliases["glove.6B.100d"](cache=cache_dir)
glove = torchtext.vocab.GloVe(name='6B', dim=100, cache=cache_dir)

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# get all the non-unique tokens for prediction
tokens = df['token'].dropna().to_list()
tokens = [token.lower() for token in tokens] # lowercase
print(len(tokens))

# check if all tokens are in Glove
for token in tokens:
  if token not in glove.stoi:
    print("Token Not Found: ")
    print(token)

7659
Token Not Found: 
perverseness
Token Not Found: 
perverseness
Token Not Found: 
perverseness
Token Not Found: 
housetops
Token Not Found: 
slanderers
Token Not Found: 
plowmen
Token Not Found: 
dainties
Token Not Found: 
dainties
Token Not Found: 
dainties
Token Not Found: 
dainties
Token Not Found: 
dainties
Token Not Found: 
dunghill
Token Not Found: 
carotids
Token Not Found: 
tace


In [11]:
# create a dataframe for linear regression
train_df = pd.DataFrame(tokens, columns =['token'])

# add back complexity
train_df['complexity'] = df['complexity']

# word length
train_df['word_length'] = train_df['token'].map(lambda x: len(x))

# punctuations
punc = string.punctuation

# stop words
stop_words = set(stopwords.words('english')) 

# word frequency
# tokenize the whole curpus
temp = df['sentence'].to_list()
texts = []
for sent in temp:
  sent = sent.lower()
  sent = ''.join([c for c in sent if c not in punc])
  words = [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
  texts += words
# count frequency
count = Counter(texts)
train_df['word_frequency'] = train_df['token'].map(lambda x: count[x])

train_df.head()

Unnamed: 0,token,complexity,word_length,word_frequency
0,river,0.0,5,26
1,brothers,0.0,8,36
2,brothers,0.05,8,36
3,brothers,0.15,8,36
4,brothers,0.263889,8,36


In [12]:
# create the weight matrix
weight_matrix = create_weights_matrix(tokens)
print(weight_matrix.shape)

# combine
weight_matrix_df = pd.DataFrame(weight_matrix)

train_df_combined = pd.concat([train_df, weight_matrix_df], axis=1)
train_df_combined.head()

(7659, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,river,0.0,5,26,-0.33249,-0.56631,0.54255,-0.11869,0.53129,-0.49381,0.64114,0.85982,0.39633,-1.5395,-0.30613,0.97267,-0.31192,-0.10311,0.35951,-0.60023,0.90983,-0.95954,-0.55375,0.082818,0.26711,0.64645,-0.098556,0.53924,-0.2181,-0.1343,-1.807,-0.14879,0.39006,-0.62883,-0.38825,0.31925,0.77853,-0.60273,0.063585,-0.75916,...,-0.53185,0.72585,0.36811,0.19494,0.64276,0.8146,0.26748,-0.39275,0.42595,0.11699,0.21063,-0.061747,0.79298,-0.45978,0.85176,-0.36726,0.11816,0.50416,-0.065352,0.69672,0.37525,0.92586,-0.83036,-0.087948,-0.49715,0.21411,-0.82838,-0.85912,0.61576,1.188,-0.30745,-1.2009,-1.7097,0.514,-1.0159,0.55555,-1.0385,-0.6994,1.0506,0.24051
1,brothers,0.0,8,36,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
2,brothers,0.05,8,36,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
3,brothers,0.15,8,36,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
4,brothers,0.263889,8,36,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982


In [13]:
# get data for training
X_train = train_df_combined.drop(columns=['token', 'complexity'])
Y_train = train_df_combined['complexity']

In [14]:
# train linear regression
lr = LinearRegression().fit(X_train, Y_train)

In [15]:
# predict
Y_pred = lr.predict(X_train)

In [16]:
# train loss (average absolute loss)
num = len(Y_pred)
losses = []
for i in range(num):
  loss = abs(Y_pred[i] - Y_train[i])
  losses.append(loss)
abl = sum(losses) / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.07246931733686796


In [17]:
# on test
test_tokens = test['token'].dropna().to_list()
test_tokens = [token.lower() for token in test_tokens] # lowercase
print(len(test_tokens))

# create a dataframe for linear regression
test_df = pd.DataFrame(test_tokens, columns =['token'])

# add back complexity
test_df['complexity'] = test['complexity']

# word length
test_df['word_length'] = test_df['token'].map(lambda x: len(x))

# word frequency
# tokenize the whole curpus
temp = test['sentence'].to_list()
texts = []
for sent in temp:
  sent = sent.lower()
  sent = ''.join([c for c in sent if c not in punc])
  words = [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
  texts += words
# count frequency
count = Counter(texts)
test_df['word_frequency'] = test_df['token'].map(lambda x: count[x])

# create the weight matrix
weight_matrix = create_weights_matrix(test_tokens)
print(weight_matrix.shape)

# combine
weight_matrix_df = pd.DataFrame(weight_matrix)
test_df_combined = pd.concat([test_df, weight_matrix_df], axis=1)
test_df_combined.head()

917
(917, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,hand,0.0,4,19,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
1,hand,0.197368,4,19,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
2,hand,0.2,4,19,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
3,hand,0.267857,4,19,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
4,entrance,0.0,8,2,0.25776,0.1068,-0.16265,0.42335,0.19078,0.46283,-0.95915,0.93174,0.47161,0.39077,0.54734,0.41967,0.086822,0.53954,0.35497,-0.028346,0.42708,0.036569,-0.497,-0.49543,-0.031232,-0.30298,-0.41718,-0.78459,0.70473,-0.59741,-0.33173,-0.38813,0.17189,-0.78565,-0.17219,-0.14019,0.61492,0.5713,0.75109,-0.015942,...,-0.60393,0.47454,0.80912,0.81709,-0.12876,-0.3931,0.17656,-0.29797,-0.32614,-0.26522,-0.37006,-0.016956,0.92268,-0.71606,-0.38524,-0.085737,0.68111,0.3208,0.4587,-0.82737,0.22932,0.3145,-0.21221,-0.65293,-0.31427,-0.037493,0.16126,-0.46719,0.63066,0.26426,0.52778,-0.34505,0.0662,0.7224,-0.11057,-0.005771,-0.059336,0.013272,0.97305,0.45405


In [18]:
# get data for test
X_test = test_df_combined.drop(columns=['token', 'complexity'])
Y_test = test_df_combined['complexity']

# predict
Y_pred = lr.predict(X_test)

# test loss (average absolute loss)
num = len(Y_pred)
losses = []
for i in range(num):
  loss = abs(Y_pred[i] - Y_test[i])
  losses.append(loss)
abl = sum(losses) / num
print("average test absolute loss is " + str(abl))

average test absolute loss is 0.07283375821746224


LSTM

In [19]:
# tokenize sentences

def tokenize(sent, token, punc, stop_words):
  """ lowercase, padded, remove stopwords and punctuations """
  # lowercase
  sent = sent.lower()
  # remove punctuation and stopwords
  sent = ''.join([c for c in sent if c not in punc]) 
  tokens = [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
  # pad
  tokens.insert(0, '<s>')
  tokens.append('</s>')
  # pad the token with special symbols
  for i in range(len(tokens)):
    if tokens[i] == token:
      tokens.insert(i, '_START')
      tokens.insert(i+2, '_END')
      break

  return tokens

def preprocess(df):
  data = df[['sentence', 'token', 'complexity']]
  data['tokenized_sentence'] = data.apply(lambda row: tokenize(row['sentence'], row['token'], punc, stop_words), axis=1)
  data = data.drop(columns=['sentence'])
  return data

train_data = preprocess(df)
test_data = preprocess(test)

train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,token,complexity,tokenized_sentence
0,river,0.0,"[<s>, behold, came, _START, river, _END, seven, cattle, sleek, fat, fed, marsh, grass, </s>]"
1,brothers,0.0,"[<s>, fellow, bondservant, _START, brothers, _END, prophets, keep, words, book, </s>]"
2,brothers,0.05,"[<s>, man, lord, land, said, us, know, honest, men, leave, one, _START, brothers, _END, take, grain, famine, houses, go, way, </s>]"
3,brothers,0.15,"[<s>, shimei, sixteen, sons, six, daughters, _START, brothers, _END, didnt, many, children, neither, family, multiply, like, children, judah, </s>]"
4,brothers,0.263889,"[<s>, put, _START, brothers, _END, far, </s>]"


In [20]:
# convert words to index for training and testing purpose
sentences = train_data['tokenized_sentence'].to_list()
temp = []
for sent in sentences:
  temp += sent
temp = set(temp)
# for words that are unknown
temp.add('_UNKNOWN') 
temp.add('_PADDING')
print(len(temp))

# need to pad sentences to the same length
lengths = [len(sent) for sent in sentences]
pad_length = max(lengths)
print(pad_length)

# construct dictionaries
word2index = {}
index2word = {}
for i, word in enumerate(temp):
  word2index[word] = i
  index2word[i] = word

def word_to_index(sentence):
  # sentence: a list of strings
  r = []
  for word in sentence:
    if word in word2index:
      r.append(word2index[word])
    else:
      r.append(word2index['_UNKNOWN'])
  diff = pad_length - len(sentence)
  pad_index = word2index['_PADDING']
  for i in range(diff):
    r.append(pad_index)
  return r

train_data['number_sentence'] = train_data['tokenized_sentence'].map(lambda sent: word_to_index(sent))
test_data['number_sentence'] = test_data['tokenized_sentence'].map(lambda sent: word_to_index(sent))

train_data = train_data.drop(columns=['tokenized_sentence'])
test_data = test_data.drop(columns=['tokenized_sentence'])

train_data.head()

14826
118


Unnamed: 0,token,complexity,number_sentence
0,river,0.0,"[13409, 4292, 10528, 10347, 6537, 9638, 712, 9660, 13303, 10478, 5150, 6242, 2529, 2644, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, ...]"
1,brothers,0.0,"[13409, 13627, 10664, 10347, 13026, 9638, 7290, 11929, 10967, 10746, 2644, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, ...]"
2,brothers,0.05,"[13409, 1167, 12437, 4974, 5651, 1133, 13173, 3449, 11862, 3240, 12701, 10347, 13026, 9638, 14735, 9937, 6319, 12505, 2733, 12430, 2644, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, ...]"
3,brothers,0.15,"[13409, 11540, 197, 6182, 3090, 11841, 10347, 13026, 9638, 10114, 7761, 2755, 5011, 984, 12263, 3339, 2755, 5205, 2644, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, ...]"
4,brothers,0.263889,"[13409, 3947, 10347, 13026, 9638, 11103, 2644, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, 7815, ...]"


In [21]:
# do a simple check
print(df.shape)
print(train_data.shape)
print(test_data.shape)
print(len(word2index.keys()))
print(len(index2word.keys()))

(7662, 5)
(7662, 3)
(917, 3)
14826
14826


In [22]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [23]:
train_sentences = np.array(train_data['number_sentence'].to_list())
train_labels = np.array(train_data['complexity'].to_list())
test_sentences = np.array(test_data['number_sentence'].to_list())
test_labels = np.array(test_data['complexity'].to_list())

training = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
testing = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

batch_size = 128

train_loader = DataLoader(training, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(testing, shuffle=True, batch_size=batch_size, drop_last=True)

In [24]:
# the LSTM class
class ComplexityNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(ComplexityNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        #self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x, hidden):
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)       
        out = self.fc(lstm_out[:, -1, :])      
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [25]:
# some parameters
vocab_size = len(word2index) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = ComplexityNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)
print(model)

lr=0.01
criterion = nn.L1Loss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 100
counter = 0
print_every = 256
clip = 5

ComplexityNet(
  (embedding): Embedding(14827, 400)
  (lstm): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=1, bias=True)
)


In [26]:
# training
model.train()
for i in range(epochs):
  h = model.init_hidden(batch_size)
    
  for inputs, labels in train_loader:
    counter += 1
    h = tuple([e.data for e in h])
    inputs, labels = inputs.to(device), labels.to(device)
    model.zero_grad()
    output, h = model(inputs, h)
    # cross entropy for multiple classes
    # output is of shape 64 * 5 while labels is of shape 64
    loss = criterion(output, labels) 
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    if counter%print_every == 0:
      print("Epoch: {}/{}...".format(i+1, epochs), "Step: {}...".format(counter), "Loss: {:.6f}...".format(loss.item()))

  return F.l1_loss(input, target, reduction=self.reduction)


Epoch: 5/100... Step: 256... Loss: 0.111072...
Epoch: 9/100... Step: 512... Loss: 0.120014...
Epoch: 14/100... Step: 768... Loss: 0.128292...
Epoch: 18/100... Step: 1024... Loss: 0.107732...
Epoch: 22/100... Step: 1280... Loss: 0.089548...
Epoch: 27/100... Step: 1536... Loss: 0.116203...
Epoch: 31/100... Step: 1792... Loss: 0.117202...
Epoch: 35/100... Step: 2048... Loss: 0.109630...
Epoch: 40/100... Step: 2304... Loss: 0.106438...
Epoch: 44/100... Step: 2560... Loss: 0.093245...
Epoch: 48/100... Step: 2816... Loss: 0.086738...
Epoch: 53/100... Step: 3072... Loss: 0.097762...
Epoch: 57/100... Step: 3328... Loss: 0.101385...
Epoch: 61/100... Step: 3584... Loss: 0.105967...
Epoch: 66/100... Step: 3840... Loss: 0.102142...
Epoch: 70/100... Step: 4096... Loss: 0.103135...
Epoch: 74/100... Step: 4352... Loss: 0.091270...
Epoch: 79/100... Step: 4608... Loss: 0.107417...
Epoch: 83/100... Step: 4864... Loss: 0.109148...
Epoch: 87/100... Step: 5120... Loss: 0.104011...
Epoch: 92/100... Step: 53

In [27]:
# save the model
torch.save(model.state_dict(), "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/LSTM_0.pt")

In [28]:
test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)

model.eval()
for inputs, labels in test_loader:
    
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    fetched_labels = labels.data.tolist()
    fetched_output = output.squeeze().data.tolist()
    #print(len(fetched_labels)) # 32
    #print(len(fetched_output[0])) # 5

    # absolute mean average; different from the cross entropy used in training
    denominator = len(output)
    for i in range(denominator):
      diff = abs( fetched_labels[i] - fetched_output[i] )
      if diff <= 0.05: # a difference that is small
        num_correct += 1
      test_losses.append(diff)

print(test_losses)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

[0.12320338884989424, 0.14716172218322754, 0.007352166705661356, 0.005368039721534351, 0.024393865040370444, 0.02471327781677246, 0.06903672218322754, 0.05065436924205102, 0.006536722183227539, 0.11846327781677246, 0.017900358546863893, 0.08466172218322754, 0.11764783329433876, 0.03431449996100533, 0.07158827781677246, 0.14542561107211632, 0.012694047047541651, 0.06209227773878312, 0.16903672218322752, 0.039870055516560865, 0.12418378100675698, 0.03778672218322754, 0.06903672218322754, 0.009088277816772461, 0.19346327781677247, 0.08153672218322755, 0.5343144999610052, 0.06699268958147836, 0.07598116662767196, 0.01132042067391531, 0.006536722183227539, 0.018463277816772455, 0.13153672218322754, 0.2696946169200697, 0.039870055516560865, 0.04601040639375387, 0.02216172218322754, 0.10880944945595483, 0.04601040639375387, 0.1388896633596981, 0.5065367221832275, 0.06012994448343914, 0.07598116662767196, 0.28987005551656086, 0.009088277816772461, 0.09477201630087462, 0.05228680722853718, 0.04

LSTM with GLove

In [29]:
# restart the loader
train_sentences = np.array(train_data['number_sentence'].to_list())
train_labels = np.array(train_data['complexity'].to_list())
test_sentences = np.array(test_data['number_sentence'].to_list())
test_labels = np.array(test_data['complexity'].to_list())

training = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
testing = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

batch_size = 128

train_loader = DataLoader(training, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(testing, shuffle=True, batch_size=batch_size, drop_last=True)

In [30]:
def create_weights_matrix_tensor(target_vocab, dimension=100):
  """ create a matrix containing vectors for each word in glove """
  matrix_len = len(target_vocab)
  weights_matrix = np.zeros((matrix_len, dimension))

  for i, word in enumerate(target_vocab):
      try: 
          weights_matrix[i] = glove[word]
      except KeyError:
          weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, )) # initialize a random vector
  return torch.from_numpy(weights_matrix) # must be a tensor!!

def create_emb_layer(weights_matrix, non_trainable=False):
  """ an embedding layer """
  num_embeddings, embedding_dim = weights_matrix.size()
  emb_layer = nn.Embedding(num_embeddings, embedding_dim)
  emb_layer.load_state_dict({'weight': weights_matrix})
  if non_trainable:
    emb_layer.weight.requires_grad = False

  return emb_layer, num_embeddings, embedding_dim

In [31]:
# The model; inherits from the previous model
class ComplexityNetGlove(ComplexityNet):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, weights_matrix, drop_prob=0.5):
        super(ComplexityNetGlove, self).__init__(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True) # use the Glove
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        #self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_size)

In [32]:
vocab_size = len(word2index) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2
weights_matrix = create_weights_matrix_tensor(word2index.keys())

model = ComplexityNetGlove(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, weights_matrix)
model.to(device)
print(model)

lr=0.01
criterion = nn.L1Loss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 100
counter = 0
print_every = 256
clip = 5

ComplexityNetGlove(
  (embedding): Embedding(14826, 100)
  (lstm): LSTM(100, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=1, bias=True)
)


In [33]:
# training
model.train()
for i in range(epochs):
  h = model.init_hidden(batch_size)
    
  for inputs, labels in train_loader:
    counter += 1
    h = tuple([e.data for e in h])
    inputs, labels = inputs.to(device), labels.to(device)
    model.zero_grad()
    output, h = model(inputs, h)
    # cross entropy for multiple classes
    # output is of shape 64 * 5 while labels is of shape 64
    loss = criterion(output, labels) 
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    if counter%print_every == 0:
      print("Epoch: {}/{}...".format(i+1, epochs), "Step: {}...".format(counter), "Loss: {:.6f}...".format(loss.item()))

  return F.l1_loss(input, target, reduction=self.reduction)


Epoch: 5/100... Step: 256... Loss: 0.093580...
Epoch: 9/100... Step: 512... Loss: 0.119150...
Epoch: 14/100... Step: 768... Loss: 0.101767...
Epoch: 18/100... Step: 1024... Loss: 0.102664...
Epoch: 22/100... Step: 1280... Loss: 0.116188...
Epoch: 27/100... Step: 1536... Loss: 0.102156...
Epoch: 31/100... Step: 1792... Loss: 0.106548...
Epoch: 35/100... Step: 2048... Loss: 0.103106...
Epoch: 40/100... Step: 2304... Loss: 0.111678...
Epoch: 44/100... Step: 2560... Loss: 0.109645...
Epoch: 48/100... Step: 2816... Loss: 0.113371...
Epoch: 53/100... Step: 3072... Loss: 0.108595...
Epoch: 57/100... Step: 3328... Loss: 0.098336...
Epoch: 61/100... Step: 3584... Loss: 0.109610...
Epoch: 66/100... Step: 3840... Loss: 0.112114...
Epoch: 70/100... Step: 4096... Loss: 0.118717...
Epoch: 74/100... Step: 4352... Loss: 0.097163...
Epoch: 79/100... Step: 4608... Loss: 0.098568...
Epoch: 83/100... Step: 4864... Loss: 0.099111...
Epoch: 87/100... Step: 5120... Loss: 0.103717...
Epoch: 92/100... Step: 53

In [34]:
# save the model
torch.save(model.state_dict(), "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/LSTM_Glove.pt")

In [35]:
test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)

model.eval()
for inputs, labels in test_loader:
    
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    fetched_labels = labels.data.tolist()
    fetched_output = output.squeeze().data.tolist()
    #print(len(fetched_labels)) # 32
    #print(len(fetched_output[0])) # 5

    # absolute mean average
    denominator = len(output)
    for i in range(denominator):
      diff = abs( fetched_labels[i] - fetched_output[i] )
      if diff <= 0.05: # a difference that is small
        num_correct += 1
      test_losses.append(diff)

print(test_losses)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

[0.33405596017837524, 0.040944039821624756, 0.04729125429602232, 0.1438852758968578, 0.14655590057373047, 0.07219406962394714, 0.04094401001930237, 0.07219409942626953, 0.054101934558466847, 0.05761070648829145, 0.05280590057373047, 0.05761070648829145, 0.0052297541073390574, 0.00761070648829143, 0.0014703556110984284, 0.09094403982162474, 0.04094401001930237, 0.029039277916862893, 0.08506168688044827, 0.14094403982162473, 0.11906903982162476, 0.13469403982162476, 0.05905598998069761, 0.08405596017837524, 0.08506174648509304, 0.07572256724039717, 0.0007226268450418738, 0.1812780929936303, 0.09063490754679626, 0.07670292959493752, 0.1355265484136694, 0.03405590057373048, 0.24094403982162477, 0.05483292871051365, 0.08856314704531712, 0.07016710109180885, 0.06619881732123234, 0.00761070648829143, 0.07670301900190468, 0.034055960178375255, 0.06132862784645776, 0.16218090057373047, 0.0031735178302315648, 0.040944069623947144, 0.10344401001930237, 0.03718098998069763, 0.04094409942626953, 0.