# 1. Setup

## 1.1 Setup working environment

In [1]:
#to use elmo
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import tensorflow as tf
import torch
print(tf.__version__)

1.15.2


In [3]:
# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [4]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 17.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 60.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 54.7MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


## 1.2 Clean raw data

In [6]:
import pandas as pd

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
TRAIN_RAW = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/train/lcp_single_train.tsv"
TEST_RAW = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/test-labels/lcp_single_test.tsv"
TRAIN = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/train/lcp_single_train_cleaned.tsv"
TEST = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/test-labels/lcp_single_test_cleaned.tsv"

In [9]:
# read the datasets
# train
with open(TRAIN_RAW, 'r') as f:
  data = f.read()

# need to remove " from the string, otherwise parsing will have problems because some quotas are not closed 
data = data.replace('"', '')

with open(TRAIN, 'w') as f:
  f.write(data)

df = pd.read_csv(TRAIN, sep='\t')

# test
with open(TEST_RAW, 'r') as f:
  data = f.read()

data = data.replace('"', '')

with open(TEST, 'w') as f:
  f.write(data)

test = pd.read_csv(TEST, sep='\t')

In [10]:
# take a look
pd.set_option('display.max_colwidth', None) # show the whole sentence
df.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass.",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,"I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book.",brothers,0.0
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,"The man, the lord of the land, said to us, 'By this I will know that you are honest men: leave one of your brothers with me, and take grain for the famine of your houses, and go your way.",brothers,0.05
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,"Shimei had sixteen sons and six daughters; but his brothers didn't have many children, neither did all their family multiply like the children of Judah.",brothers,0.15
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,He has put my brothers far from me.,brothers,0.263889


In [11]:
test.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3K8CQCU3KE19US5SN890DFPK3SANWR,bible,"But he, beckoning to them with his hand to be silent, declared to them how the Lord had brought him out of the prison.",hand,0.0
1,3Q2T3FD0ON86LCI41NJYV3PN0BW3MV,bible,"If I forget you, Jerusalem, let my right hand forget its skill.",hand,0.197368
2,3ULIZ0H1VA5C32JJMKOTQ8Z4GUS51B,bible,"the ten sons of Haman the son of Hammedatha, the Jew's enemy, but they didn't lay their hand on the plunder.",hand,0.2
3,3BFF0DJK8XCEIOT30ZLBPPSRMZQTSD,bible,"Let your hand be lifted up above your adversaries, and let all of your enemies be cut off.",hand,0.267857
4,3QREJ3J433XSBS8QMHAICCR0BQ1LKR,bible,"Abimelech chased him, and he fled before him, and many fell wounded, even to the entrance of the gate.",entrance,0.0


## 1.3 Preprocess cleaned data

In [12]:
import torchtext.vocab

In [13]:
# use the Glove 6B 100d
cache_dir = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/data"
# glove = vocab.pretrained_aliases["glove.6B.100d"](cache=cache_dir)
glove = torchtext.vocab.GloVe(name='6B', dim=100, cache=cache_dir)

In [14]:
# get all the non-unique tokens for prediction
tokens = df['token'].dropna().to_list()
tokens = [token.lower() for token in tokens] # lowercase
print(len(tokens))

# check if all tokens are in Glove
for token in tokens:
  if token not in glove.stoi:
    print("Token Not Found:", token)

7659
Token Not Found: perverseness
Token Not Found: perverseness
Token Not Found: perverseness
Token Not Found: housetops
Token Not Found: slanderers
Token Not Found: plowmen
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dunghill
Token Not Found: carotids
Token Not Found: tace


### 1.3.1 Create word frequency & word length features for each token

In [15]:
import string
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from collections import Counter


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
# create a dataframe for handcrafted features: word length & word frequency
train_df = pd.DataFrame(tokens, columns =['token'])

# add back complexity
train_df['complexity'] = df['complexity']

# word length
train_df['word_length'] = train_df['token'].map(lambda x: len(x))

# punctuations
punc = string.punctuation

# stop words
stop_words = set(stopwords.words('english')) 

# word frequency
# tokenize the whole curpus
temp = df['sentence'].to_list()
texts = []
for sent in temp:
  sent = sent.lower()
  sent = ''.join([c for c in sent if c not in punc])
  words = [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
  texts += words
# count frequency
count = Counter(texts)
train_df['word_frequency'] = train_df['token'].map(lambda x: count[x])

train_df.head()

Unnamed: 0,token,complexity,word_length,word_frequency
0,river,0.0,5,26
1,brothers,0.0,8,36
2,brothers,0.05,8,36
3,brothers,0.15,8,36
4,brothers,0.263889,8,36


The `texts` variable is created from combining all the tokens from every corpus in the training data, which is what we used to calculate `word_frequency` feature. **However**, the sentence shown in each row is only a part of the whole corpus; therefore, `word_frequency` is not a reliable feature to be used.

In [21]:
len(texts)

107823

### 1.3.2 Create `word_frequency` from real text

#### [Bible WEB](https://github.com/scrollmapper/bible_databases/tree/master/txt/WEB)

In [17]:
biblePath = r"/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/bibleWEB"

In [18]:
import os
from os.path import isfile, join
from os import listdir
bibleList = sorted([f for f in listdir(biblePath) if isfile(join(biblePath, f))])

In [19]:
bibleList[0]

'1 Genesis - World English Bible (WEB).txt'

In [20]:
f = open(os.path.join(biblePath, bibleList[0]), 'r')
text = f.read().lower()

In [21]:
def bible_tokenizer(chapter: string):
  punc = string.punctuation
  stop_words = set(stopwords.words('english')) 
  sents = []
  for s in chapter.lower().split('\n'):
    sent = ''.join([c for c in s if c not in punc])
    words = [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
    sents += words
  return sents

#### Bible word counter

In [22]:
bible_tokens = []
for chapter in bibleList:
  text = open(os.path.join(biblePath, bibleList[0]), 'r')
  tokenized_chapter = bible_tokenizer(text.read())
  bible_tokens += tokenized_chapter

bible_counter = Counter(bible_tokens)

### 1.3.3 ELMo features

In [17]:
import tensorflow_hub as hub
import tensorflow as tf

In [18]:
tf.__version__

'1.15.2'

In [19]:
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [20]:
test_sent = df.head(1).values[0, 2]
test_word = df.head(1).values[0, 3]
print('Sentence:', test_sent, '\n', 'Word:', test_word)

Sentence: Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass. 
 Word: river


In [21]:
test_sent2 = df.head(2).values[1, 2]
test_word2 = df.head(2).values[1, 3]
print('Sentence:', test_sent2, '\n', 'Word:', test_word2)
df.head(2)

Sentence: I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book. 
 Word: brothers


Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass.",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,"I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book.",brothers,0.0


In [77]:
def tokenizer(sent):
  punc = string.punctuation
  s = ''.join([c for c in sent.lower() if c not in punc])
  words = [word for word in s.split(' ') if (word.isalpha() and word not in stop_words)]
  return words

In [156]:
def find_index(s, word):
  word = word.lower()
  if (word in s):
    return s.index(word)

  
  for w in range(len(s)):
    if (word in s[w]):
      return w
  return -1

In [70]:
 def list_index(sent, word):
  punc = string.punctuation
  s = ''.join([c for c in sent.lower() if c not in punc])
  return s.split(' ').index(word.lower())

In [100]:
test_index = list_index(test_sent, test_word)
print(test_index)
print(tokenizer(test_sent))

7
['behold', 'came', 'river', 'seven', 'cattle', 'sleek', 'fat', 'fed', 'marsh', 'grass']


In [126]:
test_index2 = tokenizer(test_sent2).index(test_word2)
print(test_index2)
print(tokenizer(test_sent2))
tokenized_test_sent2 = tokenizer(test_sent2) + [""]*3
print(tokenized_test_sent2)

2
['fellow', 'bondservant', 'brothers', 'prophets', 'keep', 'words', 'book']
['fellow', 'bondservant', 'brothers', 'prophets', 'keep', 'words', 'book', '', '', '']


In [128]:
test_embedding = elmo({"tokens": [tokenizer(test_sent), tokenized_test_sent2], "sequence_len": [10, 7]}, signature='tokens', as_dict=True)['elmo']
print('Run session...')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    test_vectors = sess.run(test_embedding)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...


In [None]:
# def compute_elmo_tokens(tokens, sequence_len):
#   embedding = elmo({"tokens": tokens, "sequence_len": sequence_len}, signature='tokens', as_dict=True)['elmo']
#   print('Run session...')
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       sess.run(tf.tables_initializer())
#       # return average of ELMo features
#       vectors = sess.run(embedding)
#   return vectors

In [24]:
#extract elmo vectors
def elmo_vectors_word(x: string):
  embeddings = elmo([x], signature='default', as_dict=True)["elmo"]

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    return sess.run(embeddings)

In [54]:
test_elmo = elmo_vectors_word(test_sent, 'default')
test_elmo[0, test_index]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [157]:
def clean_for_elmo(sents, words):
  raw_tokens = []
  sequence_len = []
  index_tokens = []
  for i in range(len(sents)):
    tokenized = tokenizer(sents[i]) #return as str[]
    raw_tokens += [tokenized]

    sequence_len.append(len(tokenized))
    index = find_index(tokenized, words[i])
    if (index == -1):
      print('-------------')
      print(i)
      print(sents[i])
      print(tokenized)
      print(words[i])
      print('COULD NOT FIND WORD!!')
    index_tokens.append(index)


  max_len = max(sequence_len)
  tokens = []
  for t in raw_tokens:
    if (len(t) < max_len):
      dif = max_len-len(t)
      tokens += [t + [""]*(dif)]
    else:
      tokens += [t]
  return tokens, sequence_len, index_tokens

In [51]:
bible_df = df[df['corpus'] == 'bible']

In [147]:
bible_df.shape

(2574, 5)

In [160]:
#compute only the first half first for lower computing time
bible_sents = bible_df.iloc[:800]['sentence'].tolist()
bible_words = bible_df.iloc[:800]['token'].tolist()

In [161]:
tokens_elmo, seq_len_elmo, index_token = clean_for_elmo(bible_sents, bible_words)

In [162]:
elmo_vectors_first = compute_elmo_tokens(tokens_elmo, seq_len_elmo)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...


In [267]:
def compute_elmo(batch_size=800):
  elmo_vectors = []
  index_all_token = []
  bible_sents = bible_df.iloc[:batch_size]['sentence'].tolist()
  bible_words = bible_df.iloc[:batch_size]['token'].tolist()
  new_batch_size = batch_size
  while (new_batch_size < bible_df.shape[0]):
    tokens_elmo, seq_len_elmo, index_token = clean_for_elmo(bible_sents, bible_words)
    elmo_vectors_batch = compute_elmo_tokens(tokens_elmo, seq_len_elmo)
    elmo_vectors.append(elmo_vectors_batch)
    index_all_token.append(index_token)
    old_batch_size = new_batch_size
    print(old_batch_size)
    new_batch_size += batch_size
    bible_sents = bible_df.iloc[old_batch_size:new_batch_size]['sentence'].tolist()
    bible_words = bible_df.iloc[old_batch_size:new_batch_size]['token'].tolist()

# disregard the rest 174

  # bible_sents = bible_df.iloc[old_batch_size:]['sentence'].tolist()
  # bible_words = bible_df.iloc[old_batch_size:]['token'].tolist()
  # tokens_elmo, seq_len_elmo, index_token = clean_for_elmo(bible_sents, bible_words)
  # elmo_vectors_batch = compute_elmo_tokens(tokens_elmo, seq_len_elmo)
  # elmo_vectors.append(elmo_vectors_batch)
  # index_all_token.append(index_token)

  return elmo_vectors, index_all_token

In [268]:
elmo_vectors, index_token = compute_elmo()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
800
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1600
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
2400


In [269]:
print(len(elmo_vectors))
print(len(index_token))

3
3


#### ELMO word embedding

In [270]:
elmo_embedded_word = []
for j in range(len(elmo_vectors)):
  for i in range(len(elmo_vectors[j])):
      elmo_embedded_word.append(elmo_vectors_first[i, index_token[j][i]])

#2. Train models

## 2.1 Linear regression with new `word_frequency`

### Bible text

In [None]:
bible_df = df[df['corpus'] == 'bible']
print(bible_df.shape)

(2574, 5)


In [None]:
bible_df.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass.",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,"I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book.",brothers,0.0
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,"The man, the lord of the land, said to us, 'By this I will know that you are honest men: leave one of your brothers with me, and take grain for the famine of your houses, and go your way.",brothers,0.05
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,"Shimei had sixteen sons and six daughters; but his brothers didn't have many children, neither did all their family multiply like the children of Judah.",brothers,0.15
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,He has put my brothers far from me.,brothers,0.263889


In [None]:
# get all the non-unique tokens for prediction
tokens = df[df['corpus'] == 'bible']['token'].dropna().to_list()
tokens = [token.lower() for token in tokens] # lowercase

# create a dataframe for handcrafted features: word length & word frequency
bible_train_df = pd.DataFrame(tokens, columns =['token'])

# add back complexity
bible_train_df['complexity'] = df['complexity']

# word length & word frequency
bible_train_df['word_length'] = bible_train_df['token'].map(lambda x: len(x))
bible_train_df['word_frequency'] = bible_train_df['token'].map(lambda x: bible_counter[x])

In [None]:
bible_train_df.head()

Unnamed: 0,token,complexity,word_length,word_frequency
0,river,0.0,5,1056
1,brothers,0.0,8,5346
2,brothers,0.05,8,5346
3,brothers,0.15,8,5346
4,brothers,0.263889,8,5346


### create weight matrix

In [None]:
import numpy as np

In [None]:
def create_weights_matrix(vocab, dimension=100):
  """ create a matrix containing vectors for each word in Glove """
  matrix_len = len(vocab)
  weights_matrix = np.zeros((matrix_len, dimension))

  for i, word in enumerate(vocab):
      try: 
          weights_matrix[i] = glove[word]
      except KeyError:
          weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, )) # initialize a random vector
  #return torch.from_numpy(weights_matrix) # a tensor
  return weights_matrix

In [None]:
# create the weight matrix
weight_matrix = create_weights_matrix(tokens)
print(weight_matrix.shape)

# combine
weight_matrix_df = pd.DataFrame(weight_matrix)

train_df_combined = pd.concat([bible_train_df, weight_matrix_df], axis=1)
train_df_combined.head()

(2574, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,river,0.0,5,1056,-0.33249,-0.56631,0.54255,-0.11869,0.53129,-0.49381,0.64114,0.85982,0.39633,-1.5395,-0.30613,0.97267,-0.31192,-0.10311,0.35951,-0.60023,0.90983,-0.95954,-0.55375,0.082818,0.26711,0.64645,-0.098556,0.53924,-0.2181,-0.1343,-1.807,-0.14879,0.39006,-0.62883,-0.38825,0.31925,0.77853,-0.60273,0.063585,-0.75916,...,-0.53185,0.72585,0.36811,0.19494,0.64276,0.8146,0.26748,-0.39275,0.42595,0.11699,0.21063,-0.061747,0.79298,-0.45978,0.85176,-0.36726,0.11816,0.50416,-0.065352,0.69672,0.37525,0.92586,-0.83036,-0.087948,-0.49715,0.21411,-0.82838,-0.85912,0.61576,1.188,-0.30745,-1.2009,-1.7097,0.514,-1.0159,0.55555,-1.0385,-0.6994,1.0506,0.24051
1,brothers,0.0,8,5346,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
2,brothers,0.05,8,5346,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
3,brothers,0.15,8,5346,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
4,brothers,0.263889,8,5346,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982


### train model

In [140]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

In [None]:
# get data for training
X_train = train_df_combined.drop(columns=['token', 'complexity'])
Y_train = train_df_combined['complexity']

In [None]:
# train linear regression
lr = LinearRegression().fit(X_train, Y_train)

In [None]:
# predict
Y_pred = lr.predict(X_train)

In [None]:
# train loss (average absolute loss)
num = len(Y_pred)
losses = []
for i in range(num):
  loss = abs(Y_pred[i] - Y_train[i])
  losses.append(loss)
abl = sum(losses) / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.06970450197025566


### test

In [None]:
# on test
test_tokens = test[test['corpus']=='bible']['token'].dropna().to_list()
test_tokens = [token.lower() for token in test_tokens] # lowercase
print(len(test_tokens))

# create a dataframe for linear regression
test_df = pd.DataFrame(test_tokens, columns =['token'])

test_df['complexity'] = test['complexity']

# word length
test_df['word_length'] = test_df['token'].map(lambda x: len(x))
test_df['word_frequency'] = test_df['token'].map(lambda x: bible_counter[x])

283


In [None]:
test_df.head()

Unnamed: 0,token,complexity,word_length,word_frequency
0,hand,0.0,4,5940
1,hand,0.197368,4,5940
2,hand,0.2,4,5940
3,hand,0.267857,4,5940
4,entrance,0.0,8,0


In [None]:
# create the weight matrix
weight_matrix = create_weights_matrix(test_tokens)
print(weight_matrix.shape)

# combine
weight_matrix_df = pd.DataFrame(weight_matrix)
test_df_combined = pd.concat([test_df, weight_matrix_df], axis=1)
test_df_combined.head()

(283, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,hand,0.0,4,5940,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
1,hand,0.197368,4,5940,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
2,hand,0.2,4,5940,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
3,hand,0.267857,4,5940,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
4,entrance,0.0,8,0,0.25776,0.1068,-0.16265,0.42335,0.19078,0.46283,-0.95915,0.93174,0.47161,0.39077,0.54734,0.41967,0.086822,0.53954,0.35497,-0.028346,0.42708,0.036569,-0.497,-0.49543,-0.031232,-0.30298,-0.41718,-0.78459,0.70473,-0.59741,-0.33173,-0.38813,0.17189,-0.78565,-0.17219,-0.14019,0.61492,0.5713,0.75109,-0.015942,...,-0.60393,0.47454,0.80912,0.81709,-0.12876,-0.3931,0.17656,-0.29797,-0.32614,-0.26522,-0.37006,-0.016956,0.92268,-0.71606,-0.38524,-0.085737,0.68111,0.3208,0.4587,-0.82737,0.22932,0.3145,-0.21221,-0.65293,-0.31427,-0.037493,0.16126,-0.46719,0.63066,0.26426,0.52778,-0.34505,0.0662,0.7224,-0.11057,-0.005771,-0.059336,0.013272,0.97305,0.45405


In [None]:
# get data for test
X_test = test_df_combined.drop(columns=['token', 'complexity'])
Y_test = test_df_combined['complexity']

# predict
Y_pred = lr.predict(X_test)

# test loss (average absolute loss)
num = len(Y_pred)
losses = []
for i in range(num):
  loss = abs(Y_pred[i] - Y_test[i])
  losses.append(loss)
abl = sum(losses) / num
print("average test absolute loss is " + str(abl))

average test absolute loss is 0.07546507119733206


## 2.2 Linear regression with ELMo word embedding

In [271]:
len(elmo_embedded_word)

2400

In [272]:
elmo_bible_df = bible_df.iloc[:len(elmo_embedded_word)]

In [273]:
len(elmo_bible_df)

2400

In [274]:
elmo_embedded_df = pd.DataFrame(elmo_embedded_word)

In [275]:
elmo_embedded_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.380389,-0.176156,-0.314543,-0.501822,-0.090818,0.105634,-0.251023,-0.066272,0.243032,-0.761172,-0.291393,0.064379,-0.453321,-0.423276,0.816208,-0.353879,0.408419,0.122126,0.496743,0.353603,-0.351386,-0.922783,-0.209045,-0.359489,0.711408,-0.171895,-0.333597,0.793782,-0.901847,0.509671,0.247244,0.280223,0.268059,-0.539888,-0.103087,0.29116,0.992129,0.264218,-0.22455,0.470073,...,0.257131,-1.014234,0.116351,0.884804,0.771221,-0.382885,-0.389934,-0.064094,0.188469,-0.25247,-0.016003,-0.283068,-0.100671,0.20486,0.546275,0.513434,0.09439,0.732664,-0.115683,0.189742,0.107118,0.150636,-0.038369,-0.243636,-0.724234,-0.076423,0.054118,0.113401,0.372316,0.30305,-0.009343,0.357153,0.287225,0.254209,-0.150872,-0.407083,-0.230057,0.821129,0.205928,0.648316
1,0.080406,-0.40823,0.158734,-0.079608,0.255208,-0.493177,0.061747,0.495774,0.45792,0.361132,-0.256218,0.261946,0.350431,0.18085,0.36866,0.22267,0.221042,0.00736,0.40547,-0.55929,-0.000919,-0.280063,-0.229275,0.055031,0.096481,-0.122666,0.458498,0.230201,-0.23404,0.323799,-0.577974,-0.23127,0.405233,-0.492053,-0.262013,-0.562029,-0.754784,0.114774,-0.30307,0.004179,...,0.9218,0.091345,0.728693,1.06949,-0.089532,-0.126689,-0.05313,0.416477,0.294667,0.006618,0.328412,-0.557811,0.253493,-0.354477,0.3329,0.875495,-1.043192,1.28843,-0.056559,0.187961,0.337483,-0.090544,0.276176,-0.393747,-0.89701,0.273594,0.059733,0.252738,0.207929,-0.025753,0.021567,1.168533,0.562797,-0.284247,-0.291426,0.656828,0.652902,-0.680609,0.406645,-0.072077
2,0.122995,-0.255338,-0.08461,-0.139912,0.065407,0.552847,-0.104534,0.109842,0.90361,0.451307,-0.27547,0.095294,0.099199,-0.228622,0.185442,0.079517,0.633629,-0.238946,0.044196,-0.7192,0.28101,-0.201918,-0.225704,-0.157153,-0.377018,-0.045188,0.433793,0.688715,-0.552029,0.3779,-0.080042,-0.302301,-0.597616,-0.443838,-0.02264,-0.568669,-0.807511,0.083843,0.715118,0.486972,...,0.996324,-0.098625,0.389538,1.594374,-0.013069,-0.45342,-0.605789,-0.173979,0.04237,-0.014586,0.302771,-0.32325,0.146018,-0.50915,0.194316,0.767718,-0.213173,0.564365,-0.042883,0.149758,0.152589,-0.061916,0.154927,-0.081459,-0.231791,-0.260478,0.171444,0.504945,0.628526,-0.431658,0.408605,1.460399,0.662861,-0.136868,-0.080981,0.412476,0.408365,-0.432835,0.264197,-0.065345
3,-0.233317,0.075169,0.101757,-0.255629,0.177177,0.213724,-0.04996,0.879802,0.228996,0.333588,-0.258793,0.213695,0.08974,-0.462861,0.45921,0.342702,0.284974,-0.314406,0.142398,-0.881451,0.01107,-0.360886,-0.327134,-0.118879,0.195584,-0.184107,0.189582,-0.031402,-0.119926,0.90529,-0.042242,-0.078217,0.774182,-0.607643,0.057612,-0.598457,-1.372679,0.186787,0.108967,0.402878,...,0.939276,0.299659,0.282591,1.413206,-0.172405,0.39143,-0.400962,-0.049452,-0.075253,-0.030729,0.53845,-0.235912,0.223105,-0.138607,-0.543708,0.693104,-0.676663,0.191056,0.044299,0.051567,0.243266,0.239512,0.250766,-0.325057,-0.579753,-0.275222,0.146736,0.051135,0.469655,-0.196534,-0.151259,1.477482,0.264394,0.07645,-0.087113,0.556019,0.54103,-0.189591,0.507491,0.203926
4,0.045292,-0.214803,0.358676,-0.601393,-0.04868,0.365268,-0.550482,0.514303,0.559915,0.367665,-0.120503,0.459812,0.799864,0.429512,0.348145,0.705791,-0.132481,-0.198132,0.894733,-0.93105,0.008356,-0.012287,0.185829,-0.012477,-0.029897,-0.106955,0.921338,0.438324,-0.385043,0.210029,-0.644655,-0.322156,0.171714,-0.011876,-0.427351,-0.764196,-0.182957,0.413742,-0.472179,0.365791,...,0.603303,-0.00274,-0.036127,1.148385,-0.468835,-0.008,-0.349761,0.062258,0.036226,0.254457,0.521911,-0.143204,0.411426,-0.176269,0.035457,0.314054,-0.594687,0.450245,0.020033,-0.200571,0.566393,0.079581,0.120403,-0.056354,-0.183325,-0.331903,-0.132743,0.091287,0.731343,-0.135945,-0.014588,1.263425,0.616776,-0.136729,-0.131905,0.233556,0.458229,-0.104712,0.049443,0.220563


In [276]:
elmo_bible_df_combined = pd.concat([elmo_bible_df, elmo_embedded_df], axis=1).dropna()
elmo_bible_df_combined.head(2)

Unnamed: 0,id,corpus,sentence,token,complexity,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass.",river,0.0,0.380389,-0.176156,-0.314543,-0.501822,-0.090818,0.105634,-0.251023,-0.066272,0.243032,-0.761172,-0.291393,0.064379,-0.453321,-0.423276,0.816208,-0.353879,0.408419,0.122126,0.496743,0.353603,-0.351386,-0.922783,-0.209045,-0.359489,0.711408,-0.171895,-0.333597,0.793782,-0.901847,0.509671,0.247244,0.280223,0.268059,-0.539888,-0.103087,...,0.257131,-1.014234,0.116351,0.884804,0.771221,-0.382885,-0.389934,-0.064094,0.188469,-0.25247,-0.016003,-0.283068,-0.100671,0.20486,0.546275,0.513434,0.09439,0.732664,-0.115683,0.189742,0.107118,0.150636,-0.038369,-0.243636,-0.724234,-0.076423,0.054118,0.113401,0.372316,0.30305,-0.009343,0.357153,0.287225,0.254209,-0.150872,-0.407083,-0.230057,0.821129,0.205928,0.648316
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,"I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book.",brothers,0.0,0.080406,-0.40823,0.158734,-0.079608,0.255208,-0.493177,0.061747,0.495774,0.45792,0.361132,-0.256218,0.261946,0.350431,0.18085,0.36866,0.22267,0.221042,0.00736,0.40547,-0.55929,-0.000919,-0.280063,-0.229275,0.055031,0.096481,-0.122666,0.458498,0.230201,-0.23404,0.323799,-0.577974,-0.23127,0.405233,-0.492053,-0.262013,...,0.9218,0.091345,0.728693,1.06949,-0.089532,-0.126689,-0.05313,0.416477,0.294667,0.006618,0.328412,-0.557811,0.253493,-0.354477,0.3329,0.875495,-1.043192,1.28843,-0.056559,0.187961,0.337483,-0.090544,0.276176,-0.393747,-0.89701,0.273594,0.059733,0.252738,0.207929,-0.025753,0.021567,1.168533,0.562797,-0.284247,-0.291426,0.656828,0.652902,-0.680609,0.406645,-0.072077


In [277]:
X_elmo_train = elmo_bible_df_combined.drop(columns=['token', 'complexity', 'id', 'corpus', 'sentence'])
Y_elmo_train = elmo_bible_df_combined['complexity']

In [278]:
#train
lr = LinearRegression().fit(X_elmo_train, Y_elmo_train)

In [279]:
# predict
Y_elmo_pred = lr.predict(X_elmo_train)

In [280]:
# train loss (average absolute loss)
num = len(Y_elmo_pred)
losses = []
for i in range(num):
  loss = abs(Y_elmo_pred[i] - Y_elmo_train[i])
  losses.append(loss)
abl = sum(losses) / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.06452791414804292
