by Thachathum Amornkasemwong

**github:** toucham

# 1. Setup

## 1.1 Setup working environment

In [None]:
#to use elmo
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
import tensorflow as tf
import torch
print(tf.__version__)

1.15.2


In [None]:
# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Fail fast unless TensorFlow reports the expected first-GPU device string.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Pick the PyTorch compute device: CUDA when a GPU is visible, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if use_cuda:
    # Report how many GPUs PyTorch sees and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 16.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 54.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 47.4MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


## 1.2 Clean raw data

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Root of the project folder on the mounted Google Drive; edit this one line to relocate the data.
PROJECT_DIR = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject"

# Raw single-word LCP files.
TRAIN_RAW = PROJECT_DIR + "/train/lcp_single_train.tsv"
TEST_RAW = PROJECT_DIR + "/test-labels/lcp_single_test.tsv"
# Destination paths for the cleaned copies of the raw files.
TRAIN = PROJECT_DIR + "/train/lcp_single_train_cleaned.tsv"
TEST = PROJECT_DIR + "/test-labels/lcp_single_test_cleaned.tsv"

In [None]:
def _strip_double_quotes(src_path, dst_path):
  """Copy the text file at `src_path` to `dst_path` with every double-quote character removed.

  Both files are opened as UTF-8 explicitly so the result does not depend on
  the platform's default encoding (the sentences contain non-ASCII characters).
  """
  with open(src_path, 'r', encoding='utf-8') as f:
    data = f.read()
  with open(dst_path, 'w', encoding='utf-8') as f:
    f.write(data.replace('"', ''))


# read the datasets
# Double quotes are removed before parsing: some quotes in the raw files are
# never closed, which causes problems when the TSV is parsed.
# train
_strip_double_quotes(TRAIN_RAW, TRAIN)
df = pd.read_csv(TRAIN, sep='\t')

# test
_strip_double_quotes(TEST_RAW, TEST)
test = pd.read_csv(TEST, sep='\t')

In [None]:
# take a look
pd.set_option('display.max_colwidth', None) # show the whole sentence
df.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass.",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,"I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book.",brothers,0.0
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,"The man, the lord of the land, said to us, 'By this I will know that you are honest men: leave one of your brothers with me, and take grain for the famine of your houses, and go your way.",brothers,0.05
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,"Shimei had sixteen sons and six daughters; but his brothers didn't have many children, neither did all their family multiply like the children of Judah.",brothers,0.15
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,He has put my brothers far from me.,brothers,0.263889


In [None]:
test.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3K8CQCU3KE19US5SN890DFPK3SANWR,bible,"But he, beckoning to them with his hand to be silent, declared to them how the Lord had brought him out of the prison.",hand,0.0
1,3Q2T3FD0ON86LCI41NJYV3PN0BW3MV,bible,"If I forget you, Jerusalem, let my right hand forget its skill.",hand,0.197368
2,3ULIZ0H1VA5C32JJMKOTQ8Z4GUS51B,bible,"the ten sons of Haman the son of Hammedatha, the Jew's enemy, but they didn't lay their hand on the plunder.",hand,0.2
3,3BFF0DJK8XCEIOT30ZLBPPSRMZQTSD,bible,"Let your hand be lifted up above your adversaries, and let all of your enemies be cut off.",hand,0.267857
4,3QREJ3J433XSBS8QMHAICCR0BQ1LKR,bible,"Abimelech chased him, and he fled before him, and many fell wounded, even to the entrance of the gate.",entrance,0.0


## 1.3 Preprocess cleaned data

In [None]:
import torchtext.vocab

In [None]:
# use the Glove 6B 100d
cache_dir = "/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/data"
# glove = vocab.pretrained_aliases["glove.6B.100d"](cache=cache_dir)
glove = torchtext.vocab.GloVe(name='6B', dim=100, cache=cache_dir)

In [None]:
# Target words to predict (duplicates kept), lower-cased; rows with a missing token are dropped.
tokens = [tok.lower() for tok in df['token'].dropna().to_list()]
print(len(tokens))

# Report every target word that has no entry in the GloVe vocabulary.
for token in tokens:
  if token in glove.stoi:
    continue
  print("Token Not Found:", token)

7659
Token Not Found: perverseness
Token Not Found: perverseness
Token Not Found: perverseness
Token Not Found: housetops
Token Not Found: slanderers
Token Not Found: plowmen
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dainties
Token Not Found: dunghill
Token Not Found: carotids
Token Not Found: tace


### 1.3.1 Create word frequency & word length features for each token

In [None]:
import string
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from collections import Counter


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# create a dataframe for handcrafted features: word length & word frequency
train_df = pd.DataFrame(tokens, columns =['token'])

# add back complexity
# `tokens` was built from df['token'].dropna(), so train_df has a fresh 0..n-1
# index while df keeps its original row labels. Assigning df['complexity']
# directly would align on those labels and pair tokens with the wrong scores
# after the first dropped row. Take the scores of exactly the rows that still
# have a token, by position (a length mismatch raises instead of misaligning).
train_df['complexity'] = df.loc[df['token'].notna(), 'complexity'].to_list()

# word length
train_df['word_length'] = train_df['token'].map(len)

# punctuations
punc = string.punctuation

# stop words
stop_words = set(stopwords.words('english'))

# word frequency
# tokenize the whole corpus: lower-case, delete punctuation characters, split
# on single spaces and keep only alphabetic non-stop-words
texts = []
for sent in df['sentence'].to_list():
  sent = sent.lower()
  sent = ''.join([c for c in sent if c not in punc])
  texts += [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
# count frequency (a Counter returns 0 for words it has never seen)
count = Counter(texts)
train_df['word_frequency'] = train_df['token'].map(lambda x: count[x])

train_df.head()

Unnamed: 0,token,complexity,word_length,word_frequency
0,river,0.0,5,26
1,brothers,0.0,8,36
2,brothers,0.05,8,36
3,brothers,0.15,8,36
4,brothers,0.263889,8,36


The `texts` variable combines the tokens of every sentence, from every corpus, in the training data, and it is what we used to compute the `word_frequency` feature. **However**, the sentence shown in each row is only a small part of the corpus it was taken from; therefore, this `word_frequency` is not a reliable feature.

In [None]:
len(texts)

107823

### 1.3.2 Create `word_frequency` from real text

#### [Bible WEB](https://github.com/scrollmapper/bible_databases/tree/master/txt/WEB)

In [None]:
biblePath = r"/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/bibleWEB"

In [None]:
import os
from os.path import isfile, join
from os import listdir
bibleList = sorted([f for f in listdir(biblePath) if isfile(join(biblePath, f))])

In [None]:
bibleList[0]

'1 Genesis - World English Bible (WEB).txt'

In [None]:
# Read the first book as a sample; the context manager closes the file handle after reading.
with open(os.path.join(biblePath, bibleList[0]), 'r') as f:
  text = f.read().lower()

In [None]:
def bible_tokenizer(chapter: str):
  """Split raw text into lower-cased alphabetic words without punctuation or English stop words.

  Args:
    chapter: text to tokenize; it is lower-cased and processed line by line.

  Returns:
    A flat list of the kept words, in the order they appear.
  """
  drop_punc = str.maketrans('', '', string.punctuation)
  stop_words = set(stopwords.words('english'))
  sents = []
  for s in chapter.lower().split('\n'):
    # Punctuation characters are deleted, not replaced by a space.
    sent = s.translate(drop_punc)
    # Split on single spaces only; isalpha() also discards the empty strings
    # left by consecutive spaces and any token containing a non-letter.
    sents += [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
  return sents

#### Bible word counter

In [None]:
# Tokenize every Bible file and count how often each kept word occurs.
bible_tokens = []
for chapter in bibleList:
  # The context manager closes each file as soon as it has been read.
  with open(os.path.join(biblePath, chapter), 'r') as text:
    tokenized_chapter = bible_tokenizer(text.read())
  bible_tokens += tokenized_chapter

bible_counter = Counter(bible_tokens)

#### [CRAFT Corpus](https://github.com/UCDenver-ccp/CRAFT/tree/master/articles/txt)

In [None]:
craftPath = r"/content/gdrive/My Drive/Colab Notebooks/CS505FinalProject/CRAFTtxt"

In [None]:
import os
from os.path import isfile, join
from os import listdir
craftList = [f for f in listdir(craftPath) if isfile(join(craftPath, f))]

In [None]:
craftList[0]

'15676071.txt'

In [None]:
# Read the first article as a sample; the context manager closes the file handle after reading.
with open(os.path.join(craftPath, craftList[0]), 'r') as f:
  text = f.read().lower()

In [None]:
def craft_tokenizer(text: str):
  """Split raw text into lower-cased alphabetic words without punctuation or English stop words.

  Args:
    text: text to tokenize; it is lower-cased and processed line by line.

  Returns:
    A flat list of the kept words, in the order they appear.
  """
  drop_punc = str.maketrans('', '', string.punctuation)
  stop_words = set(stopwords.words('english'))
  sents = []
  for s in text.lower().split('\n'):
    # Punctuation characters are deleted, not replaced by a space.
    sent = s.translate(drop_punc)
    # Split on single spaces only; isalpha() also discards the empty strings
    # left by consecutive spaces and any token containing a non-letter.
    sents += [word for word in sent.split(' ') if (word.isalpha() and word not in stop_words)]
  return sents

#### Craft word counter

In [None]:
# Tokenize every CRAFT file and count how often each kept word occurs.
craft_tokens = []
for chapter in craftList:
  # The context manager closes each file as soon as it has been read.
  with open(os.path.join(craftPath, chapter), 'r') as text:
    tokenized_chapter = craft_tokenizer(text.read())
  craft_tokens += tokenized_chapter

craft_counter = Counter(craft_tokens)

### 1.3.3 ELMo features

In [None]:
import tensorflow_hub as hub
import tensorflow as tf

In [None]:
stop_words = set(stopwords.words('english')) 

In [None]:
tf.__version__

'1.15.2'

In [None]:
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [None]:
test_sent = df.head(1).values[0, 2]
test_word = df.head(1).values[0, 3]
print('Sentence:', test_sent, '\n', 'Word:', test_word)

Sentence: Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass. 
 Word: river


In [None]:
test_sent2 = df.head(2).values[1, 2]
test_word2 = df.head(2).values[1, 3]
print('Sentence:', test_sent2, '\n', 'Word:', test_word2)
df.head(2)

Sentence: I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book. 
 Word: brothers


Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven c...",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,I am a fellow bondservant with you and with yo...,brothers,0.0


In [None]:
def tokenizer(sent):
  """Lower-case a sentence, delete its punctuation characters and return the kept words.

  A word is kept when it is purely alphabetic and is not in the notebook-level
  `stop_words` set. Splitting is done on single spaces only.
  """
  kept_chars = [ch for ch in sent.lower() if ch not in string.punctuation]
  candidates = ''.join(kept_chars).split(' ')
  return [w for w in candidates if w.isalpha() and w not in stop_words]

In [None]:
def find_index(s, word):
  """Locate `word` (case-insensitively lower-cased) in the token list `s`.

  An exact token match wins; otherwise the first token that contains the word
  as a substring is used.

  Returns:
    The position of the match, or -1 when no token matches.
  """
  target = word.lower()
  try:
    return s.index(target)
  except ValueError:
    pass
  # Fall back to substring matching, e.g. the word glued to a neighbour.
  return next((pos for pos, tok in enumerate(s) if target in tok), -1)

In [None]:
def compute_elmo_tokens(tokens, sequence_len):
  """Evaluate the notebook-level `elmo` module on one batch of token lists.

  Args:
    tokens: value passed as the module's "tokens" input.
    sequence_len: value passed as the module's "sequence_len" input.

  Returns:
    The evaluated 'elmo' output for the batch. No pooling or averaging is
    applied here.
  """
  elmo_inputs = {"tokens": tokens, "sequence_len": sequence_len}
  elmo_outputs = elmo(elmo_inputs, signature='tokens', as_dict=True)
  embedding = elmo_outputs['elmo']
  print('Run session...')
  with tf.Session() as sess:
      # A fresh session needs its variables and lookup tables initialised first.
      sess.run(tf.global_variables_initializer())
      sess.run(tf.tables_initializer())
      return sess.run(embedding)

In [None]:
def clean_for_elmo(sents, words):
  """Tokenize a batch of sentences and build the padded inputs for ELMo.

  Args:
    sents: list of raw sentences.
    words: list of target words; words[i] belongs to sents[i].

  Returns:
    A tuple (tokens, sequence_len, index_tokens):
      tokens: token lists right-padded with "" to the longest one in the batch.
      sequence_len: unpadded length of each token list.
      index_tokens: position of each target word in its token list, or -1
        when `find_index` cannot locate it (the example is then printed).
  """
  raw_tokens = []
  index_tokens = []
  for i, sentence in enumerate(sents):
    tokenized = tokenizer(sentence)  # list of str
    raw_tokens.append(tokenized)

    index = find_index(tokenized, words[i])
    if index == -1:
      # Dump the failing example so it can be inspected.
      for item in ('-------------', i, sentence, tokenized, words[i], 'COULD NOT FIND WORD!!'):
        print(item)
    index_tokens.append(index)

  sequence_len = [len(t) for t in raw_tokens]
  max_len = max(sequence_len)
  # Pad every token list with empty strings up to the batch maximum.
  tokens = [t + [""] * (max_len - len(t)) for t in raw_tokens]
  return tokens, sequence_len, index_tokens

#### Compute bible only

In [None]:
import numpy as np

In [None]:
bible_df = df[df['corpus'] == 'bible']

In [None]:
bible_df.shape

(2574, 5)

In [None]:
def compute_elmo(input_df, batch_size=800):
  """Compute ELMo outputs for every row of `input_df`, one batch at a time.

  Rows are processed in consecutive slices of at most `batch_size` rows. This
  also covers frames with `batch_size` rows or fewer (a single batch) and
  empty frames (no batch), which previously ended in an empty final batch.

  Args:
    input_df: DataFrame with a 'sentence' and a 'token' column.
    batch_size: maximum number of rows embedded per session run.

  Returns:
    A tuple (elmo_vectors, index_all_token) of two lists with one entry per
    batch: the result of `compute_elmo_tokens` and the target-word positions
    returned by `clean_for_elmo`.

  Raises:
    ValueError: if `batch_size` is not positive.
  """
  if batch_size <= 0:
    raise ValueError('batch_size must be a positive integer')

  elmo_vectors = []
  index_all_token = []
  n_rows = input_df.shape[0]
  for start in range(0, n_rows, batch_size):
    stop = min(start + batch_size, n_rows)
    batch = input_df.iloc[start:stop]
    tokens_elmo, seq_len_elmo, index_token = clean_for_elmo(
        batch['sentence'].tolist(), batch['token'].tolist())
    elmo_vectors.append(compute_elmo_tokens(tokens_elmo, seq_len_elmo))
    index_all_token.append(index_token)
    # Progress: number of rows embedded so far.
    print(stop)

  return elmo_vectors, index_all_token

In [None]:
elmo_vectors, index_token = compute_elmo(bible_df)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
800
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1600
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
2400
-------------
158
Lebaoth, Shilhim, Ain, and Rimmon.
['lebaoth', 'shilhim', 'rimmon']
Ain
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...


#### Compute biomed only

In [None]:
biomed_df = df[df['corpus'] == 'biomed'].dropna()

In [None]:
#not enough memory
biomed_elmo_vectors, biomed_index_token = compute_elmo(biomed_df, batch_size=500)

-------------
398
Because there was clearly no MCAD antigen detected in MCAD−/− mice, the residual dehydrogenase activity measured with these two substrates must represent the activity of other chain length–specific acyl-CoA dehydrogenases.
['clearly', 'mcad', 'antigen', 'detected', 'mice', 'residual', 'dehydrogenase', 'activity', 'measured', 'two', 'substrates', 'must', 'represent', 'activity', 'chain', 'acylcoa', 'dehydrogenases']
length
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
500
-------------
218
Does activity-dependent neuronal competition [37] contribute to increased representation of the olfactory receptors that respond to common environmental odorants?
['activitydependent', 'neuronal', 'competition', 'contribute', 'increased', 'representation', 'olfactory', 'receptors', 'respond', 'common', 'environmental', 'odorants']
Does
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1000
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1500
-------------
134
We [5,8], and others [9–17], have recently extended the power of this approach by incorporating genome-wide gene expression array analysis, which allows us to model the “genetics of gene expression” using similar methods.
['others', 'recently', 'extended', 'power', 'approach', 'incorporating', 'genomewide', 'gene', 'expression', 'array', 'analysis', 'allows', 'us', 'model', 'gene', 'using', 'similar', 'methods']
genetics
COULD NOT FIND WORD!!
-------------
314
Our data reveal that a PPARδ-mediated transcriptional pathway can regulate muscle fiber specification, enabling the generation of a strain of mice with a “long-distance running” phenotype.
['data', 'reveal', 'pparδmediated', 'transcriptional', 'pathway', 'regulate', 'muscle', 'fiber', 'specification', 'enabling', 'generation', 'strain', 'mice', 'phenotype']
running
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
2000
-------------
99
Annexin A7, the first annexin to be described, was isolated as the agent that mediated aggregation of chromaffin granules and fusion of membranes and phospholipids in the presence of Ca2+-ions [3].
['annexin', 'first', 'annexin', 'described', 'isolated', 'agent', 'mediated', 'aggregation', 'chromaffin', 'granules', 'fusion', 'membranes', 'phospholipids', 'presence']
ions
COULD NOT FIND WORD!!
-------------
238
MA performed FPLC analysis.
['performed', 'fplc', 'analysis']
MA
COULD NOT FIND WORD!!
-------------
439
Bmp2 and Bmp4 are also expressed in the AER, where Prx1::cre is inactive, and these domains of expression are not affected (Figure 1A–1D, black arrows).
['also', 'expressed', 'aer', 'inactive', 'domains', 'expression', 'affected', 'figure', 'black', 'arrows']
cre
COULD NOT FIND WORD!!
-------------
441
These expression domains are completely lost by E10.5 in the presence of the Prx1::cre transgene (Figure 1D).
['expression', 'domains', 'com

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
2500
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...


#### Compute europarl only

In [None]:
europarl_df = df[df['corpus'] == 'europarl'].dropna()
europarl_df.shape

(2512, 5)

In [None]:
#not enough memory
europarl_elmo_vectors, europarl_index_token = compute_elmo(europarl_df, batch_size=250)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
250
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
500
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
750
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1000
-------------
65
There is an important Election Observation Mission there with your colleague, Mrs De Keyser; secondly, to underline the importance of European Union financial support to the Palestinian people, and, thirdly, to thus raise the visibility of a number of EU projects, notably in Gaza.
['important', 'election', 'observation', 'mission', 'colleague', 'secondly', 'underline', 'importance', 'european', 'union', 'financial', 'support', 'palestinian', 'people', 'thirdly', 'thus', 'raise', 'visibility', 'number', 'eu', 'projects', 'notably', 'gaza']
Mrs
COULD NOT FIND WORD!!
-------------
66
– the oral question to the Commission by Mrs Berès, on behalf of the Committee on Economic and Monetary Affairs, on the expiry of Directive 1999/85/EC as regards the possibility of applying on an experimental basis a reduced VAT rate on labour-intensive services (O-0106/2005 – B6-0342/2005).
['oral', 'question', 'commission', 'behalf', 'committee', 'economic', 'monetary', 

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1250
-------------
91
I am frustrated because fewer and fewer people now believe the solution lies in the Road Map sponsored by the Quartet.
['frustrated', 'fewer', 'fewer', 'people', 'believe', 'solution', 'lies', 'sponsored', 'quartet']
Map
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1500
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
1750
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
2000
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
2250
-------------
43
Again, let me – in this case in her absence – congratulate Mrs Ries on this appointment and note, pursuant to Rule 8(4), that her mandate as a Member of the European Parliament has terminated and that a vacancy exists as of 12 February 2004.
['let', 'case', 'absence', 'congratulate', 'appointment', 'note', 'pursuant', 'mandate', 'member', 'european', 'parliament', 'terminated', 'vacancy', 'exists', 'february']
Ries
COULD NOT FIND WORD!!
-------------
229
A4-0373/98 by Mrs Waddington, on behalf of the Committee on Employment and Social Affairs, on the proposal for a Council Decision establishing the second phase of the Community vocational training action programme 'Leonardo da Vinci' (COM(98)0330 - C4-0522/98-98/0196(SYN)); -A4-0371/98 by Mrs Pack, on behalf of the Committee on Culture, Youth, Education and the Media, on the proposal for a European Parliament and Council Decision establishing the second phase of the Community action programme in the

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
2500
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...


#### ELMO word embedding

In [None]:
def word_embedding_vectors(elmo_vectors, index_token):
  """Pick the embedding of each sentence's target word out of the batched ELMo outputs.

  Args:
    elmo_vectors: list of per-batch arrays, indexed as
      [sentence, token position] with the embedding on the last axis.
    index_token: list of per-batch lists; index_token[j][i] is the position of
      the target word in sentence i of batch j, or -1 when it was not found.

  Returns:
    A flat list with one vector per sentence, in batch order. Sentences whose
    target word was not found get a random normal vector (std 0.6) with the
    same width as the batch's embeddings instead of a hard-coded 1024.
  """
  elmo_embedded_word = []
  for j, batch_vectors in enumerate(elmo_vectors):
    batch_indices = index_token[j]
    emb_dim = batch_vectors.shape[-1]
    for i in range(len(batch_vectors)):
      position = batch_indices[i]
      if position == -1:
        # init randomly normal distributed vector for index == -1
        elmo_embedded_word.append(np.random.normal(scale=0.6, size=(emb_dim, )))
      else:
        elmo_embedded_word.append(batch_vectors[i, position])
  return elmo_embedded_word

In [None]:
elmo_embedded_word = word_embedding_vectors(elmo_vectors, index_token)

In [None]:
elmo_embedded_word_biomed = word_embedding_vectors(biomed_elmo_vectors, biomed_index_token)

In [None]:
elmo_embedded_word_europarl = word_embedding_vectors(europarl_elmo_vectors, europarl_index_token)

# 2. Train models

## 2.1 Linear regression with new `word_frequency`

### Bible text

In [None]:
bible_df = df[df['corpus'] == 'bible']
print(bible_df.shape)

(2574, 5)


In [None]:
bible_df.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven cattle, sleek and fat, and they fed in the marsh grass.",river,0.0
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,"I am a fellow bondservant with you and with your brothers, the prophets, and with those who keep the words of this book.",brothers,0.0
2,3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,"The man, the lord of the land, said to us, 'By this I will know that you are honest men: leave one of your brothers with me, and take grain for the famine of your houses, and go your way.",brothers,0.05
3,3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,"Shimei had sixteen sons and six daughters; but his brothers didn't have many children, neither did all their family multiply like the children of Judah.",brothers,0.15
4,3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,He has put my brothers far from me.,brothers,0.263889


In [None]:
# get all the non-unique tokens for prediction
# Every column below is taken from the same filtered rows, so tokens,
# complexity and corpus stay aligned even if some bible rows have no token.
bible_rows = bible_df.dropna(subset=['token'])
tokens = [token.lower() for token in bible_rows['token']] # lowercase

# create a dataframe for handcrafted features: word length & word frequency
bible_train_df = pd.DataFrame(tokens, columns =['token'])

# add back complexity
bible_train_df['complexity'] = bible_rows['complexity'].to_list()
bible_train_df['corpus'] = bible_rows['corpus'].to_list()

# word length & word frequency (bible_counter returns 0 for unseen words)
bible_train_df['word_length'] = bible_train_df['token'].map(len)
bible_train_df['word_frequency'] = bible_train_df['token'].map(lambda x: bible_counter[x])

In [None]:
bible_train_df.head()

Unnamed: 0,token,complexity,corpus,word_length,word_frequency
0,river,0.0,bible,5,154
1,brothers,0.0,bible,8,622
2,brothers,0.05,bible,8,622
3,brothers,0.15,bible,8,622
4,brothers,0.263889,bible,8,622


#### create weight matrix

In [None]:
import numpy as np

In [None]:
def create_weights_matrix(vocab, dimension=100, vectors=None):
  """Build a (len(vocab), dimension) matrix holding one embedding row per word.

  Args:
    vocab: sequence of words; row i of the result belongs to vocab[i].
    dimension: width of the embedding vectors.
    vectors: word -> vector lookup. Defaults to the notebook-level `glove`.
      If the object has a `stoi` attribute, that is used to test whether a
      word is known; otherwise the object itself is used for the test.

  Returns:
    A numpy array in which known words get their vector and unknown words get
    a random normal vector (std 0.6).
  """
  if vectors is None:
    vectors = glove
  known_words = getattr(vectors, 'stoi', vectors)

  matrix_len = len(vocab)
  weights_matrix = np.zeros((matrix_len, dimension))

  for i, word in enumerate(vocab):
      # NOTE(review): membership is tested explicitly because torchtext's
      # Vectors lookup appears to return its unk_init vector for unknown words
      # rather than raising KeyError, so a KeyError fallback would never run.
      if word in known_words:
          weights_matrix[i] = vectors[word]
      else:
          # initialize a random vector (the undefined name `emb_dim` used here before raised NameError)
          weights_matrix[i] = np.random.normal(scale=0.6, size=(dimension, ))
  return weights_matrix

In [None]:
# create the weight matrix
weight_matrix = create_weights_matrix(tokens)
print(weight_matrix.shape)

# combine
weight_matrix_df = pd.DataFrame(weight_matrix)

train_df_combined = pd.concat([bible_train_df, weight_matrix_df], axis=1)
train_df_combined.head()

(2574, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,river,0.0,5,154,-0.33249,-0.56631,0.54255,-0.11869,0.53129,-0.49381,0.64114,0.85982,0.39633,-1.5395,-0.30613,0.97267,-0.31192,-0.10311,0.35951,-0.60023,0.90983,-0.95954,-0.55375,0.082818,0.26711,0.64645,-0.098556,0.53924,-0.2181,-0.1343,-1.807,-0.14879,0.39006,-0.62883,-0.38825,0.31925,0.77853,-0.60273,0.063585,-0.75916,...,-0.53185,0.72585,0.36811,0.19494,0.64276,0.8146,0.26748,-0.39275,0.42595,0.11699,0.21063,-0.061747,0.79298,-0.45978,0.85176,-0.36726,0.11816,0.50416,-0.065352,0.69672,0.37525,0.92586,-0.83036,-0.087948,-0.49715,0.21411,-0.82838,-0.85912,0.61576,1.188,-0.30745,-1.2009,-1.7097,0.514,-1.0159,0.55555,-1.0385,-0.6994,1.0506,0.24051
1,brothers,0.0,8,622,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
2,brothers,0.05,8,622,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
3,brothers,0.15,8,622,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
4,brothers,0.263889,8,622,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,-0.19383,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982


#### train model

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

In [None]:
# Split the combined training frame into the regression target and the
# feature matrix (every column except the raw token text and the target).
Y_train = train_df_combined['complexity']
X_train = train_df_combined.drop(['token', 'complexity'], axis=1)

In [None]:
# Fit a linear regression on the training features; fit() trains the
# estimator in place, so `lr` is the fitted model afterwards.
lr = LinearRegression()
lr.fit(X_train, Y_train)

In [None]:
# predict on the training rows themselves (in-sample predictions)
Y_pred = lr.predict(X_train)

In [None]:
# train loss (mean absolute error)
# Compare prediction i with target i by position. Y_train is a pandas Series,
# so Y_train[i] would be a lookup by index label, which raises KeyError or
# picks a different row whenever the labels are not exactly 0..n-1.
losses = np.abs(Y_pred - Y_train.to_numpy())
num = len(losses)
abl = losses.sum() / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.06963019167251543


#### test

In [None]:
# on test
# Drop rows with a missing token once, on the filtered frame, so that the
# token list and the complexity list below always come from the same rows.
# (Dropping NaNs from the token column alone would leave the complexity list
# one entry longer for every missing token.)
bible_test_rows = test[test['corpus'] == 'bible'].dropna(subset=['token'])
test_tokens = [token.lower() for token in bible_test_rows['token']]  # lowercase
print(len(test_tokens))

# create a dataframe for linear regression
test_df = pd.DataFrame(test_tokens, columns=['token'])

test_df['complexity'] = bible_test_rows['complexity'].to_list()

# handcrafted features: word length and word frequency
test_df['word_length'] = test_df['token'].map(len)
# bible_counter is indexed with the lowercased token
test_df['word_frequency'] = test_df['token'].map(lambda x: bible_counter[x])

283


In [None]:
# Preview the first rows of the test frame (token, target and handcrafted features).
test_df.head()

Unnamed: 0,token,complexity,word_length,word_frequency
0,hand,0.0,4,1435
1,hand,0.197368,4,1435
2,hand,0.2,4,1435
3,hand,0.267857,4,1435
4,entrance,0.0,8,51


In [None]:
# Build the embedding weight matrix for the test tokens; the printed shape
# shows one row per token.
weight_matrix = create_weights_matrix(test_tokens)
print(weight_matrix.shape)

# Place the embedding columns next to the handcrafted features, row by row.
weight_matrix_df = pd.DataFrame(data=weight_matrix)
test_df_combined = pd.concat([test_df, weight_matrix_df], axis='columns')
test_df_combined.head()

(283, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,hand,0.0,4,1435,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
1,hand,0.197368,4,1435,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
2,hand,0.2,4,1435,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
3,hand,0.267857,4,1435,-0.24945,0.37033,-0.058334,-0.25367,0.18709,0.8176,-0.045494,0.072066,-0.059079,-0.053018,-0.15681,-0.18621,0.78677,0.56263,0.023693,0.24116,0.034775,0.11763,-0.15757,-0.39749,0.21068,-0.14618,0.014017,-0.22373,0.54225,0.47379,-0.62683,-0.38803,0.2751,-0.54687,0.49211,0.052715,-0.12911,0.2554,-0.005657,-0.19431,...,0.64518,0.76559,-0.22193,0.39305,0.13373,-0.17641,0.36222,0.47786,-0.43591,-0.13363,-0.13145,0.20673,0.37353,-0.70188,0.53225,0.10371,-0.7094,0.24331,-0.15523,0.20785,1.1997,-0.036297,-0.79044,-0.27794,-1.4076,-0.36318,0.40219,0.17401,-0.080981,-0.40688,-0.044007,-0.14964,0.39369,-0.014732,-0.41309,-0.061931,-0.088387,-0.23093,0.93931,0.091475
4,entrance,0.0,8,51,0.25776,0.1068,-0.16265,0.42335,0.19078,0.46283,-0.95915,0.93174,0.47161,0.39077,0.54734,0.41967,0.086822,0.53954,0.35497,-0.028346,0.42708,0.036569,-0.497,-0.49543,-0.031232,-0.30298,-0.41718,-0.78459,0.70473,-0.59741,-0.33173,-0.38813,0.17189,-0.78565,-0.17219,-0.14019,0.61492,0.5713,0.75109,-0.015942,...,-0.60393,0.47454,0.80912,0.81709,-0.12876,-0.3931,0.17656,-0.29797,-0.32614,-0.26522,-0.37006,-0.016956,0.92268,-0.71606,-0.38524,-0.085737,0.68111,0.3208,0.4587,-0.82737,0.22932,0.3145,-0.21221,-0.65293,-0.31427,-0.037493,0.16126,-0.46719,0.63066,0.26426,0.52778,-0.34505,0.0662,0.7224,-0.11057,-0.005771,-0.059336,0.013272,0.97305,0.45405


In [None]:
# get data for test: everything except the token text and the target is a feature
X_test = test_df_combined.drop(columns=['token', 'complexity'])
Y_test = test_df_combined['complexity']

# predict with the model fitted on the training split
Y_pred = lr.predict(X_test)

# test loss (mean absolute error)
# Compare by position rather than by Series label (Y_test[i] is a label
# lookup), and let NumPy do the element-wise work instead of a Python loop.
losses = np.abs(Y_pred - Y_test.to_numpy())
num = len(losses)
abl = losses.sum() / num
print("average test absolute loss is " + str(abl))

average test absolute loss is 0.08314168441562919


### Biomedical CRAFT text

In [None]:
# List the distinct corpus labels present in df.
np.unique(df['corpus'])

array(['bible', 'biomed', 'europarl'], dtype=object)

In [None]:
# Keep only the biomed rows and discard any row that has a missing value.
biomed_df = df.loc[df['corpus'].eq('biomed')].dropna()
print(biomed_df.shape)

(2573, 5)


In [None]:
# Preview the first rows of the biomed subset.
biomed_df.head()

Unnamed: 0,id,corpus,sentence,token,complexity
2574,37ZQELHEQ0YDPGBEJ63D4HNT5SBNMJ,biomed,"In fact, this situation gave an opportunity to study the genetic control of arthritis in aged multiparous females, a common situation in human RA.",fact,0.0
2575,3XUSYT70IT170QDU572CAF4MOM1D0B,biomed,It can be inferred from this fact that Nrl is absolutely required for the normal silencing of cone-specific genes in rods.,fact,0.183333
2576,34R3P23QHS1HKWJHKAEN8VSOHJ9WH5,biomed,"The site of mutation is of interest, particularly the fact that in each of the three families the telomeric end of the deletion is anchored between exons 3 and 4 of SUMF1; sequence searches failed to identify any repeat sequences that might explain this phenomenon.",fact,0.3
2577,3L21G7IH47WA5QT3XMTQ15XXB1L1YG,biomed,This model reflects many other observed changes described in limited studies in humans.,studies,0.0
2578,3ZXNP4Z39RL4GD163NL987ME58H7LR,biomed,"Several studies have been carried out to detect gene expression during CIA, all of which used joints as the target tissue [15,16,21,22].",studies,0.125


In [None]:
# Lowercased token for every biomed row (repeats are kept: one entry per row).
biomed_tokens = [token.lower() for token in biomed_df['token'].to_list()]

# Start the frame that will hold the handcrafted features
# (word length and word frequency are added in later cells).
biomed_train_df = pd.DataFrame({'token': biomed_tokens})

In [None]:
# Re-attach the target and the corpus label. Plain lists are used so the
# values are assigned by position, not matched on biomed_df's index labels.
biomed_train_df['complexity'] = biomed_df['complexity'].tolist()
biomed_train_df['corpus'] = biomed_df['corpus'].tolist()

In [None]:
# Handcrafted features: character length of the token, and the value stored
# for that token in craft_counter.
biomed_train_df['word_length'] = biomed_train_df['token'].map(len)
biomed_train_df['word_frequency'] = biomed_train_df['token'].map(lambda token: craft_counter[token])

In [None]:
# Preview the biomed handcrafted-feature frame.
biomed_train_df.head()

Unnamed: 0,token,complexity,corpus,word_length,word_frequency
0,fact,0.0,biomed,4,111
1,fact,0.183333,biomed,4,111
2,fact,0.3,biomed,4,111
3,studies,0.0,biomed,7,499
4,studies,0.125,biomed,7,499


#### create weight matrix

In [None]:
# create the weight matrix
# Embed the biomed tokens themselves, so that row i of the matrix belongs to
# row i of biomed_train_df. Passing a token list from another corpus would
# pair every biomed row with an unrelated word's embedding.
weight_matrix = create_weights_matrix(biomed_tokens)
print(weight_matrix.shape)

# combine
weight_matrix_df = pd.DataFrame(weight_matrix)
# Both frames are joined side by side, so they must have the same row count.
assert len(weight_matrix_df) == len(biomed_train_df), \
    "expected one embedding row per biomed token"

train_df_combined = pd.concat([biomed_train_df, weight_matrix_df], axis=1).dropna()
train_df_combined.head()

(2574, 100)


Unnamed: 0,token,complexity,corpus,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,fact,0.0,biomed,4.0,111.0,-0.33249,-0.56631,0.54255,-0.11869,0.53129,-0.49381,0.64114,0.85982,0.39633,-1.5395,-0.30613,0.97267,-0.31192,-0.10311,0.35951,-0.60023,0.90983,-0.95954,-0.55375,0.082818,0.26711,0.64645,-0.098556,0.53924,-0.2181,-0.1343,-1.807,-0.14879,0.39006,-0.62883,-0.38825,0.31925,0.77853,-0.60273,0.063585,...,-0.53185,0.72585,0.36811,0.19494,0.64276,0.8146,0.26748,-0.39275,0.42595,0.11699,0.21063,-0.061747,0.79298,-0.45978,0.85176,-0.36726,0.11816,0.50416,-0.065352,0.69672,0.37525,0.92586,-0.83036,-0.087948,-0.49715,0.21411,-0.82838,-0.85912,0.61576,1.188,-0.30745,-1.2009,-1.7097,0.514,-1.0159,0.55555,-1.0385,-0.6994,1.0506,0.24051
1,fact,0.183333,biomed,4.0,111.0,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
2,fact,0.3,biomed,4.0,111.0,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
3,studies,0.0,biomed,7.0,499.0,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982
4,studies,0.125,biomed,7.0,499.0,0.85968,-0.39038,-0.62678,-0.55279,0.097012,0.00658,-0.65021,-0.58272,-1.2763,0.11251,0.78504,0.16027,0.38327,0.62672,-0.017462,-0.36443,0.062441,0.039266,-0.47318,0.54768,0.42916,-0.25516,0.1009,0.041618,-0.14579,0.15174,-0.54301,-0.29787,0.36268,0.8955,0.65319,0.40141,0.03668,-0.34313,-0.10204,...,0.65085,0.97366,0.36997,-0.58266,0.58389,-0.62574,-0.24252,1.375,-0.042651,0.16398,-0.53462,0.55275,-0.58019,-0.78386,-0.18787,-0.20305,0.11506,-0.089296,-0.76608,0.04339,0.50251,0.73799,0.23388,0.20038,-0.93906,-0.33974,-0.56534,-0.95945,-0.14597,-0.35173,-0.40463,-0.32671,0.24982,-0.27804,-0.99877,-0.39367,-0.30087,-0.24623,0.006483,-0.21982


#### train model

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

In [None]:
# Split the combined biomed frame into target and features; the token text,
# the target and the corpus label are not used as features.
Y_train = train_df_combined['complexity']
X_train = train_df_combined.drop(['token', 'complexity', 'corpus'], axis=1)

In [None]:
# Fit a fresh linear regression on the biomed training features
# (this rebinds `lr`, replacing the previously fitted model).
lr = LinearRegression()
lr.fit(X_train, Y_train)

In [None]:
# predict on the training rows themselves (in-sample predictions)
Y_pred = lr.predict(X_train)

In [None]:
# train loss (mean absolute error)
# Compare prediction i with target i by position. Y_train comes from a frame
# that went through dropna(), so its index labels are not guaranteed to be
# 0..n-1; a label lookup such as Y_train[i] could raise KeyError or read a
# different row than the one prediction i was made for.
losses = np.abs(Y_pred - Y_train.to_numpy())
num = len(losses)
abl = losses.sum() / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.0859900500618725


#### test

In [None]:
# on test
# Drop rows with a missing token once, on the filtered frame, so that the
# token list and the complexity list below always come from the same rows.
# (Dropping NaNs from the token column alone would leave the complexity list
# one entry longer for every missing token.)
biomed_test_rows = test[test['corpus'] == 'biomed'].dropna(subset=['token'])
test_tokens = [token.lower() for token in biomed_test_rows['token']]  # lowercase
print(len(test_tokens))

# create a dataframe for linear regression
test_df = pd.DataFrame(test_tokens, columns=['token'])

test_df['complexity'] = biomed_test_rows['complexity'].to_list()

# handcrafted features: word length and word frequency
test_df['word_length'] = test_df['token'].map(len)
# craft_counter is indexed with the lowercased token
test_df['word_frequency'] = test_df['token'].map(lambda x: craft_counter[x])

289


In [None]:
# Build the embedding weight matrix for the biomed test tokens; the printed
# shape shows one row per token.
weight_matrix = create_weights_matrix(test_tokens)
print(weight_matrix.shape)

# Place the embedding columns next to the handcrafted features, row by row.
weight_matrix_df = pd.DataFrame(data=weight_matrix)
test_df_combined = pd.concat([test_df, weight_matrix_df], axis='columns')
test_df_combined.head()

(289, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,role,0.0,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
1,role,0.203125,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
2,role,0.205882,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
3,role,0.233333,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
4,role,0.234375,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077


In [None]:
# create the weight matrix
# NOTE(review): this cell repeats the preceding cell line for line. It rebuilds
# weight_matrix and test_df_combined from the same test_tokens and test_df,
# so it is a candidate for removal.
weight_matrix = create_weights_matrix(test_tokens)
print(weight_matrix.shape)

# combine: embedding columns are appended to the right of test_df
weight_matrix_df = pd.DataFrame(weight_matrix)
test_df_combined = pd.concat([test_df, weight_matrix_df], axis=1)
test_df_combined.head()

(289, 100)


Unnamed: 0,token,complexity,word_length,word_frequency,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,role,0.0,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
1,role,0.203125,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
2,role,0.205882,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
3,role,0.233333,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077
4,role,0.234375,4,557,0.22823,-0.30561,0.23573,0.23894,0.32533,-0.32454,-0.50807,-0.58703,-0.087776,-0.07267,-0.56668,-0.74275,0.014825,0.51961,-0.23518,-0.1145,0.46887,-0.13751,0.14364,0.69593,-0.12991,-0.52152,-0.057358,-0.10104,0.23093,0.0277,-0.052399,-0.40272,0.079033,0.76705,-0.24874,0.36638,0.21313,-0.26697,-0.10206,-0.33825,...,-0.30744,0.71101,0.036471,0.085221,1.182,-0.37167,0.22308,0.34762,0.20448,0.74082,0.56632,-0.82579,-0.38462,-0.31381,-0.006504,-0.68809,-0.59355,-0.128,-0.2531,-0.050959,-0.063659,0.22352,0.44452,0.3194,-1.809,0.10573,-0.1557,0.23939,-0.38085,-1.2818,-0.48694,-0.090461,0.085937,0.36047,0.22642,0.097908,-0.037121,-0.46676,0.35424,0.59077


In [None]:
# get data for test: everything except the token text and the target is a feature
X_test = test_df_combined.drop(columns=['token', 'complexity'])
Y_test = test_df_combined['complexity']

# predict with the model fitted on the training split
Y_pred = lr.predict(X_test)

# test loss (mean absolute error)
# Compare by position rather than by Series label (Y_test[i] is a label
# lookup), and let NumPy do the element-wise work instead of a Python loop.
losses = np.abs(Y_pred - Y_test.to_numpy())
num = len(losses)
abl = losses.sum() / num
print("average test absolute loss is " + str(abl))

average test absolute loss is 0.08692959286863015


### Bible + Biomedical

In [None]:
# Recap of the bible handcrafted-feature frame before it is merged with the biomed one.
bible_train_df.head()

Unnamed: 0,token,complexity,corpus,word_length,word_frequency
0,river,0.0,bible,5,154
1,brothers,0.0,bible,8,622
2,brothers,0.05,bible,8,622
3,brothers,0.15,bible,8,622
4,brothers,0.263889,bible,8,622


In [None]:
# (rows, columns) of the bible feature frame.
bible_train_df.shape

(2574, 5)

In [None]:
# Recap of the biomed handcrafted-feature frame before it is merged with the bible one.
biomed_train_df.head()

Unnamed: 0,token,complexity,corpus,word_length,word_frequency
0,fact,0.0,biomed,4,111
1,fact,0.183333,biomed,4,111
2,fact,0.3,biomed,4,111
3,studies,0.0,biomed,7,499
4,studies,0.125,biomed,7,499


In [None]:
# (rows, columns) of the biomed feature frame.
biomed_train_df.shape

(2573, 5)

In [None]:
# Stack the bible rows on top of the biomed rows and renumber the result
# 0..n-1 so the two source indexes do not produce duplicate labels.
combined_train_df = pd.concat([bible_train_df, biomed_train_df], ignore_index=True)

In [None]:
# Confirm which corpora are present, then show five rows starting at
# position 2572, where the bible rows end and the biomed rows begin.
print(np.unique(combined_train_df['corpus']))
combined_train_df.iloc[2572:2577]

['bible' 'biomed']


Unnamed: 0,token,complexity,corpus,word_length,word_frequency
2572,agee,0.675,bible,4,1
2573,tou,0.825,bible,3,2
2574,fact,0.0,biomed,4,111
2575,fact,0.183333,biomed,4,111
2576,fact,0.3,biomed,4,111


In [None]:
# (rows, columns) of the merged bible + biomed frame.
combined_train_df.shape

(5147, 5)

In [None]:
# Target and features for the merged bible + biomed frame. Dropping the token
# text, the target and the corpus label leaves the two handcrafted features
# (word_length and word_frequency).
Y_train = combined_train_df['complexity']
X_train = combined_train_df.drop(['token', 'complexity', 'corpus'], axis=1)

# Fit the regression, then predict on the same rows it was trained on.
lr = LinearRegression()
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_train)

In [None]:
# train loss (mean absolute error)
# Compare prediction i with target i by position rather than by Series label
# (Y_train[i] is a label lookup), and let NumPy do the element-wise work
# instead of a Python loop.
losses = np.abs(Y_pred - Y_train.to_numpy())
num = len(losses)
abl = losses.sum() / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.10980264244641504


## 2.2 Linear regression with ELMo word embedding

### 2.2.1 Only ELMo vectors

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression

#### bible text

In [None]:
# Number of entries in elmo_embedded_word (paired with bible_df rows in the next cells).
len(elmo_embedded_word)

2574

In [None]:
# Keep as many leading bible rows as there are ELMo entries.
elmo_bible_df = bible_df.head(len(elmo_embedded_word))

In [None]:
# Wrap the ELMo entries in a DataFrame with a default 0..n-1 index
# (the preview below shows one row per entry and one column per vector component).
elmo_embedded_df = pd.DataFrame(elmo_embedded_word)

In [None]:
# Preview the first ELMo rows.
elmo_embedded_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.380389,-0.176156,-0.314544,-0.501822,-0.090818,0.105634,-0.251023,-0.066272,0.243031,-0.761172,-0.291393,0.064379,-0.453321,-0.423277,0.816208,-0.353879,0.408419,0.122125,0.496743,0.353603,-0.351386,-0.922782,-0.209045,-0.359489,0.711408,-0.171895,-0.333597,0.793782,-0.901847,0.509671,0.247244,0.280223,0.268059,-0.539889,-0.103087,0.291161,0.992129,0.264218,-0.22455,0.470073,...,0.257131,-1.014234,0.116352,0.884805,0.771222,-0.382885,-0.389935,-0.064094,0.188469,-0.25247,-0.016002,-0.283068,-0.100671,0.204859,0.546275,0.513434,0.094389,0.732664,-0.115683,0.189742,0.107119,0.150636,-0.038369,-0.243636,-0.724234,-0.076422,0.054119,0.1134,0.372316,0.30305,-0.009343,0.357153,0.287225,0.254209,-0.150873,-0.407083,-0.230057,0.821129,0.205928,0.648317
1,0.080406,-0.40823,0.158734,-0.079608,0.255208,-0.493177,0.061747,0.495774,0.45792,0.361132,-0.256219,0.261946,0.350431,0.18085,0.36866,0.22267,0.221042,0.00736,0.40547,-0.55929,-0.000918,-0.280063,-0.229276,0.055031,0.096481,-0.122666,0.458498,0.230201,-0.23404,0.323799,-0.577974,-0.23127,0.405233,-0.492054,-0.262013,-0.562029,-0.754784,0.114774,-0.30307,0.00418,...,0.921801,0.091345,0.728693,1.069491,-0.089532,-0.126689,-0.05313,0.416477,0.294666,0.006618,0.328412,-0.557811,0.253492,-0.354477,0.3329,0.875495,-1.043192,1.28843,-0.056559,0.187961,0.337483,-0.090545,0.276176,-0.393746,-0.89701,0.273594,0.059733,0.252737,0.207929,-0.025753,0.021567,1.168533,0.562797,-0.284247,-0.291426,0.656827,0.652902,-0.680608,0.406645,-0.072077
2,0.122995,-0.255338,-0.08461,-0.139912,0.065407,0.552848,-0.104535,0.109842,0.90361,0.451307,-0.275471,0.095294,0.099199,-0.228622,0.185442,0.079517,0.633629,-0.238946,0.044195,-0.7192,0.28101,-0.201918,-0.225704,-0.157153,-0.377018,-0.045188,0.433793,0.688715,-0.552029,0.3779,-0.080042,-0.302301,-0.597616,-0.443838,-0.02264,-0.56867,-0.807511,0.083843,0.715118,0.486972,...,0.996324,-0.098625,0.389538,1.594374,-0.013069,-0.453419,-0.605789,-0.17398,0.04237,-0.014586,0.30277,-0.32325,0.146018,-0.50915,0.194316,0.767718,-0.213173,0.564365,-0.042883,0.149758,0.152589,-0.061916,0.154927,-0.081459,-0.231792,-0.260478,0.171444,0.504945,0.628526,-0.431658,0.408604,1.460399,0.662861,-0.136868,-0.080981,0.412476,0.408365,-0.432834,0.264197,-0.065345
3,-0.233317,0.075169,0.101757,-0.255629,0.177177,0.213724,-0.04996,0.879802,0.228995,0.333588,-0.258793,0.213695,0.08974,-0.462861,0.45921,0.342702,0.284974,-0.314406,0.142398,-0.881451,0.01107,-0.360885,-0.327134,-0.118879,0.195584,-0.184106,0.189582,-0.031402,-0.119927,0.90529,-0.042242,-0.078218,0.774182,-0.607643,0.057612,-0.598457,-1.372679,0.186787,0.108968,0.402878,...,0.939276,0.299659,0.282591,1.413206,-0.172405,0.39143,-0.400962,-0.049452,-0.075253,-0.030729,0.538451,-0.235912,0.223105,-0.138607,-0.543708,0.693104,-0.676663,0.191055,0.044298,0.051567,0.243266,0.239512,0.250765,-0.325058,-0.579753,-0.275222,0.146736,0.051135,0.469655,-0.196534,-0.15126,1.477482,0.264394,0.07645,-0.087113,0.556019,0.54103,-0.189591,0.507491,0.203926
4,0.045292,-0.214803,0.358676,-0.601393,-0.048681,0.365268,-0.550482,0.514303,0.559916,0.367665,-0.120503,0.459812,0.799864,0.429512,0.348144,0.705791,-0.132481,-0.198132,0.894733,-0.93105,0.008356,-0.012287,0.185829,-0.012477,-0.029897,-0.106955,0.921338,0.438324,-0.385043,0.21003,-0.644655,-0.322157,0.171714,-0.011876,-0.427351,-0.764196,-0.182957,0.413742,-0.472179,0.365791,...,0.603304,-0.00274,-0.036127,1.148385,-0.468836,-0.008,-0.349761,0.062257,0.036226,0.254457,0.52191,-0.143204,0.411426,-0.176269,0.035457,0.314054,-0.594687,0.450245,0.020033,-0.200571,0.566393,0.07958,0.120403,-0.056354,-0.183325,-0.331903,-0.132743,0.091287,0.731343,-0.135945,-0.014588,1.263425,0.616776,-0.136729,-0.131905,0.233556,0.458229,-0.104712,0.049442,0.220563


In [None]:
# Pair each bible row with the ELMo vector at the same position.
# pd.concat(axis=1) aligns rows on index labels, and elmo_embedded_df carries
# a default 0..n-1 index, so the bible rows are renumbered first. Without that,
# any label mismatch would produce NaN-padded rows that dropna() then removes
# silently.
elmo_bible_df_combined = pd.concat(
    [elmo_bible_df.reset_index(drop=True), elmo_embedded_df], axis=1
).dropna()
elmo_bible_df_combined.head(2)

Unnamed: 0,id,corpus,sentence,token,complexity,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,"Behold, there came up out of the river seven c...",river,0.0,0.380389,-0.176156,-0.314543,-0.501822,-0.090818,0.105634,-0.251023,-0.066272,0.243032,-0.761172,-0.291393,0.064379,-0.453321,-0.423276,0.816208,-0.353879,0.408419,0.122126,0.496743,0.353603,-0.351386,-0.922783,-0.209045,-0.359489,0.711408,-0.171895,-0.333597,0.793782,-0.901847,0.509671,0.247244,0.280223,0.268059,-0.539888,-0.103087,...,0.257131,-1.014234,0.116351,0.884804,0.771221,-0.382885,-0.389934,-0.064094,0.188469,-0.25247,-0.016003,-0.283068,-0.100671,0.20486,0.546275,0.513434,0.09439,0.732664,-0.115683,0.189742,0.107118,0.150636,-0.038369,-0.243636,-0.724234,-0.076423,0.054118,0.113401,0.372316,0.30305,-0.009343,0.357153,0.287225,0.254209,-0.150872,-0.407083,-0.230057,0.821129,0.205928,0.648316
1,34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,I am a fellow bondservant with you and with yo...,brothers,0.0,0.080406,-0.40823,0.158734,-0.079608,0.255208,-0.493177,0.061747,0.495774,0.45792,0.361132,-0.256218,0.261946,0.350431,0.18085,0.36866,0.22267,0.221042,0.00736,0.40547,-0.55929,-0.000919,-0.280063,-0.229275,0.055031,0.096481,-0.122666,0.458498,0.230201,-0.23404,0.323799,-0.577974,-0.23127,0.405233,-0.492053,-0.262013,...,0.9218,0.091345,0.728693,1.06949,-0.089532,-0.126689,-0.05313,0.416477,0.294667,0.006618,0.328412,-0.557811,0.253493,-0.354477,0.3329,0.875495,-1.043192,1.28843,-0.056559,0.187961,0.337483,-0.090544,0.276176,-0.393747,-0.89701,0.273594,0.059733,0.252738,0.207929,-0.025753,0.021567,1.168533,0.562797,-0.284247,-0.291426,0.656828,0.652902,-0.680609,0.406645,-0.072077


In [None]:
# Target and features for the ELMo-only model: the metadata columns and the
# target are removed, leaving the embedding columns as features.
Y_elmo_train = elmo_bible_df_combined['complexity']
X_elmo_train = elmo_bible_df_combined.drop(
    ['token', 'complexity', 'id', 'corpus', 'sentence'], axis=1
)

In [None]:
# Fit a fresh linear regression on the ELMo features
# (this rebinds `lr`, replacing the previously fitted model).
lr = LinearRegression()
lr.fit(X_elmo_train, Y_elmo_train)

In [None]:
# predict on the training rows themselves (in-sample predictions)
Y_elmo_pred = lr.predict(X_elmo_train)

In [None]:
# train loss (mean absolute error)
# Compare prediction i with target i by position. Y_elmo_train comes from a
# frame that went through dropna(), so its index labels are not guaranteed to
# be 0..n-1; a label lookup such as Y_elmo_train[i] could raise KeyError or
# read a different row than the one prediction i was made for.
losses = np.abs(Y_elmo_pred - Y_elmo_train.to_numpy())
num = len(losses)
abl = losses.sum() / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.056465208341030346


#### biomed text

In [None]:
# Number of entries in elmo_embedded_word_biomed.
len(elmo_embedded_word_biomed)

2573

In [None]:
# Wrap the biomed ELMo entries in a DataFrame with a default 0..n-1 index.
# NOTE: this rebinds elmo_embedded_df, replacing the bible embeddings frame
# built earlier under the same name.
elmo_embedded_df = pd.DataFrame(elmo_embedded_word_biomed)

In [None]:
# Preview the first biomed ELMo rows.
elmo_embedded_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.249135,0.301341,-0.119462,-0.165009,0.765487,-0.436227,-0.187499,0.598034,-0.107632,-0.548476,0.071216,-0.454115,-0.609256,-0.085315,0.182125,-0.360382,0.698889,0.21102,-0.414465,-0.339724,0.216048,0.611558,-0.655546,-0.334238,0.11225,-0.070516,-0.37981,0.159027,0.294165,0.093665,-0.516576,0.843412,0.548272,-0.200983,0.401843,0.739335,-0.079382,0.734073,-0.02183,-0.03033,...,0.390382,0.592235,-0.066926,0.538355,0.38122,-0.546264,0.30621,0.22983,-0.221477,0.215869,0.324879,0.364913,-0.818323,0.337191,-0.130576,-0.005982,0.360649,1.190377,0.520165,0.218456,0.609946,0.143497,-0.234697,-0.198283,-0.364801,0.765781,0.027709,0.063312,-0.490597,-0.790916,-0.658159,0.794876,-0.42629,1.299343,0.077469,-0.107005,-0.084834,0.234907,1.108247,-0.141695
1,-0.209395,0.092374,0.18803,-0.053685,0.300444,-0.475233,-0.219341,0.540716,0.276497,-0.421695,0.145761,-0.7052,-0.501235,-0.324497,0.389686,-0.093564,0.395981,0.266107,-0.826801,-0.431247,0.094979,0.878984,-0.713791,-0.245768,0.403367,-0.041332,-0.610263,0.084179,0.171739,-0.154766,-0.35777,0.581577,0.435827,-0.165338,0.004696,0.310048,-0.540961,0.197793,-0.293152,0.349397,...,-0.179397,0.746669,-0.32432,0.515122,0.349148,-0.157825,0.394441,0.255929,-0.234665,0.576432,0.764406,0.184289,-0.237181,0.455036,-0.640105,0.035559,0.359653,0.576041,0.696428,-0.036704,0.844,-0.05186,0.155767,-0.79037,0.041583,0.184688,0.426555,-0.024136,-0.722083,-0.552633,-0.687676,0.288225,-0.189121,0.906465,-0.286121,-0.304948,-0.017349,0.464363,1.395325,-0.543135
2,0.004268,0.106336,0.048399,0.09116,0.428347,-0.130098,-0.662575,0.080266,0.01332,-0.349151,-0.134293,-0.499444,-0.373022,-0.811964,-0.25442,0.090195,0.32011,0.13574,-0.848244,-0.681392,-0.260061,0.644433,-0.526644,-0.276316,0.478502,0.2212,-0.310365,-0.00754,-0.148946,-0.085337,-0.695665,0.242075,-0.028677,-0.011638,0.34732,0.68978,-0.489443,0.415778,0.085379,0.386972,...,0.265192,0.929116,0.162578,0.648021,0.628686,0.09863,0.378123,-0.118605,-0.252177,0.242246,0.54555,0.231111,-0.393072,-0.108139,-0.670283,0.05348,0.705718,0.645969,0.059854,0.36426,1.073266,0.035162,0.209238,-1.030465,-0.330945,0.363856,0.255278,0.448532,-0.205063,-0.269521,-0.961838,0.316978,-0.526094,1.129726,-0.595307,0.273254,0.385535,0.363886,0.670912,-0.700466
3,-0.656302,0.294128,0.483853,0.401751,-0.017931,-0.106219,-0.064058,0.625132,-0.006444,0.457555,-0.329192,-0.139087,-0.697591,-0.711996,0.221232,-0.50977,-0.311725,0.334013,-0.235573,-0.170434,0.836873,0.91888,-0.396359,0.968326,0.680407,0.614958,0.13593,-0.394977,1.163188,-0.541172,-0.261355,0.158786,-0.659484,0.473833,0.066817,0.283244,-0.087377,-0.322293,-0.820412,0.095642,...,0.161497,0.293411,-0.426758,0.782205,-0.023183,-0.085585,-0.077754,0.476447,-0.159901,-0.076207,-0.052923,0.14996,0.480628,0.55474,-0.301679,0.906053,0.465168,0.47159,0.410943,0.167679,0.718602,0.274036,0.332569,-0.51233,-0.346158,0.078268,0.103304,0.375284,-0.148222,-0.561776,0.786104,0.464891,-0.311528,0.515165,0.535838,0.665578,-0.036052,0.755543,0.615258,0.281403
4,-1.044316,0.130674,0.871212,0.20748,0.321069,-0.033286,-0.572216,0.117487,-0.314405,0.011628,-0.222316,-0.077684,-0.467279,-0.092483,0.093128,-0.861192,-0.306695,0.181685,-0.081562,0.389218,0.826945,0.818893,-0.056679,0.960478,0.137524,0.562749,-0.403851,-0.603186,1.020413,-0.204632,-0.139348,0.357224,-0.469534,0.380403,-0.096406,0.105757,0.119609,-0.044899,-0.747118,-0.5373,...,-0.110889,0.397599,0.063661,0.916914,0.264422,0.219644,-0.084242,0.115164,0.138273,0.31708,-0.477551,0.427475,0.095759,0.134601,-0.591363,0.853671,0.034934,0.134267,0.194319,0.278496,1.238392,0.046799,0.306551,-0.498437,-0.21375,0.166719,0.00737,0.783071,-0.748166,-0.305757,0.309987,0.624318,0.240341,1.563969,0.193844,1.239616,1.327217,-0.131656,0.766625,-0.220302


In [None]:
# Renumber the biomed rows 0..n-1 (the old index is discarded) so they line up
# by index with elmo_embedded_df in the column-wise concat that follows.
biomed_df_rest = biomed_df.reset_index(drop=True)
biomed_df_rest.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,37ZQELHEQ0YDPGBEJ63D4HNT5SBNMJ,biomed,"In fact, this situation gave an opportunity to...",fact,0.0
1,3XUSYT70IT170QDU572CAF4MOM1D0B,biomed,It can be inferred from this fact that Nrl is ...,fact,0.183333
2,34R3P23QHS1HKWJHKAEN8VSOHJ9WH5,biomed,"The site of mutation is of interest, particula...",fact,0.3
3,3L21G7IH47WA5QT3XMTQ15XXB1L1YG,biomed,This model reflects many other observed change...,studies,0.0
4,3ZXNP4Z39RL4GD163NL987ME58H7LR,biomed,Several studies have been carried out to detec...,studies,0.125


In [None]:
# Put labels and embeddings side by side (rows are matched on index), then drop
# every row that contains a NaN in either half.
# NOTE(review): the pairing is purely positional. If the embedding step skipped
# any sentence, later rows would be paired with the wrong embedding and dropna()
# would only trim the unmatched tail -- confirm that
# len(biomed_df_rest) == len(elmo_embedded_df).
elmo_biomed_df_combined = pd.concat([biomed_df_rest, elmo_embedded_df], axis=1).dropna()
elmo_biomed_df_combined.head(2)

Unnamed: 0,id,corpus,sentence,token,complexity,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,37ZQELHEQ0YDPGBEJ63D4HNT5SBNMJ,biomed,"In fact, this situation gave an opportunity to...",fact,0.0,-0.249135,0.301341,-0.119462,-0.165009,0.765487,-0.436227,-0.187499,0.598034,-0.107632,-0.548476,0.071216,-0.454115,-0.609256,-0.085315,0.182125,-0.360382,0.698889,0.21102,-0.414465,-0.339724,0.216048,0.611558,-0.655546,-0.334238,0.11225,-0.070516,-0.37981,0.159027,0.294165,0.093665,-0.516576,0.843412,0.548272,-0.200983,0.401843,...,0.390382,0.592235,-0.066926,0.538355,0.38122,-0.546264,0.30621,0.22983,-0.221477,0.215869,0.324879,0.364913,-0.818323,0.337191,-0.130576,-0.005982,0.360649,1.190377,0.520165,0.218456,0.609946,0.143497,-0.234697,-0.198283,-0.364801,0.765781,0.027709,0.063312,-0.490597,-0.790916,-0.658159,0.794876,-0.42629,1.299343,0.077469,-0.107005,-0.084834,0.234907,1.108247,-0.141695
1,3XUSYT70IT170QDU572CAF4MOM1D0B,biomed,It can be inferred from this fact that Nrl is ...,fact,0.183333,-0.209395,0.092374,0.18803,-0.053685,0.300444,-0.475233,-0.219341,0.540716,0.276497,-0.421695,0.145761,-0.7052,-0.501235,-0.324497,0.389686,-0.093564,0.395981,0.266107,-0.826801,-0.431247,0.094979,0.878984,-0.713791,-0.245768,0.403367,-0.041332,-0.610263,0.084179,0.171739,-0.154766,-0.35777,0.581577,0.435827,-0.165338,0.004696,...,-0.179397,0.746669,-0.32432,0.515122,0.349148,-0.157825,0.394441,0.255929,-0.234665,0.576432,0.764406,0.184289,-0.237181,0.455036,-0.640105,0.035559,0.359653,0.576041,0.696428,-0.036704,0.844,-0.05186,0.155767,-0.79037,0.041583,0.184688,0.426555,-0.024136,-0.722083,-0.552633,-0.687676,0.288225,-0.189121,0.906465,-0.286121,-0.304948,-0.017349,0.464363,1.395325,-0.543135


In [None]:
# Features: everything except the metadata columns and the target.
X_elmo_train = elmo_biomed_df_combined.drop(
    ['token', 'complexity', 'id', 'corpus', 'sentence'],
    axis=1,
)
# Target: the annotated complexity score.
Y_elmo_train = elmo_biomed_df_combined.loc[:, 'complexity']

In [None]:
# Fit a linear regression on the biomed ELMo features.
lr = LinearRegression()
lr.fit(X_elmo_train, Y_elmo_train)
# In-sample predictions, used for the training loss.
Y_elmo_pred = lr.predict(X_elmo_train)

In [None]:
# train loss (average absolute loss)
# The targets are converted to a plain array so they are compared with the
# predictions by position. Y_elmo_train still carries the row labels left over
# from dropna(), so a label lookup such as Y_elmo_train[i] raises KeyError or
# returns a different row as soon as a dropped row leaves a gap in the index.
num = len(Y_elmo_pred)
losses = abs(Y_elmo_pred - Y_elmo_train.to_numpy())
abl = losses.sum() / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.055058606649159264


#### europarl text

In [None]:
# Show how many europarl embedding entries were produced (quick size check).
len(elmo_embedded_word_europarl)

2512

In [None]:
# Same preparation as for biomed, now for europarl. Note that elmo_embedded_df
# is reassigned here and no longer holds the biomed embeddings.
elmo_embedded_df = pd.DataFrame(elmo_embedded_word_europarl)
# Renumber the europarl rows 0..n-1 so they match the embedding frame's index.
europarl_df_rest = europarl_df.reset_index(drop=True)
europarl_df_rest.head()

Unnamed: 0,id,corpus,sentence,token,complexity
0,3Y40HMYLL1I1EIURUEH8TTVLKTKUX0,europarl,Despite the fact that the Treaty does not requ...,fact,0.15625
1,3OZ4VAIBEXF0WDE2I0CCY6PPN3VVJL,europarl,The average consumption in the EU fluctuates b...,fact,0.236842
2,3NFWQRSHVEE19E2BAFM5J7UN7HQFGD,europarl,The main Charlemagne Prize was presented on 13...,days,0.111111
3,3TZ0XG8CBUKDFP5GOVAPHYREGZ298H,europarl,"Commissioner, ladies and gentlemen, we have al...",days,0.116667
4,3M7OI89LVYOS99TV70NIZAWVGPFC6F,europarl,(For the outcome and other details of the vote...,details,0.075


In [None]:
# Put labels and embeddings side by side (rows are matched on index), then drop
# every row that contains a NaN in either half.
# NOTE(review): the pairing is purely positional -- confirm that
# len(europarl_df_rest) == len(elmo_embedded_df), otherwise rows after a skipped
# sentence would carry the wrong embedding.
elmo_europarl_df_combined = pd.concat([europarl_df_rest, elmo_embedded_df], axis=1).dropna()
elmo_europarl_df_combined.head(2)

Unnamed: 0,id,corpus,sentence,token,complexity,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,3Y40HMYLL1I1EIURUEH8TTVLKTKUX0,europarl,Despite the fact that the Treaty does not requ...,fact,0.15625,-0.062483,-0.055144,-0.155203,-0.49194,0.352888,-0.19298,-0.412096,0.403305,0.068579,-0.313935,0.181159,-0.509768,0.177966,-0.466925,-0.029146,-0.960633,0.098824,0.099316,-0.595174,-0.087753,-0.102547,0.324549,-0.649567,-0.214384,0.195159,-0.588359,-0.495798,0.537245,-0.063481,-0.023324,-0.114683,0.415884,-0.199562,0.04679,0.562504,...,-0.186078,0.61234,0.225529,0.678157,0.745884,-1.030736,-0.209031,0.160173,-0.074211,0.097675,0.326623,0.251966,-0.35753,-0.27805,-0.130093,0.008479,0.678406,0.874198,0.522959,0.211588,0.576135,0.158939,-0.411495,-0.067965,-0.255885,0.502913,0.262348,0.30404,-0.364055,-0.362189,-0.669834,0.353208,-0.099285,0.63463,-0.39545,-0.092322,0.057856,0.352509,0.653409,-0.288506
1,3OZ4VAIBEXF0WDE2I0CCY6PPN3VVJL,europarl,The average consumption in the EU fluctuates b...,fact,0.236842,0.124128,0.185605,-0.219204,-0.208332,0.641745,-0.635547,-0.469293,0.677127,-0.250691,-0.275064,0.049173,-0.269244,-0.245794,-0.567822,0.422416,-0.92095,0.26698,-0.360288,-0.430102,0.095514,-0.007045,0.403574,-1.095425,-0.288699,-0.167566,-0.206041,-0.666917,0.326973,0.012166,-0.489464,-0.036835,0.45317,-0.533364,-0.336073,0.472662,...,-0.054559,-0.172528,-0.229225,0.381155,0.041883,-0.178965,0.34174,-0.033911,0.075517,0.581826,0.506699,0.176598,-0.284019,-0.126847,-0.566817,0.023886,0.827396,0.285697,0.230525,0.358416,0.145088,-0.003469,0.171849,-0.792913,-0.755383,0.041791,0.06798,0.252083,0.078681,-0.064716,-0.886729,0.17335,-0.408417,0.933623,-0.292979,-0.229235,0.007479,-0.014983,0.274173,-0.120015


In [None]:
# Features: everything except the metadata columns and the target.
X_elmo_train = elmo_europarl_df_combined.drop(
    ['token', 'complexity', 'id', 'corpus', 'sentence'],
    axis=1,
)
# Target: the annotated complexity score.
Y_elmo_train = elmo_europarl_df_combined.loc[:, 'complexity']

In [None]:
# Fit a linear regression on the europarl ELMo features.
lr = LinearRegression()
lr.fit(X_elmo_train, Y_elmo_train)
# In-sample predictions, used for the training loss.
Y_elmo_pred = lr.predict(X_elmo_train)

In [None]:
# train loss (average absolute loss)
# The targets are converted to a plain array so they are compared with the
# predictions by position. Y_elmo_train still carries the row labels left over
# from dropna(), so a label lookup such as Y_elmo_train[i] raises KeyError or
# returns a different row as soon as a dropped row leaves a gap in the index.
num = len(Y_elmo_pred)
losses = abs(Y_elmo_pred - Y_elmo_train.to_numpy())
abl = losses.sum() / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.04489585089343337


#### biomed + bible + europarl text

In [None]:
# Stack the three per-corpus frames row-wise and renumber the rows 0..n-1 so the
# pooled frame has a gap-free index.
elmo_combined_df = pd.concat([elmo_biomed_df_combined, elmo_bible_df_combined, elmo_europarl_df_combined]).reset_index(drop=True)

In [None]:
# Split the pooled frame into embedding features and the complexity target.
X_elmo_train = elmo_combined_df.drop(
    ['token', 'complexity', 'id', 'corpus', 'sentence'],
    axis=1,
)
Y_elmo_train = elmo_combined_df.loc[:, 'complexity']
# Fit on all three corpora together.
lr = LinearRegression()
lr.fit(X_elmo_train, Y_elmo_train)
# In-sample predictions, used for the training loss.
Y_elmo_pred = lr.predict(X_elmo_train)

In [None]:
# Training loss of the pooled model: mean of |prediction - target| over all rows.
num = len(Y_elmo_pred)
losses = [abs(Y_elmo_pred[i] - Y_elmo_train[i]) for i in range(num)]
abl = sum(losses) / num
print("average training absolute loss is " + str(abl))

average training absolute loss is 0.0678142040275123


#### biomed + bible + europarl test

In [None]:
# List the distinct corpus names present in the test split.
np.unique(test['corpus'])

array(['bible', 'biomed', 'europarl'], dtype=object)

In [None]:
# on test
# Discard test rows that contain any missing value. The surviving rows keep
# their original row labels (the index is not reset here).
test_df = test.dropna()
test_df.head(3)

Unnamed: 0,id,corpus,sentence,token,complexity
0,3K8CQCU3KE19US5SN890DFPK3SANWR,bible,"But he, beckoning to them with his hand to be ...",hand,0.0
1,3Q2T3FD0ON86LCI41NJYV3PN0BW3MV,bible,"If I forget you, Jerusalem, let my right hand ...",hand,0.197368
2,3ULIZ0H1VA5C32JJMKOTQ8Z4GUS51B,bible,"the ten sons of Haman the son of Hammedatha, t...",hand,0.2


In [None]:
# Run the ELMo helper over the test rows with batch_size=250.
# NOTE(review): compute_elmo is defined elsewhere in the notebook. Presumably the
# second return value records where each target token sits in its sentence --
# confirm. The log prints "COULD NOT FIND WORD!!" for some rows; verify how
# those rows are represented in the returned values.
test_elmo_vectors, test_index_token = compute_elmo(test_df, batch_size=250)

-------------
26
Go yourselves, get straw where you can find it, for nothing of your work shall be diminished.'
['go', 'get', 'straw', 'find', 'nothing', 'work', 'shall', 'diminished']
yourselves
COULD NOT FIND WORD!!
-------------
27
Separate yourselves from among this congregation, that I may consume them in a moment!
['separate', 'among', 'congregation', 'may', 'consume', 'moment']
yourselves
COULD NOT FIND WORD!!
-------------
28
Neither present your members to sin as instruments of unrighteousness, but present yourselves to God, as alive from the dead, and your members as instruments of righteousness to God.
['neither', 'present', 'members', 'sin', 'instruments', 'unrighteousness', 'present', 'god', 'alive', 'dead', 'members', 'instruments', 'righteousness', 'god']
yourselves
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
250
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
500
-------------
68
A number of target gene candidates emerged upon a transcriptome analysis of mock- and cadmium-treated Mtf1 conditional knockout mice and control littermates and several of these were confirmed by semiquantitative RT–PCR.
['number', 'target', 'gene', 'candidates', 'emerged', 'upon', 'transcriptome', 'analysis', 'mock', 'cadmiumtreated', 'conditional', 'knockout', 'mice', 'control', 'littermates', 'several', 'confirmed', 'semiquantitative']
RT
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...
750
-------------
163
Section V - Court of Auditors (SEC(2002) 405 - C5-0245/2002 - 2002/2105(DEC))
['section', 'v', 'court', 'auditors']
DEC
COULD NOT FIND WORD!!
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Run session...


In [None]:
# Turn the ELMo output and token positions into the per-row embedding collection.
# NOTE(review): presumably one vector per target word, judging by the helper's
# name and the 1024-column frame built from the result -- confirm against
# word_embedding_vectors.
test_elmo_embedded_word = word_embedding_vectors(test_elmo_vectors, test_index_token)

In [None]:
# Wrap the test embeddings in a DataFrame (rows numbered from 0) for joining
# with the labelled test rows.
test_elmo_embedded_df = pd.DataFrame(test_elmo_embedded_word)

In [None]:
# Preview the first three test embedding rows.
test_elmo_embedded_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.076672,0.251269,0.248113,-0.127476,0.913393,0.010417,0.02101,0.411688,0.30292,-0.295847,-0.555969,0.724456,0.144696,0.280112,-0.077631,-0.125358,0.665099,0.34296,-0.735821,-0.109446,0.098682,-0.469227,-0.055438,0.003875,0.231853,0.603377,0.493155,-0.009787,-0.425775,-0.257469,-0.534945,1.361402,-0.795937,-0.521291,-0.472753,1.325098,-0.328976,-0.329317,0.071158,-0.333782,...,-0.167519,0.269576,0.117109,0.516681,-0.330714,-0.231196,-0.223731,-0.287138,-0.133118,0.415258,-0.387935,-0.203408,0.457442,-0.120529,0.380631,0.519042,-0.516842,0.864598,0.103869,0.028991,0.264386,-0.402229,-0.157448,-0.265398,-0.005568,-0.241656,0.017691,0.225574,-0.431134,-0.067932,0.436614,0.033881,-0.114014,-0.387926,0.383397,0.301548,0.170494,-0.441685,-0.303746,0.344226
1,0.186335,0.261939,0.165064,0.17973,0.636397,0.110275,-0.351911,0.191896,-0.068916,-0.445865,-0.274317,0.389419,0.110434,0.351981,0.186977,0.31902,1.183099,-0.233907,-0.440433,-0.142376,0.11677,-1.337285,-0.299207,0.182616,0.408157,-0.19785,0.468526,0.095231,-0.311969,0.405481,0.100751,0.266695,-0.339333,-0.061059,-0.376933,0.450449,0.16995,-0.181369,0.249988,-0.193027,...,0.059452,0.285154,0.063436,1.162927,-0.837117,-0.231743,-0.131206,-0.448204,-0.266107,0.112765,-0.537655,0.243628,-0.080022,-0.357984,-0.06287,0.026959,-0.225966,1.003935,-0.423095,0.429774,-0.103236,-0.606424,-0.152112,-0.216584,0.111076,-0.447714,0.198011,0.199941,-0.228155,-0.466801,-0.267862,0.069625,-0.561907,-0.33497,0.777656,0.084498,0.572794,-1.140565,0.218136,0.45798
2,-0.431774,0.333561,-0.189042,-0.787152,0.95764,0.266613,-0.380373,-0.030173,-0.110427,-0.376553,-0.151679,0.44071,-0.071627,0.060964,0.141926,0.416507,1.050251,-0.169958,-0.485965,-0.191544,0.282509,-0.499898,-0.220933,0.222027,0.30791,-0.734785,0.181991,0.948376,-0.520131,0.072885,-0.113285,1.58095,-1.315244,-0.079169,-0.329803,1.084099,0.03403,-0.28933,0.74916,-0.167066,...,-0.088724,0.314198,0.331289,0.670614,-0.663307,-0.067224,-0.420759,-0.395921,-0.506717,0.428003,-0.351647,-0.078777,0.231768,-0.148233,0.04763,0.004854,-0.13808,0.690775,-0.283167,-0.005564,-0.028439,-0.613986,-0.082469,-0.486614,-0.146418,-0.348809,0.043005,0.060377,-0.160934,-0.330459,-0.306722,0.233601,-0.064477,0.069866,0.339952,0.256648,0.485293,-0.139413,0.137243,0.101267


In [None]:
# Attach the embeddings to the test rows. test_df keeps its original row labels
# after dropna(), while test_elmo_embedded_df is numbered 0..n-1. The test rows
# are therefore renumbered first (as is done for the training corpora), so the
# index-aligned concat pairs each row with its own embedding instead of
# shifting or dropping rows wherever the labels have a gap.
elmo_test_df_combined = pd.concat(
    [test_df.reset_index(drop=True), test_elmo_embedded_df],
    axis=1,
).dropna()
elmo_test_df_combined.head(2)

Unnamed: 0,id,corpus,sentence,token,complexity,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,3K8CQCU3KE19US5SN890DFPK3SANWR,bible,"But he, beckoning to them with his hand to be ...",hand,0.0,-0.076672,0.251269,0.248113,-0.127476,0.913393,0.010417,0.02101,0.411688,0.30292,-0.295847,-0.555969,0.724456,0.144696,0.280112,-0.077631,-0.125358,0.665099,0.34296,-0.735821,-0.109446,0.098682,-0.469227,-0.055438,0.003875,0.231853,0.603377,0.493155,-0.009787,-0.425775,-0.257469,-0.534945,1.361402,-0.795937,-0.521291,-0.472753,...,-0.167519,0.269576,0.117109,0.516681,-0.330714,-0.231196,-0.223731,-0.287138,-0.133118,0.415258,-0.387935,-0.203408,0.457442,-0.120529,0.380631,0.519042,-0.516842,0.864598,0.103869,0.028991,0.264386,-0.402229,-0.157448,-0.265398,-0.005568,-0.241656,0.017691,0.225574,-0.431134,-0.067932,0.436614,0.033881,-0.114014,-0.387926,0.383397,0.301548,0.170494,-0.441685,-0.303746,0.344226
1,3Q2T3FD0ON86LCI41NJYV3PN0BW3MV,bible,"If I forget you, Jerusalem, let my right hand ...",hand,0.197368,0.186335,0.261939,0.165064,0.17973,0.636397,0.110275,-0.351911,0.191896,-0.068916,-0.445865,-0.274317,0.389419,0.110434,0.351981,0.186977,0.31902,1.183099,-0.233907,-0.440433,-0.142376,0.11677,-1.337285,-0.299207,0.182616,0.408157,-0.19785,0.468526,0.095231,-0.311969,0.405481,0.100751,0.266695,-0.339333,-0.061059,-0.376933,...,0.059452,0.285154,0.063436,1.162927,-0.837117,-0.231743,-0.131206,-0.448204,-0.266107,0.112765,-0.537655,0.243628,-0.080022,-0.357984,-0.06287,0.026959,-0.225966,1.003935,-0.423095,0.429774,-0.103236,-0.606424,-0.152112,-0.216584,0.111076,-0.447714,0.198011,0.199941,-0.228155,-0.466801,-0.267862,0.069625,-0.561907,-0.33497,0.777656,0.084498,0.572794,-1.140565,0.218136,0.45798


In [None]:
# predict
# Features and targets are both taken from elmo_test_df_combined so they stay
# row-aligned. Y_elmo_test is assigned here, next to X_elmo_test, so the cells
# below do not depend on a value left in the kernel by some other cell. It is
# stored as a plain array so that Y_elmo_test[i] is a positional lookup.
X_elmo_test = elmo_test_df_combined.drop(columns=['token', 'complexity', 'id', 'corpus', 'sentence'])
Y_elmo_test = elmo_test_df_combined['complexity'].to_numpy()
Y_elmo_pred = lr.predict(X_elmo_test)

In [None]:
# Sanity check: the number of predictions should equal the number of targets.
print(len(Y_elmo_pred))
print(len(Y_elmo_test))

917
917


In [None]:
# test loss (average absolute loss)
# The targets go through np.asarray so they are compared with the predictions
# by position. Indexing a pandas Series as Y_elmo_test[i] looks rows up by
# label, which raises KeyError or returns a different row whenever the index
# is not exactly 0..n-1.
num = len(Y_elmo_pred)
losses = abs(Y_elmo_pred - np.asarray(Y_elmo_test))
abl = losses.sum() / num
print("average test absolute loss is " + str(abl))

average test absolute loss is 0.0805725685296934
