## Importing the Data

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 18.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 49.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 50.6MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [2]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import (TensorDataset, DataLoader, SequentialSampler)
from transformers import BertModel, BertTokenizer

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Constants
DATASET_DIR = '/content/drive/MyDrive/ColabNotebooks/AES-for-Korean/data'
SAVE_DIR = '/content/drive/MyDrive/ColabNotebooks/AES-for-Korean/data'

X = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel.tsv'), sep='\t', encoding='ISO-8859-1')
X = X.dropna(axis=1)
X = X.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [None]:
X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


Minimum and Maximum Scores for each essay set.

In [None]:
minimum_scores = np.array([-1, 2, 1, 0, 0, 0, 0, 0, 0])
maximum_scores = np.array([-1, 12, 6, 3, 3, 4, 4, 30, 60])

In [None]:
old_min = minimum_scores[X['essay_set']]
old_max = maximum_scores[X['essay_set']]
old_range = old_max - old_min
new_min = 0
new_max = 1
new_range = (new_max - new_min)  
X['score'] = (((X['domain1_score'] - old_min) * new_range) / old_range) + new_min

# round score to nearest integer for cohen kappa calculation
y = X['score']

X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,score
0,1,1,"Dear local newspaper, I think effects computer...",8,0.6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,0.7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,0.5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,0.8
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,0.6


## Preprocessing the Data

We will preprocess all essays and convert them to feature vectors so that they can be fed into the RNN.

These are all helper functions used to clean the essays.

In [None]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords

def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            tokenized_sentences = essay_to_wordlist(raw_sentence, remove_stopwords)
            if tokenized_sentences:
                sentences.append(tokenized_sentences)
    return sentences

def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    for word in words:
        if word in model:
            num_words += 1
            featureVec = np.add(featureVec, model[word])       
    featureVec = np.divide(featureVec,num_words)
    return featureVec

def makeFeatureVec2(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    for word in words:
        if word in model:
            featureVec = np.add(featureVec, model[word])
    if len(words) != 0:
        featureVec = np.divide(featureVec,float(len(words)))
    return featureVec

def makeFeatureVec3(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = []
    for word in words:
        if word in model:
            featureVec.append(np.array(model[word], dtype="float32"))
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for glove model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

def getAvgFeatureVecs2(essay, model, num_features):
    """Main function to generate the word vectors for glove model."""
    essayFeatureVecs = np.zeros((len(essay),num_features),dtype="float32")
    for cnt, sentence in enumerate(essay):
        essayFeatureVecs[cnt] = makeFeatureVec2(sentence, model, num_features)
    return essayFeatureVecs

In [None]:
pretrained_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
model = BertModel.from_pretrained(pretrained_model_name).cuda()
#model.train()

model.eval()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
from sklearn.model_selection import KFold
from keras.preprocessing.sequence import pad_sequences

essay_data = X['essay']
is_remove_stopwords = False
tokenized_essay = []
padded_tokenized_essay = []
attention_mask_essay = []
sent_max_len = 200
for ix, essay in enumerate(essay_data):
  if ix % 1000 == 0:
    print(ix)
  sentences = essay_to_sentences(essay, remove_stopwords=is_remove_stopwords)
  tokenized_sentences = []
  for iy, sentence in enumerate(sentences):
      tokenized_sentence = np.array(tokenizer.encode(sentence, add_special_tokens=True))
      tokenized_sentences.append(tokenized_sentence)
  padded_tokenized_sentences = pad_sequences(tokenized_sentences, maxlen=sent_max_len, padding='post')
  attention_mask_sentences = np.where(padded_tokenized_sentences != 0, 1, 0)
  #padded_tokenized_sentences = torch.tensor(pad_sequences(tokenized_sentences, maxlen=sent_max_len, padding='post'))
  #attention_mask_sentences = torch.tensor(np.where(padded_tokenized_sentences != 0, 1, 0))
  padded_tokenized_essay.append(padded_tokenized_sentences)
  attention_mask_essay.append(attention_mask_sentences)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000


In [None]:
tttt_essays = []
tttt = 0
max_tttt = 0
for essay in essay_data:
  tttt_tmp = essay_to_sentences(essay, remove_stopwords=False)
  tttt_essays.append(tttt_tmp)
  tttt += len(tttt_tmp)
  if len(tttt_tmp) > max_tttt:
    max_tttt = len(tttt_tmp)

In [None]:
padded_tokenized_essay[0].shape

(16, 200)

In [None]:
print(max_tttt)
print(X['essay'][0])
print(tttt_essays[0])

96
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econ

# BERT Embedding

You need to execute this section only once.

In [None]:
import csv
ff = open(os.path.join(SAVE_DIR, 'embedded_features_special_tokens.csv'), 'a', newline='')
writer_ff = csv.writer(ff)

In [None]:
import psutil

with torch.no_grad():
  print("ix, gpu memory, cpu usage, vm usage, data size ")
  for ix in range(len(padded_tokenized_essay)):
    if ix % 100 == 0:
      print(ix, torch.cuda.memory_allocated(0), psutil.cpu_percent(), psutil.virtual_memory()[2])
    sen = torch.tensor(padded_tokenized_essay[ix]).cuda()
    mask = torch.tensor(attention_mask_essay[ix]).cuda()
    last_hidden_states_train = model(sen, attention_mask=mask)
    embedded_features = last_hidden_states_train[0].detach().cpu()[:, 0, :].numpy()
    # embedded_essay.append(embedded_features)
    for i in embedded_features:
      writer_ff.writerow(i)
    #np.savetxt(os.path.join(SAVE_DIR, 'embedded_features.csv'), embedded_features, delimiter=",")
    del sen, mask, last_hidden_states_train, embedded_features
    torch.cuda.empty_cache()

torch.cuda.empty_cache()

ix, gpu memory, cpu usage, vm usage, data size 
0 439065600 31.3 17.0
100 439065600 53.2 16.9
200 439065600 53.1 16.9
300 439065600 53.2 16.9
400 439065600 53.2 16.9
500 439065600 53.3 16.9
600 439065600 53.1 16.9
700 439065600 53.2 16.9
800 439065600 53.3 16.9
900 439065600 53.1 16.9
1000 439065600 53.2 16.9
1100 439065600 53.2 16.9
1200 439065600 53.1 16.9
1300 439065600 53.2 16.9
1400 439065600 53.2 16.9
1500 439065600 53.2 16.9
1600 439065600 53.3 16.9
1700 439065600 53.2 16.9
1800 439065600 53.2 16.9
1900 439065600 53.2 16.9
2000 439065600 53.3 16.9
2100 439065600 53.3 16.9
2200 439065600 53.2 16.9
2300 439065600 53.3 16.9
2400 439065600 53.3 16.9
2500 439065600 53.4 16.9
2600 439065600 53.3 16.9
2700 439065600 53.4 16.9
2800 439065600 53.4 16.9
2900 439065600 53.4 16.9
3000 439065600 53.3 16.9
3100 439065600 53.1 16.9
3200 439065600 53.3 16.9
3300 439065600 53.2 16.9
3400 439065600 53.3 16.9
3500 439065600 53.2 16.9
3600 439065600 53.4 16.9
3700 439065600 53.4 16.9
3800 439065600

In [None]:
ff.close()

# Load the embedded data

In [None]:
embedded_essay_raw = pd.read_csv(os.path.join(DATASET_DIR, 'embedded_features_special_tokens.csv'), sep=',', encoding='ISO-8859-1')
#embedded_essay_raw = pd.read_csv(os.path.join(DATASET_DIR, 'embedded_features_raw.csv'), sep=',', encoding='ISO-8859-1')
#embedded_essay_raw = pd.read_csv(os.path.join(DATASET_DIR, 'embedded_features.csv'), sep=',', encoding='ISO-8859-1')
embedded_essay = []

In [None]:
embedded_essay_raw.shape

(164770, 768)

In [None]:
from keras.preprocessing.sequence import pad_sequences

tmp_ix = 0
for ix, essay_raw in enumerate(padded_tokenized_essay):
  if ix % 500 == 0:
    print(ix)
  tmp_len = len(essay_raw)
  essay = embedded_essay_raw[tmp_ix:tmp_ix + tmp_len]
  embedded_essay.append(essay)
  tmp_ix += tmp_len

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500


In [None]:
#embedded_essay = pad_sequences(embedded_essay, maxlen=96, padding='pre', dtype='float')

Now we train the model on the dataset.

We will use 5-Fold Cross Validation and measure the Quadratic Weighted Kappa for each fold.
We will then calculate Average Kappa for all the folds.

## Defining the model 

Here we define a 2-Layer LSTM Model. 

Note that instead of using sigmoid activation in the output layer we will use
Relu since we are not normalising training labels.

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(200, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 200], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.summary()

    return model

def get_sentence_model():
    """Define the model."""
    model = Sequential()
    model.add(GRU(128, dropout=0.4, input_shape=[128, 768], return_sequences=True))
    model.add(GRU(64))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.summary()

    return model

def get_sentence_model2():
    """Define the model."""
    model = Sequential()
    model.add(GRU(96, dropout=0.4, input_shape=[128, 768], return_sequences=True))
    model.add(GRU(64))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.summary()

    return model

def get_sentence_model3():
    """Define the model."""
    model = Sequential()
    model.add(GRU(256, dropout=0.4, input_shape=[128, 768], return_sequences=True))
    model.add(GRU(128, dropout=0.4, return_sequences=True))
    model.add(GRU(64))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.summary()

    return model

## Fine-tuning BERT + GRU 2 Layer with PyTorch

In [None]:
import torch.nn as nn
from transformers import AutoConfig, AutoModel

class 


## Freezed Bert + GRU 2 Layer with Keras

In [None]:
#embedded_essay = pad_sequences(embedded_essay, maxlen=96, padding='pre', dtype='float')

import keras
import math

class DataGenerator(keras.utils.Sequence):

  def __init__(self, ids, batch_size=64, shuffle=True):
    self.ids = ids
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.on_epoch_end()

  def on_epoch_end(self):
    # Updates indexes after each epoch
    self.indexes = np.arange(len(self.ids))
    if self.shuffle == True:
      np.random.shuffle(self.indexes)

  def __len__(self):
    return math.ceil(len(self.ids) / self.batch_size)

  def __getitem__(self, index):
    # Generated data containing batch_size samples
    batch_ids = self.ids[index * self.batch_size:(index + 1) * self.batch_size]

    essays = list()
    scores = list()
    for ix in batch_ids:
      essay = embedded_essay[ix]
      score = y[ix]
      essays.append(essay)
      scores.append(score)
    essays = pad_sequences(essays, maxlen=128, padding='pre', dtype='float')

    return np.array(essays), np.array(scores)

In [None]:
y
len(embedded_essay)

12976

In [None]:
embedded_essay[0]

Unnamed: 0,0.03897584,0.1705755,0.16306049,-0.32822582,-0.4931742,-0.5777212,0.51011056,0.8981184,0.15726711,-0.63853836,-0.16852458,-0.2539917,0.0052978797,0.3112666,0.30889437,-0.23946148,-0.5916331,0.5375616,0.10453381,0.25923082,0.040120613,-0.35811937,0.084900536,-0.12979448,0.20781113,-0.24564661,-0.21821417,-0.18564206,-0.10883415,0.19866161,-0.2736097,0.4945069,-0.06111347,0.09564236,0.044967655,-0.49912897,0.44235414,0.23340337,0.18972217,0.2369034,...,0.17444798,-0.7208587,0.2390435,0.5639985,-0.02137434,-0.05141661,-0.030394815,-0.104016826,-0.5468714,-0.16815141,-0.19427754,0.38222885,0.12647282,0.095467836,-0.37149695,0.43112832,0.28849825,0.55694956,0.24611443,-0.053098254,-0.10154303,-0.17519955,0.4287814,0.5200037,-5.403975,0.04553558,-0.12436247,0.0050980854,-0.22526489,-0.3898146,-0.115320064,-0.37885273,0.02988772,-0.12692152,0.37245676,0.12877496,-0.19407533,-0.6573706,0.3568359,0.39976805
0,-0.031,-0.039242,-0.320096,0.006946,-0.196671,-0.396228,0.115823,0.360166,-0.09667,0.039806,0.451213,-0.267765,0.124315,0.249646,-0.367936,-0.043657,-0.022469,0.261491,0.099743,-0.21362,-0.189256,-0.105234,-0.299287,0.109172,-0.152034,-0.175036,0.098903,-0.160676,0.029059,0.270451,0.076392,-0.046378,-0.277001,0.104086,-0.090018,-0.100922,0.13168,0.024087,-0.158221,0.047898,...,-0.010769,0.147479,-0.074286,0.275063,-0.389159,0.159046,-0.019297,-0.175742,0.086575,-0.11255,-0.193133,0.107199,-0.088026,0.029302,0.194172,0.284729,0.118845,0.067868,0.018238,0.029295,-0.110705,-0.286266,-0.202426,-0.104493,-9.110782,-0.043799,-0.111329,0.024805,0.03779,0.062525,0.081043,-0.333182,0.135394,0.122937,-0.028214,-0.039936,-0.21177,-0.09139,0.235144,0.06014
1,0.270068,0.286835,-0.312244,-0.083724,-0.371326,-0.175925,0.310784,0.115957,-0.070045,-0.255145,-0.010835,0.018113,-0.028804,0.281229,0.261835,-0.04765,-0.139911,0.303671,0.40124,0.039095,0.122667,-0.133192,-0.07331,-0.188598,-0.249804,-0.148067,0.042947,-0.225651,0.076616,-0.057281,0.214792,0.259625,-0.402984,0.111978,0.161541,0.090459,0.094468,0.162238,0.264531,0.189335,...,-0.247232,-0.155336,-0.072469,0.505717,-0.052829,0.117284,-0.146075,-0.117058,-0.195858,0.039858,-0.382877,0.371591,0.139386,-0.021546,0.052934,0.125495,0.533277,0.321418,0.190057,-0.311492,-0.189408,-0.094566,0.422168,0.430083,-7.357372,0.060102,-0.136205,-0.376616,-0.04226,-0.169703,0.170489,-0.201935,0.0449,-0.207599,0.126979,0.269769,-0.073095,-0.206223,0.433744,0.234084
2,0.044999,0.103974,-0.273873,0.06271,-0.203671,-0.184251,0.252243,0.580784,-0.227311,-0.122951,0.451432,0.02015,0.049043,0.352111,0.256963,0.119588,-0.190546,0.499676,0.427755,0.083106,-0.256669,-0.257225,-0.117288,0.003841,0.30516,-0.194973,-0.085303,-0.172681,0.266695,-0.238845,-0.007221,0.306875,0.022228,-0.02796,0.263643,-0.086819,0.064743,-0.166808,0.038233,0.162414,...,0.323896,-0.070453,-0.169819,0.171146,0.061148,0.177073,-0.132677,-0.064897,-0.192132,-0.25693,-0.395001,0.397926,0.11996,0.015796,-0.011583,0.267547,0.597411,0.016349,0.398271,0.018641,-0.199599,-0.225537,0.404656,0.197267,-8.405067,0.052658,-0.291597,-0.41422,-0.030366,-0.260031,0.149828,-0.356936,-0.014905,-0.132931,0.180756,0.205888,-0.224388,-0.503448,0.048857,0.244911
3,0.017118,0.121252,-0.644177,-0.120316,-0.136243,-0.494843,0.447419,0.686055,-0.444794,-0.281085,0.09009,-0.23336,-0.042814,0.035806,0.235632,-0.089441,-0.00072,0.561433,0.362729,0.034237,-0.137094,-0.155068,-0.09951,-0.21667,-0.217682,-6.2e-05,-0.171357,-0.095839,0.192478,-0.20474,-0.212839,0.535659,-0.343187,0.064332,0.146969,0.006006,0.191289,0.089183,0.103925,0.000242,...,0.219435,-0.385291,0.14175,0.083287,-0.065311,0.208528,-0.201536,-0.0644,-0.075216,0.059751,-0.390485,0.333034,0.330601,0.071649,-0.061794,0.281069,0.285943,0.1436,0.656836,-0.196885,-0.124114,-0.092115,0.813195,0.426665,-7.306691,-0.044458,-0.576241,-0.317755,-0.150216,-0.361833,0.193215,-0.388127,0.219215,-0.10105,0.335746,0.036126,-0.130592,-0.519933,0.330751,0.133035
4,0.120396,-0.007655,0.141095,-0.126055,-0.614516,-0.575045,0.409321,0.694351,-0.102841,-0.35771,-0.11036,-0.19432,0.1621,0.205439,0.172931,-0.155892,-0.119357,0.824795,0.201161,-0.07072,-0.113191,-0.431878,0.312597,-0.259597,-0.187396,0.089399,-0.034187,-0.286472,0.200937,0.183425,0.052091,-0.007813,0.146523,0.149793,0.259749,-0.357512,0.372107,0.203365,0.46585,0.209063,...,0.081589,-0.391451,0.185456,0.415927,-0.321061,-0.070721,-0.231353,-0.135785,-0.247936,-0.287477,-0.349191,0.20508,0.151861,-0.01755,-0.429742,0.257255,0.040498,0.510252,0.492107,-0.220305,-0.234759,-0.011245,0.192274,0.427723,-6.496952,-0.078097,-0.173186,-0.174259,-0.266402,-0.830452,-0.207477,-0.43018,0.164151,-0.251946,0.283212,0.026835,-0.079621,-0.697007,0.365411,0.88804
5,0.306679,-0.19756,0.229467,-0.058572,-0.187343,-0.733091,0.240553,0.565845,-0.058151,-0.468631,0.368518,-0.013291,-0.050753,0.34404,-0.126581,0.009933,-0.08932,0.0752,0.510305,0.056196,-0.180038,-0.266415,0.195196,0.026772,0.086859,0.084377,-0.09978,0.075754,0.116565,0.040138,-0.176776,-0.008303,-0.035012,-0.190269,0.102449,0.183731,0.00376,0.185053,0.090468,0.258771,...,-0.034923,-0.054713,0.029445,0.137261,0.138442,0.285577,0.108432,-0.076638,-0.372027,0.037054,-0.120139,0.035255,-0.183725,0.10195,-0.261305,0.427368,-0.045248,0.191705,0.011393,-0.193674,0.233372,-0.07769,-0.244265,-0.184768,-6.755476,0.025335,-0.015197,0.288077,0.009024,-0.351086,-0.446926,-0.255326,-0.099235,-0.154914,0.170366,-0.141866,-0.095921,-0.277485,0.227058,0.238263
6,0.332425,0.003333,-0.251256,0.18214,-0.165938,-0.275331,0.182975,0.16079,-0.379758,-0.244285,0.099403,-0.457855,0.076864,0.203388,-0.064996,0.001619,-0.138622,0.447935,0.076835,-0.027965,-0.167361,-0.270469,0.043183,-0.081642,-0.368688,-0.008497,-0.061087,0.11547,-0.040156,0.069567,-0.11265,0.463714,-0.178319,0.366436,0.104355,-0.093582,-0.124677,0.02266,0.122956,-0.043008,...,-0.196851,-0.372139,-0.298602,0.242114,0.014124,0.187351,-0.268485,-0.143034,-0.191448,-0.061732,-0.296879,0.237495,0.08751,-0.045395,0.04647,0.119084,0.546684,0.387078,0.070079,0.007279,-0.383547,0.025572,0.393464,0.459205,-7.115532,0.15511,-0.115857,-0.21886,-0.282082,-0.162134,-0.151747,-0.435132,-0.042474,-0.070441,0.218808,0.030028,-0.018009,-0.170354,0.200916,0.316067
7,0.385488,0.147821,0.381968,0.111567,-0.290401,-0.659185,0.516419,0.889231,0.062068,-0.557178,0.305354,-0.089696,0.333135,0.707216,-0.246121,-0.178713,-0.258887,0.283793,0.259925,-0.131811,0.095022,-0.216171,0.236896,0.100517,-0.190133,-0.105484,0.068192,0.205345,0.022568,-0.028361,0.281271,0.171402,-0.441261,0.366951,-0.015714,-0.135328,-0.213696,0.30499,0.100781,0.035153,...,0.011131,-0.073032,-0.199011,-0.197592,0.012442,0.013998,-0.086391,0.063594,-0.419786,-0.096817,-0.302082,0.328988,0.303028,0.02637,-0.323927,0.52923,-0.222225,0.463107,-0.02035,-0.060802,0.194464,-0.229696,-0.07428,0.245858,-5.102902,-0.032969,0.148119,0.046616,-0.488129,-0.677289,-0.417181,-0.123907,-0.034119,0.002334,0.094393,-0.308248,0.29541,-0.562011,0.64293,0.391632
8,0.082006,-0.050099,0.251327,0.156538,0.01761,-0.185316,0.028851,0.348973,-0.106276,-0.394803,0.258577,-0.054708,0.216633,0.250994,-0.326224,-0.215606,0.003612,0.212238,0.183208,0.040695,0.16062,-0.418663,0.425706,-0.174038,0.089943,-0.082298,0.089739,0.035447,-0.113299,-0.072691,-0.152181,0.208996,0.177419,0.091984,-0.100145,0.052098,-0.045827,0.417051,0.136236,-0.06614,...,0.103624,-0.187602,-0.100075,-0.398236,0.187638,0.138101,-0.039383,-0.050993,-0.278638,-0.121926,-0.021772,0.074883,0.247438,0.312501,-0.069777,0.65576,-0.143605,0.109404,0.192065,-0.032839,0.099515,-0.087534,-0.146372,0.06823,-6.38638,-0.25454,-0.045042,-0.101088,-0.269359,-0.491501,-0.548597,-0.205302,0.059601,0.159329,0.079929,0.084331,-0.014158,-0.301412,0.455928,0.307292
9,0.234583,0.246118,0.052081,-0.023981,-0.382841,-0.771413,0.261784,0.953608,-0.078587,-0.21527,0.46661,0.265769,0.306016,0.110554,0.11679,-0.074714,-0.122824,0.343166,0.486176,-0.084548,0.01645,-0.212998,-0.099676,0.129987,0.453402,-0.083875,-0.261909,0.13157,0.322284,0.086199,-0.280919,0.055981,0.341003,-0.085578,0.383432,-0.493091,0.313817,0.076146,0.087825,0.188018,...,0.336105,-0.182269,0.106508,0.345822,-0.161164,0.03388,-0.209665,-0.211244,-0.627143,-0.413599,-0.361669,0.198772,0.260061,0.287019,-0.037754,0.497421,0.018599,0.160919,0.242704,0.079516,-0.168577,-0.239024,-0.194722,0.290189,-7.629736,-0.020406,0.020378,0.010418,0.191347,-0.701427,-0.479299,0.019886,0.138583,0.001411,0.362004,-0.098332,0.121972,-0.310039,0.310394,0.231359


In [None]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

batch_size = 64

cv = KFold(n_splits=5, shuffle=True)
cnt = 0
for traincv, testcv in cv.split(embedded_essay):
  train_gen = DataGenerator(traincv, batch_size=batch_size)
  test_gen = DataGenerator(testcv, batch_size=batch_size, shuffle=False)
  train_y = y.iloc[traincv]
  test_y = y.iloc[testcv]

  train_steps = len(traincv) // batch_size
  valid_steps = len(testcv) // batch_size

  early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
  sentence_model = get_sentence_model3()
  sentence_model.fit(train_gen, steps_per_epoch=train_steps, validation_steps=valid_steps,
                    epochs=50, callbacks=[early_stopping])

  y_sent_pred = sentence_model.predict(test_gen) *100
  y_sent_pred = np.round(y_sent_pred)

  sentence_result = cohen_kappa_score(np.array(np.round(test_y * 100)), y_sent_pred, weights='quadratic')
  print("Kappa Score", cnt, ": {}".format(sentence_result))
  cnt += 1

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, 128, 256)          787968    
_________________________________________________________________
gru_1 (GRU)                  (None, 128, 128)          148224    
_________________________________________________________________
gru_2 (GRU)                  (None, 64)                37248     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 973,505
Trainable params: 973,505
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch

In [None]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

batch_size = 64

cv = KFold(n_splits=5, shuffle=True)
cnt = 0
for traincv, testcv in cv.split(embedded_essay):
  train_gen = DataGenerator(traincv, batch_size=batch_size)
  test_gen = DataGenerator(testcv, batch_size=batch_size, shuffle=False)
  train_y = y.iloc[traincv]
  test_y = y.iloc[testcv]

  train_steps = len(traincv) // batch_size
  valid_steps = len(testcv) // batch_size

  early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
  sentence_model = get_sentence_model3()
  sentence_model.fit(train_gen, steps_per_epoch=train_steps, validation_steps=valid_steps,
                    epochs=50, callbacks=[early_stopping])

  y_sent_pred = sentence_model.predict(test_gen) *100
  y_sent_pred = np.round(y_sent_pred)

  sentence_result = cohen_kappa_score(np.array(np.round(test_y * 100)), y_sent_pred, weights='quadratic')
  print("Kappa Score", cnt, ": {}".format(sentence_result))
  cnt += 1

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_52 (GRU)                 (None, 128, 256)          787968    
_________________________________________________________________
gru_53 (GRU)                 (None, 128, 128)          148224    
_________________________________________________________________
gru_54 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_26 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 65        
Total params: 973,505
Trainable params: 973,505
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Ep

In [None]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

batch_size = 64

cv = KFold(n_splits=5, shuffle=True)
cnt = 0
for traincv, testcv in cv.split(embedded_essay):
  train_gen = DataGenerator(traincv, batch_size=batch_size)
  test_gen = DataGenerator(testcv, batch_size=batch_size, shuffle=False)
  train_y = y.iloc[traincv]
  test_y = y.iloc[testcv]

  train_steps = len(traincv) // batch_size
  valid_steps = len(testcv) // batch_size

  early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
  sentence_model = get_sentence_model()
  sentence_model.fit(train_gen, steps_per_epoch=train_steps, validation_steps=valid_steps,
                    epochs=50, callbacks=[early_stopping])

  y_sent_pred = sentence_model.predict(test_gen) *100
  y_sent_pred = np.round(y_sent_pred)

  sentence_result = cohen_kappa_score(np.array(np.round(test_y * 100)), y_sent_pred, weights='quadratic')
  print("Kappa Score", cnt, ": {}".format(sentence_result))
  cnt += 1

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_20 (GRU)                 (None, 96, 128)           344832    
_________________________________________________________________
gru_21 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_10 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 65        
Total params: 382,145
Trainable params: 382,145
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Ep

In [None]:
test_y = y.iloc[testcv]
sentence_result = cohen_kappa_score(np.array(np.round(test_y * 100)), y_sent_pred, weights='quadratic')
print("Kappa Score 0", ": {}".format(sentence_result))

Kappa Score 0 : 0.7346137426085717


In [None]:
print(np.array(np.round(test_y * 100))[0:100])
print(y_sent_pred[0:100])

[ 70.  60.  60.  70.  70.  70.  60.  80.  70.  20.  70.  60.  60.  80.
  60.  60.  70.  60.  60. 100.  60.  50.  50.  50.  70.  60.  60. 100.
  80.  60.  70.  80.  70.  80.  60.  60.  80.  60.  30.  70.  80.  60.
  80.  80.  60.  40.  40.  60.  70.  70.  70.  60.  60.  70.  50.  90.
  60.  60.  80.  80.  80.  70.  70.  80.  60.  60.  60.  60.  70.  90.
  60.  50.  40.  80.  60.  40.  60.  40.  80.  60.  60.  80.  70.  70.
  60.  60.  60. 100.  80.  60.  70.  60.  50. 100.  70.  60.  60.  70.
  60.  40.]
[[64.]
 [69.]
 [51.]
 [68.]
 [68.]
 [56.]
 [56.]
 [87.]
 [77.]
 [31.]
 [69.]
 [72.]
 [78.]
 [64.]
 [60.]
 [66.]
 [73.]
 [55.]
 [59.]
 [83.]
 [59.]
 [55.]
 [67.]
 [52.]
 [72.]
 [49.]
 [51.]
 [70.]
 [66.]
 [47.]
 [76.]
 [89.]
 [83.]
 [75.]
 [73.]
 [63.]
 [85.]
 [58.]
 [10.]
 [68.]
 [83.]
 [62.]
 [63.]
 [69.]
 [62.]
 [53.]
 [47.]
 [70.]
 [66.]
 [79.]
 [66.]
 [60.]
 [56.]
 [58.]
 [57.]
 [72.]
 [66.]
 [58.]
 [83.]
 [64.]
 [84.]
 [78.]
 [71.]
 [59.]
 [52.]
 [67.]
 [54.]
 [66.]
 [65.]
 [74.]
 

## Training Phase - Sentence

문장 단위, 모델 돌리는 부분만 (전처리는 위에서)