## Importing the Data

In [1]:
import os
import pandas as pd
import numpy as np

import torch
from transformers import BertModel, BertTokenizer

In [2]:
# Constants: where the input data lives and where artefacts are saved.
DATASET_DIR = './data/'
SAVE_DIR = './'

# Read the tab-separated training set, discard every column that contains a
# missing value, then discard the two per-rater score columns.
X = (
    pd.read_csv(
        os.path.join(DATASET_DIR, 'training_set_rel.tsv'),
        sep='\t',
        encoding='ISO-8859-1',
    )
    .dropna(axis=1)
    .drop(columns=['rater1_domain1', 'rater2_domain1'])
)

In [3]:
# Preview the first rows of the loaded frame.
X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


Minimum and Maximum Scores for each essay set.

In [4]:
# Per-essay-set score bounds; both arrays are indexed by the `essay_set` value.
# Index 0 holds a -1 placeholder, presumably because essay sets are numbered
# from 1 (TODO confirm).
minimum_scores = np.array([-1, 2, 1, 0, 0, 0, 0, 0, 0])
maximum_scores = np.array([-1, 12, 6, 3, 3, 4, 4, 30, 60])

In [5]:
# Look up each essay's score bounds from the essay set it belongs to.
old_min = minimum_scores[X['essay_set']]
old_max = maximum_scores[X['essay_set']]
old_range = old_max - old_min

# Target interval for the rescaled score.
new_min, new_max = 0, 1
new_range = new_max - new_min

# Linearly map domain1_score from [old_min, old_max] onto [new_min, new_max].
X['score'] = (X['domain1_score'] - old_min) * new_range / old_range + new_min

# Round score to the nearest integer for the cohen kappa calculation.
# NOTE(review): for scores inside their bounds `score` lies in [0, 1], so this
# rounding produces only 0.0 or 1.0 -- confirm that is what is intended.
y = X['score'].round()

X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,score
0,1,1,"Dear local newspaper, I think effects computer...",8,0.6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,0.7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,0.5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,0.8
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,0.6


## Preprocessing the Data

We will preprocess all essays and convert them to feature vectors so that they can be fed into the RNN.

These are all helper functions used to clean the essays.

In [6]:
import nltk

# Stopword lists: read by essay_to_wordlist() through nltk.corpus.stopwords.
nltk.download('stopwords')
# Punkt: provides the sentence tokenizer that essay_to_sentences() loads.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tmax\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tmax\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [43]:
import numpy as np
import re
from nltk.corpus import stopwords

# Stopword sets already read from the NLTK corpus, keyed by language name.
_STOPWORDS_BY_LANGUAGE = {}


def _cached_stopwords(language="english"):
    """Return the NLTK stopword set for `language`, reading the corpus only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def essay_to_wordlist(essay_v, remove_stopwords):
    """Lower-case a piece of text and split it into purely alphabetic words.

    Every character that is not an ASCII letter is replaced by a space before
    splitting, so digits and punctuation disappear. Anonymisation tags are
    reduced to their letters rather than removed (e.g. "@CAPS1" becomes "caps").

    Args:
        essay_v: The text to tokenize.
        remove_stopwords: When truthy, drop NLTK English stopwords.

    Returns:
        A list of lower-case word strings (possibly empty).
    """
    words = re.sub("[^a-zA-Z]", " ", essay_v).lower().split()
    if remove_stopwords:
        # This function is called once per sentence; the cached set avoids
        # re-reading the stopword corpus on every call.
        stops = _cached_stopwords()
        words = [w for w in words if w not in stops]
    return words

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and word-tokenize each of them.

    The essay is stripped, cut into sentences with the NLTK Punkt English
    tokenizer, and every sentence is passed through essay_to_wordlist().
    Sentences that end up with no words are left out.

    Args:
        essay_v: The raw essay text.
        remove_stopwords: Forwarded unchanged to essay_to_wordlist().

    Returns:
        A list of word lists, one per non-empty sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []
    for raw_sentence in punkt.tokenize(essay_v.strip()):
        if not raw_sentence:
            continue
        word_list = essay_to_wordlist(raw_sentence, remove_stopwords)
        if word_list:
            sentences.append(word_list)
    return sentences

def makeFeatureVec(words, model, num_features):
    """Average the embedding vectors of the words that `model` knows.

    Args:
        words: Iterable of word strings.
        model: Mapping-like object supporting `word in model` and `model[word]`,
            where each value is a vector of length `num_features`.
        num_features: Length of the embedding vectors.

    Returns:
        The mean of the vectors of all in-vocabulary words. If no word is in
        the vocabulary (including an empty `words`), a zero vector of length
        `num_features` is returned.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0.
    for word in words:
        if word in model:
            num_words += 1
            featureVec = np.add(featureVec, model[word])
    # Without this guard a 0/0 division would fill the vector with NaN, which
    # would then propagate into every downstream feature matrix.
    if num_words > 0:
        featureVec = np.divide(featureVec, num_words)
    return featureVec

def makeFeatureVec2(words, model, num_features):
    """Sum the embeddings of known words and divide by the total word count.

    The divisor is len(words), so words missing from `model` still count
    towards the denominator. An empty `words` yields a zero vector.

    Args:
        words: Sequence of word strings (must support len()).
        model: Mapping-like object supporting `word in model` and `model[word]`.
        num_features: Length of the embedding vectors.

    Returns:
        A vector of length `num_features`.
    """
    total = np.zeros((num_features,), dtype="float32")
    for vector in (model[w] for w in words if w in model):
        total = np.add(total, vector)
    word_count = len(words)
    if word_count == 0:
        return total
    return np.divide(total, float(word_count))

def makeFeatureVec3(words, model, num_features):
    """Collect the embedding of every word that `model` knows, in order.

    Args:
        words: Iterable of word strings.
        model: Mapping-like object supporting `word in model` and `model[word]`.
        num_features: Not used by this function.

    Returns:
        A list with one float32 array per in-vocabulary word; words missing
        from `model` are skipped.
    """
    return [np.array(model[word], dtype="float32") for word in words if word in model]

def getAvgFeatureVecs(essays, model, num_features):
    """Build one averaged feature vector per essay.

    Args:
        essays: Sequence of word lists, one per essay (must support len()).
        model: Embedding lookup passed through to makeFeatureVec().
        num_features: Length of the embedding vectors.

    Returns:
        A float32 array of shape (len(essays), num_features) whose row i is
        makeFeatureVec(essays[i], model, num_features).
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay, model, num_features)
    return essayFeatureVecs

def getAvgFeatureVecs2(essay, model, num_features):
    """Build one feature vector per sentence of a single essay.

    Args:
        essay: Sequence of sentences, each a list of words (must support len()).
        model: Embedding lookup passed through to makeFeatureVec2().
        num_features: Length of the embedding vectors.

    Returns:
        A float32 array of shape (len(essay), num_features) whose row i is
        makeFeatureVec2(essay[i], model, num_features).
    """
    sentence_matrix = np.zeros((len(essay), num_features), dtype="float32")
    row = 0
    for sentence in essay:
        sentence_matrix[row] = makeFeatureVec2(sentence, model, num_features)
        row += 1
    return sentence_matrix

In [8]:
# Name of the pretrained BERT checkpoint shared by the tokenizer and the encoder.
pretrained_model_name = 'bert-base-uncased'

# Load the matching tokenizer and model for that checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
model = BertModel.from_pretrained(pretrained_model_name)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [101]:
# Shape of the first essay's embedding array (the dangling "." was a syntax error).
tokenized_essay[0].shape

(16, 768)

In [99]:
from sklearn.model_selection import KFold
from keras.preprocessing.sequence import pad_sequences

cv = KFold(n_splits = 5, shuffle = True)

essay_data = X['essay']
is_remove_stopwords = True
tokenized_essay = []
sent_max_len = 200
# Longest sentence (in words) seen so far. It must exist before the loop
# compares against it; otherwise the first comparison raises NameError.
len_max = 0
for ix, essay in enumerate(essay_data):
    sentences = essay_to_sentences(essay, remove_stopwords=is_remove_stopwords)
    tokenized_sentences = []
    for iy, sentence in enumerate(sentences):
        len_max = max(len_max, len(sentence))
        # NOTE(review): `sentence` is a list of words, not a string -- confirm
        # that encode() handles a word list the way this pipeline expects.
        tokenized_sentence = np.array(tokenizer.encode(sentence, add_special_tokens=False))
        tokenized_sentences.append(tokenized_sentence)
    # Pad (or truncate) every sentence of this essay to sent_max_len ids.
    padded_tokenized_sentences = torch.tensor(pad_sequences(tokenized_sentences, maxlen=sent_max_len, padding='post'))
    # Mask is 1 wherever the id is non-zero and 0 elsewhere (i.e. on padding).
    attention_mask_sentences = torch.tensor(np.where(padded_tokenized_sentences != 0, 1, 0))

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        last_hidden_states_train = model(padded_tokenized_sentences, attention_mask=attention_mask_sentences)
    # Keep the hidden state at position 0 of every sentence.
    # NOTE(review): with add_special_tokens=False, position 0 is the first
    # encoded token rather than a [CLS] token -- confirm this is intended.
    embedded_features = last_hidden_states_train[0][:, 0, :].numpy()
    tokenized_essay.append(embedded_features)
print(len_max)

0
1
2
3
4
5
6
7


KeyboardInterrupt: 

In [94]:
# NOTE(review): `attention_mask_essay` is not assigned in any cell visible in
# this notebook, so this cell only works with leftover kernel state.
attention_mask_essay[0]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [91]:
# Scratch call of the BERT model on the first essay.
# NOTE(review): `attention_mask_essay` is not assigned in any visible cell and
# the recorded run of this cell ended in a TypeError -- confirm whether this
# cell is still needed.
last_hidden_states_train = model(tokenized_essay[0], attention_mask=attention_mask_essay[0])
train_features = last_hidden_states_train[0][:, 0, :].numpy()

TypeError: 'int' object is not callable

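Next we prepare the data for training: the dataset is split into folds and every essay is converted into a padded sequence of sentence vectors.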

We will use 5-Fold Cross Validation and measure the Quadratic Weighted Kappa for each fold.
We will then calculate Average Kappa for all the folds.

In [109]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True)
num_features = 200
# Every essay is padded or truncated to this many sentence vectors.
max_sentences = 128


def _essays_to_padded_vecs(essays, model, num_features, maxlen):
    """Convert essays into one padded float32 array of sentence vectors.

    Each essay is split into sentences (stopwords removed), each sentence is
    turned into one vector by getAvgFeatureVecs2(), and every essay's sequence
    of sentence vectors is padded/truncated at the front to `maxlen` rows.
    """
    essay_vecs = []
    for essay in essays:
        sentences = essay_to_sentences(essay, remove_stopwords=True)
        essay_vecs.append(np.array(getAvgFeatureVecs2(sentences, model, num_features), dtype="float32"))
    padded = pad_sequences(essay_vecs, maxlen=maxlen, padding='pre', dtype='float')
    return np.array(padded, dtype="float32")


# One entry per fold in each of these lists.
trainData_sent = []
testData_sent = []
y_trainData_sent = []
y_testData_sent = []
for traincv, testcv in cv.split(X):
    print('##Fold Started')
    X_train, X_test = X.iloc[traincv], X.iloc[testcv]
    y_train, y_test = y.iloc[traincv], y.iloc[testcv]

    y_trainData_sent.append(y_train)
    y_testData_sent.append(y_test)

    train_essays = X_train['essay']
    test_essays = X_test['essay']

    # NOTE(review): `embedding_dict` is not assigned in any cell visible in this
    # notebook -- it must be loaded before this cell runs.
    # Training and test essays go through exactly the same pipeline.
    trainDataVecs = _essays_to_padded_vecs(train_essays, embedding_dict, num_features, max_sentences)
    testDataVecs = _essays_to_padded_vecs(test_essays, embedding_dict, num_features, max_sentences)

    trainData_sent.append(trainDataVecs)
    testData_sent.append(testDataVecs)
    print(len(trainDataVecs))
    print(len(testDataVecs))

##Fold Started
10380
2596
##Fold Started
10381
2595
##Fold Started
10381
2595
##Fold Started
10381
2595
##Fold Started
10381
2595


In [110]:
# Display the essay column held in `tttt` (assigned in a cell further down this notebook).
tttt

0        Dear local newspaper, I think effects computer...
1        Dear @CAPS1 @CAPS2, I believe that using compu...
2        Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...
3        Dear Local Newspaper, @CAPS1 I have found that...
4        Dear @LOCATION1, I know having computers has a...
                               ...                        
12971     In most stories mothers and daughters are eit...
12972     I never understood the meaning laughter is th...
12973    When you laugh, is @CAPS5 out of habit, or is ...
12974                                   Trippin' on fen...
12975     Many people believe that laughter can improve...
Name: essay, Length: 12976, dtype: object

## Defining the model 

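Here we define 2-layer recurrent models: one LSTM variant and two GRU variants.

Note that instead of using sigmoid activation in the output layer we will use
ReLU. The original rationale was that the training labels are not normalised; note, however, that the `score` column built earlier in this notebook is rescaled to [0, 1], so this choice should be revisited if those labels are used.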

In [111]:
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_model():
    """Build, compile and summarise the two-layer LSTM regressor.

    Stack: LSTM(200, returning sequences) -> LSTM(64) -> Dropout(0.5) ->
    Dense(1, relu), with input_shape [1, 200]. Compiled with mean squared
    error loss, the rmsprop optimizer and MAE as a metric.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    net = Sequential([
        LSTM(200, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 200], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ])
    net.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    net.summary()
    return net

def get_sentence_model():
    """Build, compile and summarise the two-layer GRU regressor for sentence input.

    Stack: GRU(128, returning sequences) -> GRU(64) -> Dropout(0.5) ->
    Dense(1, relu), with input_shape [128, 200]. Compiled with mean squared
    error loss, the adam optimizer and MAE as a metric.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    net = Sequential([
        GRU(128, dropout=0.4, recurrent_dropout=0.4, input_shape=[128, 200], return_sequences=True),
        GRU(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ])
    net.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    net.summary()
    return net

def get_word_model():
    """Build, compile and summarise the two-layer GRU regressor for word input.

    Stack: GRU(128, returning sequences) -> GRU(64) -> Dropout(0.5) ->
    Dense(1, relu), with input_shape [512, 200]. Compiled with mean squared
    error loss, the rmsprop optimizer and MAE as a metric.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    net = Sequential([
        GRU(128, dropout=0.4, recurrent_dropout=0.4, input_shape=[512, 200], return_sequences=True),
        GRU(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ])
    net.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    net.summary()
    return net

In [36]:
# Shape of the first element of the first entry in trainData_sent.
trainData_sent[0][0].shape

(128, 200)

In [85]:
# Sanity check over every entry of trainData_sent: for each element, count the
# rows whose mean is exactly 0 and print the indices of any element that has
# fewer than 32 such rows. A progress line is printed every 2000 elements.
for ix, dx in enumerate(trainData_sent):
    for iy, dy in enumerate(dx):
        if iy % 2000 == 0:
            print('check: ', ix, iy)
        cnt = sum(1 for dz in dy if dz.mean() == 0)
        if cnt < 32:
            print(ix, iy)

check:  0 0
check:  0 2000
check:  0 4000
check:  0 6000
check:  0 8000
check:  0 10000
check:  1 0
check:  1 2000
check:  1 4000
check:  1 6000
check:  1 8000
check:  1 10000
check:  2 0
check:  2 2000
check:  2 4000
check:  2 6000
check:  2 8000
check:  2 10000
check:  3 0
check:  3 2000
check:  3 4000
check:  3 6000
check:  3 8000
check:  3 10000
check:  4 0
check:  4 2000
check:  4 4000
check:  4 6000
check:  4 8000
check:  4 10000


In [93]:
# Scan every essay for the largest sentence count and remember the position
# of the first essay that reaches it.
tttt = X['essay']
tttt_max, tttt_ix = 0, 0
for ix, essay in enumerate(tttt):
    if ix % 1000 == 0:
        print('check: ', ix)
    # Sentence-tokenize the essay with stopwords removed.
    sentences = essay_to_sentences(essay, remove_stopwords = True)
    n_sentences = len(sentences)
    if n_sentences > tttt_max:
        tttt_max, tttt_ix = n_sentences, ix

check:  0
check:  1000
check:  2000
check:  3000
check:  4000
check:  5000
check:  6000
check:  7000
check:  8000
check:  9000
check:  10000
check:  11000
check:  12000


In [94]:
# Largest sentence count found and the position of the essay that has it.
print(tttt_max, tttt_ix)

96 12436


## Training Phase - Sentence

Sentence level: this section only runs the model (preprocessing was done above).

In [None]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

# fit() below is given an `early_stopping` callback, so it has to be created
# here for the cell to run on a fresh kernel. No validation data is passed to
# fit(), hence the training loss is monitored.
# NOTE(review): patience=5 is a placeholder -- confirm the intended settings.
early_stopping = EarlyStopping(monitor='loss', patience=5)

sentence_models = []
sentence_results = []
# One model per prepared fold.
for cnt in range(len(trainData_sent)):

    print("\n--------Fold {}--------\n".format(cnt))
    sentence_model = get_sentence_model()
    sentence_model.fit(trainData_sent[cnt], y_trainData_sent[cnt], batch_size=64, epochs=50, callbacks=[early_stopping])

    # Scale predictions by 100 and round them so they become discrete labels.
    y_sent_pred = sentence_model.predict(testData_sent[cnt]) * 100
    y_sent_pred2 = np.round(y_sent_pred)

    # Evaluate the fold with the quadratic weighted kappa.
    # NOTE(review): the targets are scaled by 100 to match the predictions --
    # confirm the label scale against the cell that builds `y`.
    sentence_result = cohen_kappa_score(y_testData_sent[cnt].values * 100, y_sent_pred2, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(sentence_result))
    sentence_results.append(sentence_result)
    sentence_models.append(sentence_model)

mean_kappa = np.round(np.array(sentence_results).mean(), decimals=4)
print("Average Kappa score after a 5-fold cross validation: ", mean_kappa)

# Persist the best fold's model only when the average kappa clears 0.75.
if mean_kappa > 0.75:
    sentence_models[sentence_results.index(max(sentence_results))].save('./final_gru_sent.h5')


--------Fold 0--------

Model: "sequential_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_62 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_63 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_39 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50