## Importing the Data

In [1]:
import os
import pandas as pd
import numpy as np

import torch
from transformers import BertModel, BertTokenizer

In [2]:
# Constants
DATASET_DIR = './data/'
SAVE_DIR = './'

X = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel.tsv'), sep='\t', encoding='ISO-8859-1')
X = X.dropna(axis=1)
X = X.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [3]:
X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


Minimum and Maximum Scores for each essay set.

In [4]:
minimum_scores = np.array([-1, 2, 1, 0, 0, 0, 0, 0, 0])
maximum_scores = np.array([-1, 12, 6, 3, 3, 4, 4, 30, 60])

In [5]:
old_min = minimum_scores[X['essay_set']]
old_max = maximum_scores[X['essay_set']]
old_range = old_max - old_min
new_min = 0
new_max = 1
new_range = (new_max - new_min)  
X['score'] = (((X['domain1_score'] - old_min) * new_range) / old_range) + new_min

# round score to nearest integer for cohen kappa calculation
y = np.round(X['score'])

X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,score
0,1,1,"Dear local newspaper, I think effects computer...",8,0.6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,0.7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,0.5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,0.8
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,0.6


## Preprocessing the Data

We will preprocess all essays and convert them to feature vectors so that they can be fed into the RNN.

These are all helper functions used to clean the essays.

In [6]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tmax\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tmax\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [43]:
import numpy as np
import re
from nltk.corpus import stopwords

def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            tokenized_sentences = essay_to_wordlist(raw_sentence, remove_stopwords)
            if tokenized_sentences:
                sentences.append(tokenized_sentences)
    return sentences

def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    for word in words:
        if word in model:
            num_words += 1
            featureVec = np.add(featureVec, model[word])       
    featureVec = np.divide(featureVec,num_words)
    return featureVec

def makeFeatureVec2(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    for word in words:
        if word in model:
            featureVec = np.add(featureVec, model[word])
    if len(words) != 0:
        featureVec = np.divide(featureVec,float(len(words)))
    return featureVec

def makeFeatureVec3(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = []
    for word in words:
        if word in model:
            featureVec.append(np.array(model[word], dtype="float32"))
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for glove model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

def getAvgFeatureVecs2(essay, model, num_features):
    """Main function to generate the word vectors for glove model."""
    essayFeatureVecs = np.zeros((len(essay),num_features),dtype="float32")
    for cnt, sentence in enumerate(essay):
        essayFeatureVecs[cnt] = makeFeatureVec2(sentence, model, num_features)
    return essayFeatureVecs

In [8]:
pretrained_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
model = BertModel.from_pretrained(pretrained_model_name)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [86]:
from sklearn.model_selection import KFold
from keras.preprocessing.sequence import pad_sequences

cv = KFold(n_splits = 5, shuffle = True)

essay_data = X['essay']
is_remove_stopwords = True
tokenized_essay = []
attention_mask_essay = []
sent_max_len = 200
for ix, essay in enumerate(essay_data):
    if ix % 1000 == 0:
        print(ix)
    sentences = essay_to_sentences(essay, remove_stopwords=is_remove_stopwords)
    tokenized_sentences = []
    for iy, sentence in enumerate(sentences):
        if len_max < len(sentence):
            len_max = len(sentence)
        tokenized_sentence = np.array(tokenizer.encode(sentence, add_special_tokens=False), dtype="float")
        tokenized_sentences.append(tokenized_sentence)
    padded_tokenized_sentences = pad_sequences(tokenized_sentences, maxlen=sent_max_len, padding='post', dtype="float")
    attention_mask_sentences = np.where(padded_tokenized_sentences != 0, 1, 0)
    print(torch.tensor(padded_tokenized_sentences))
    tokenized_essay.append(padded_tokenized_sentences)
    attention_mask_essay.append(attention_mask_sentences)
    if ix == 2:
        break
print(len_max)
#sentences = pd.Series(sentences)
#sentences.head()

0
tensor([[6203., 2334., 3780.,  ...,    0.,    0.,    0.],
        [2518.,    0.,    0.,  ...,    0.,    0.,    0.],
        [ 100., 2228.,    0.,  ...,    0.,    0.,    0.],
        ...,
        [2738., 2775., 3274.,  ...,    0.,    0.,    0.],
        [3246., 2584., 2391.,  ...,    0.,    0.,    0.],
        [4067., 5962.,    0.,  ...,    0.,    0.,    0.]], dtype=torch.float64)
tensor([[ 6203.,  9700.,  9700.,  ...,     0.,     0.,     0.],
        [ 2478.,  7588.,  2393.,  ...,     0.,     0.,     0.],
        [ 2036.,  7588.,  5770.,  ...,     0.,     0.,     0.],
        ...,
        [ 3246.,  9491., 19209.,  ...,     0.,     0.,     0.],
        [ 2154.,  3662.,  3566.,  ...,     0.,     0.,     0.],
        [ 2175.,  4965.,  3274.,  ...,     0.,     0.,     0.]],
       dtype=torch.float64)
tensor([[6203., 9700., 9700.,  ...,    0.,    0.,    0.],
        [2490., 9849., 2974.,  ...,    0.,    0.,    0.],
        [2500., 2367., 4784.,  ...,    0.,    0.,    0.],
        ...,
  

In [79]:
np.where(tokenized_essay != 0, 1, 0)

array(1)

In [73]:
torch.tensor(np.where(tokenized_essay != 0, 1, 0))

tensor(1, dtype=torch.int32)

Now we train the model on the dataset.

We will use 5-Fold Cross Validation and measure the Quadratic Weighted Kappa for each fold.
We will then calculate Average Kappa for all the folds.

In [109]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True)
num_features = 200

trainData_sent = []
testData_sent = []
y_trainData_sent = []
y_testData_sent = []
for traincv, testcv in cv.split(X):
    print('##Fold Started')
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    y_trainData_sent.append(y_train)
    y_testData_sent.append(y_test)
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']

    trainDataVecs = []
    testDataVecs = []
    
    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        sentences = essay_to_sentences(essay, remove_stopwords = True)
        trainDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
        trainDataVecs.append(np.array(trainDataVec, dtype="float32"))

    for essay in test_essays:
        # Obtaining all sentences from the training essays.
        sentences = essay_to_sentences(essay, remove_stopwords = True)
        testDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
        testDataVecs.append(np.array(testDataVec, dtype="float32"))
        
    trainDataVecs = pad_sequences(trainDataVecs, maxlen=128, padding='pre', dtype='float')
    testDataVecs = pad_sequences(testDataVecs, maxlen=128, padding='pre', dtype='float')
    trainData_sent.append(np.array(trainDataVecs, dtype="float32"))
    testData_sent.append(np.array(testDataVecs, dtype="float32"))
    print(len(trainDataVecs))
    print(len(testDataVecs))

##Fold Started
10380
2596
##Fold Started
10381
2595
##Fold Started
10381
2595
##Fold Started
10381
2595
##Fold Started
10381
2595


In [110]:
tttt

0        Dear local newspaper, I think effects computer...
1        Dear @CAPS1 @CAPS2, I believe that using compu...
2        Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...
3        Dear Local Newspaper, @CAPS1 I have found that...
4        Dear @LOCATION1, I know having computers has a...
                               ...                        
12971     In most stories mothers and daughters are eit...
12972     I never understood the meaning laughter is th...
12973    When you laugh, is @CAPS5 out of habit, or is ...
12974                                   Trippin' on fen...
12975     Many people believe that laughter can improve...
Name: essay, Length: 12976, dtype: object

## Defining the model 

Here we define a 2-Layer LSTM Model. 

Note that instead of using sigmoid activation in the output layer we will use
Relu since we are not normalising training labels.

In [111]:
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(200, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 200], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

def get_sentence_model():
    """Define the model."""
    model = Sequential()
    model.add(GRU(128, dropout=0.4, recurrent_dropout=0.4, input_shape=[128, 200], return_sequences=True))
    model.add(GRU(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model.summary()

    return model

def get_word_model():
    """Define the model."""
    model = Sequential()
    model.add(GRU(128, dropout=0.4, recurrent_dropout=0.4, input_shape=[512, 200], return_sequences=True))
    model.add(GRU(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [36]:
trainData_sent[0][0].shape

(128, 200)

In [85]:
for ix, dx in enumerate(trainData_sent):
    for iy, dy in enumerate(dx):
        if iy % 2000 == 0:
            print('check: ', ix, iy)
        cnt = 0
        for iz, dz in enumerate(dy):
            if dz.mean() == 0:
                cnt += 1
        if cnt < 32:
            print(ix, iy)

check:  0 0
check:  0 2000
check:  0 4000
check:  0 6000
check:  0 8000
check:  0 10000
check:  1 0
check:  1 2000
check:  1 4000
check:  1 6000
check:  1 8000
check:  1 10000
check:  2 0
check:  2 2000
check:  2 4000
check:  2 6000
check:  2 8000
check:  2 10000
check:  3 0
check:  3 2000
check:  3 4000
check:  3 6000
check:  3 8000
check:  3 10000
check:  4 0
check:  4 2000
check:  4 4000
check:  4 6000
check:  4 8000
check:  4 10000


In [93]:
tttt = X['essay']
tttt_max = 0
tttt_ix = 0
for ix, essay in enumerate(tttt):
    if ix % 1000 == 0:
        print('check: ', ix)
    # Obtaining all sentences from the training essays.
    sentences = essay_to_sentences(essay, remove_stopwords = True)
    if tttt_max < len(sentences):
        tttt_max = len(sentences)
        tttt_ix = ix

check:  0
check:  1000
check:  2000
check:  3000
check:  4000
check:  5000
check:  6000
check:  7000
check:  8000
check:  9000
check:  10000
check:  11000
check:  12000


In [94]:
print(tttt_max, tttt_ix)

96 12436


## Training Phase - Word

GPU가 생기면 돌릴 코드...data 단어 단위로 쪼개기 + 모델 돌리기

In [229]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

cv3 = KFold(n_splits=5, shuffle=True)
num_features = 200

word_model_cnt = 0
early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
word_models = []
word_results = []
for traincv, testcv in cv3.split(X):
    print("\n--------Fold {}--------\n".format(word_model_cnt))
    
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]

    
    train_essays = X_train['essay']
    test_essays = X_test['essay']

    trainDataVecs = []
    testDataVecs = []
    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        words = essay_to_wordlist(essay, remove_stopwords=True)
        trainDataVec = makeFeatureVec3(words, embedding_dict, num_features)
        trainDataVecs.append(np.array(trainDataVec, dtype="float32"))

    for essay in test_essays:
        # Obtaining all sentences from the training essays.
        words = essay_to_wordlist(essay, remove_stopwords=True)
        testDataVec = makeFeatureVec3(words, embedding_dict, num_features)
        testDataVecs.append(np.array(testDataVec, dtype="float32"))
        
    trainDataVecs = pad_sequences(trainDataVecs, maxlen=512, padding='pre', dtype='float')
    testDataVecs = pad_sequences(testDataVecs, maxlen=512, padding='pre', dtype='float')
    
    word_model = get_word_model()
    word_model.fit(trainDataVecs, testDataVecs, batch_size=64, epochs=50, callbacks=[early_stopping])
    
    y_pred_word = word_model.predict(testDataVecs)

    # Round y_pred to the nearest integer.
    y_pred_word = np.round(y_pred_word)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_test.values, y_pred_word, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(result))
    word_models.append(word_model)
    word_results.append(result)


--------Fold 0--------



MemoryError: Unable to allocate 1.09 GiB for an array with shape (1426, 512, 200) and data type float64

In [None]:
for cnt in range(5):

    y_pred = sentence_model.predict(testData[cnt])

    # Round y_pred to the nearest integer.
    y_pred = np.round(y_pred)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_testData[cnt].values, y_pred, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(result))
    results.append(result)

print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(results).mean(),decimals=4))

sentence_models[results.index(max(results))].save('./final_gru.h5')

In [115]:
X_set[0]['score'].iloc[0]

0.6

In [25]:
for traincv, testcv in cv.split(X):
    print(len(traincv), len(testcv))

10380 2596
10381 2595
10381 2595
10381 2595
10381 2595


## Training Phase - Sentence

문장 단위, 모델 돌리는 부분만 (전처리는 위에서)

In [None]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

sentence_models = []
sentence_results = []
for cnt in range(5):
    
    print("\n--------Fold {}--------\n".format(cnt))
    sentence_model = get_sentence_model()
    sentence_model.fit(trainData_sent[cnt], y_trainData_sent[cnt], batch_size=64, epochs=50, callbacks=[early_stopping])

    y_sent_pred = sentence_model.predict(testData_sent[cnt]) *100

    # Round y_pred to the nearest integer.
    y_sent_pred2 = np.round(y_sent_pred)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    sentence_result = cohen_kappa_score(y_testData_sent[cnt].values * 100, y_sent_pred2, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(sentence_result))
    sentence_results.append(sentence_result)
    sentence_models.append(sentence_model)

print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(sentence_results).mean(),decimals=4))

if np.round(np.array(sentence_results).mean(),decimals=4) > 0.75:
    sentence_models[sentence_results.index(max(sentence_results))].save('./final_gru_sent.h5')


--------Fold 0--------

Model: "sequential_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_62 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_63 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_39 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

In [70]:
for ix in trainData_sent:
    print(ix.mean())

0.0023928953
0.002394801
0.0023637584
0.0024018772
0.0023853893


에세이 프롬프트 별로 문장단위 전처리 + 모델 돌리기

In [136]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
sentence_models_set = []
sentence_results_set = []
for essay_set in range(8):
    cv2 = KFold(n_splits=5, shuffle=True)
    num_features = 200

    trainData_set = []
    testData_set = []
    Y_trainData_set = []
    Y_testData_set = []
    for traincv, testcv in cv2.split(X_set[essay_set]):
        print('##Preprocessing Started')
        
        X_train_set, X_test_set = X_set[essay_set]['essay'].iloc[traincv], X_set[essay_set]['essay'].iloc[testcv]
        Y_train_set, Y_test_set = X_set[essay_set]['score'].iloc[traincv], X_set[essay_set]['score'].iloc[testcv]

        Y_trainData_set.append(Y_train_set)
        Y_testData_set.append(Y_test_set)

        trainDataVecs = []
        testDataVecs = []

        for essay in X_train_set:
            # Obtaining all sentences from the training essays.
            sentences = essay_to_sentences(essay, remove_stopwords = True)
            trainDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
            trainDataVecs.append(np.array(trainDataVec, dtype="float32"))

        for essay in X_test_set:
            # Obtaining all sentences from the training essays.
            sentences = essay_to_sentences(essay, remove_stopwords = True)
            testDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
            testDataVecs.append(np.array(testDataVec, dtype="float32"))

        trainDataVecs = pad_sequences(trainDataVecs, maxlen=128, padding='pre', dtype='float')
        testDataVecs = pad_sequences(testDataVecs, maxlen=128, padding='pre', dtype='float')
        trainData_set.append(np.array(trainDataVecs, dtype="float32"))
        testData_set.append(np.array(testDataVecs, dtype="float32"))
        print(len(trainDataVecs))
        print(len(testDataVecs))
        break
    print(trainData_set[0][0])
    print(Y_trainData_set[0][:5])
    print(testData_set[0][0])
    print(Y_testData_set[0][:5])
    for cnt in range(5):
        print("\n--------Fold {}--------\n".format(cnt))
        sentence_model2 = get_sentence_model()
        sentence_model2.fit(trainData_set[cnt], Y_trainData_set[cnt], batch_size=64, epochs=5, callbacks=[early_stopping])

        sen_y_pred = sentence_model2.predict(testData_set[cnt])
        
        # Round y_pred to the nearest integer.
        sen_y_pred2 = np.round(sen_y_pred)
        # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
        result = cohen_kappa_score(Y_testData_set[cnt].values * 100, sen_y_pred2, weights='quadratic')
        print("Kappa Score", cnt, ": {}".format(result))
        sentence_models_set.append(sentence_model2)
        sentence_results_set.append(result)
    print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(sentence_results_set).mean(),decimals=4))

    sentence_models_set[sentence_results_set.index(max(sentence_results_set))].save('./final_gru_sentence_set.h5')
    print("Essay set {} model completed".format(essay_set))
    break

##Preprocessing Started
1426
357
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.24193178  0.22323759 -0.10006917 ... -0.04419891 -0.08458567
  -0.06903541]
 [ 0.34667253  0.28169873  0.00883875 ...  0.12814818  0.00921442
   0.18334246]
 [ 0.150635    0.09475499  0.358405   ... -0.21159     0.145789
   0.11418   ]]
0    0.6
1    0.7
2    0.5
3    0.8
5    0.6
Name: score, dtype: float64
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.22252835  0.2876859  -0.08694589 ...  0.2740456  -0.04930744
   0.25714445]
 [ 0.23199749  0.10862587  0.00685335 ...  0.08057071  0.07084671
   0.20439659]
 [ 0.2631534   0.34036717 -0.22766416 ... 

IndexError: list index out of range

In [133]:
tt_pred = sentence_model2.predict(testData_set[0]) * 100
tt_pred2 = np.round(tt_pred)
tt_result = cohen_kappa_score(Y_testData_set[0].values * 100, tt_pred2, weights='quadratic')

In [135]:
tt_result

0.5084459391277986

이 때만 해도 문제가 없었지...

In [170]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
sentence_models = []
results = []
for cnt in range(5):
    
    print("\n--------Fold {}--------\n".format(cnt))
    sentence_model = get_sentence_model()
    sentence_model.fit(trainData[cnt], y_trainData[cnt], batch_size=64, epochs=50, callbacks=[early_stopping])

    y_pred = sentence_model.predict(testData[cnt])

    # Round y_pred to the nearest integer.
    y_pred = np.round(y_pred)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_testData[cnt].values, y_pred, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(result))
    results.append(result)

print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(results).mean(),decimals=4))

sentence_models[results.index(max(results))].save('./final_gru.h5')


--------Fold 0--------

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_6 (GRU)                  (None, 128, 128)          126720    
_________________________________________________________________
gru_7 (GRU)                  (None, 64)                37248     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Kappa Score 0 : 0.0

--------Fold 1--------

Model: "sequential_6"
_______

Kappa Score 1 : 0.7615001829652579

--------Fold 2--------

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_10 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_11 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epo

Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Kappa Score 3 : 0.7672508889309195

--------Fold 4--------

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_14 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_15 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         


IndexError: list index out of range

In [181]:
sentence_model.save('./final_gru.h5')

In [161]:
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
sentence_model = get_sentence_model()
sentence_model.fit(trainData[0], y_trainData[0], batch_size=64, epochs=50, callbacks=[early_stopping])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_2 (GRU)                  (None, 128, 128)          126720    
_________________________________________________________________
gru_3 (GRU)                  (None, 64)                37248     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epo

<tensorflow.python.keras.callbacks.History at 0x1d9b23aebb0>

In [162]:
trainData[0][-1]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.05037698,  0.02560705, -0.02863991, ...,  0.20011078,
        -0.17437738,  0.115073  ],
       [ 0.11637801,  0.1543995 , -0.05205584, ...,  0.15015085,
        -0.22297533,  0.01466028],
       [ 0.26673853,  0.27861378,  0.17775764, ...,  0.10933428,
        -0.08046336,  0.09950728]], dtype=float32)

In [177]:
y_trainData[0]

2        50.0
3        80.0
4        60.0
6        80.0
7        80.0
         ... 
12966    17.0
12969    58.0
12970    50.0
12972    53.0
12973    67.0
Name: score, Length: 10380, dtype: float64

In [163]:
from sklearn.metrics import cohen_kappa_score

y_pred = sentence_model.predict(testData[0])
# Save any one of the 8 models.
#if count == 5:
#     sentence_model.save('./final_lstm.h5')

# Round y_pred to the nearest integer.
y_pred = np.round(y_pred)
# Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
result = cohen_kappa_score(y_testData[0].values, y_pred, weights='quadratic')
print("Kappa Score: {}".format(result))
#results.append(result)

#count += 1

Kappa Score: 0.7695417293933671


## Original Traning Phase

In [65]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

cv = KFold(n_splits=5, shuffle=True)
results = []
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    
    sentences = []
    
    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        sentences += essay_to_sentences(essay, remove_stopwords = True)

    num_features = 200 
    
    model = embedding_dict
    
    # Generate training and testing data word vectors.
    clean_train_essays = []
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    
    # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=5)
    y_pred = lstm_model.predict(testDataVecs)
    
    # Save any one of the 8 models.
    if count == 5:
         lstm_model.save('./final_lstm.h5')
            
    # Round y_pred to the nearest integer.
    y_pred = np.round(y_pred)
    
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1


--------Fold 1--------

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 1, 200)            320800    
_________________________________________________________________
lstm_7 (LSTM)                (None, 64)                67840     
_________________________________________________________________
dropout_17 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 388,705
Trainable params: 388,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Kappa Score: 0.0012404740275122617

--------Fold 2--------

Model: "sequential_18"
_________________________________________________________________
Layer (type)         

In [15]:
print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(results).mean(),decimals=4))

Average Kappa score after a 5-fold cross validation:  0.6427


In [16]:
import math
from gensim.test.utils import datapath

contentBad = """
    In “Let there be dark,” Paul Bogard talks about the importance of darkness.

Darkness is essential to humans. Bogard states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (Bogard 2). Here, Bogard talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy.

Animals also need darkness. Bogard states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles that come ashore to lay their eggs—and some are not, such as the bats that save American farmers billions in pest control and the moths that pollinate 80% of the world’s flora. Ecological light pollution is like the bulldozer of the night, wrecking habitat and disrupting ecosystems several billion years in the making. Simply put, without darkness, Earth’s ecology would collapse...” (Bogard 2). Here Bogard explains that animals, too, need darkness to survive.
""" 

contentGood = """
    In response to our world’s growing reliance on artificial light, writer Paul Bogard argues that natural darkness should be preserved in his article “Let There be dark”. He effectively builds his argument by using a personal anecdote, allusions to art and history, and rhetorical questions.

Bogard starts his article off by recounting a personal story – a summer spent on a Minnesota lake where there was “woods so dark that [his] hands disappeared before [his] eyes.” In telling this brief anecdote, Bogard challenges the audience to remember a time where they could fully amass themselves in natural darkness void of artificial light. By drawing in his readers with a personal encounter about night darkness, the author means to establish the potential for beauty, glamour, and awe-inspiring mystery that genuine darkness can possess. He builds his argument for the preservation of natural darkness by reminiscing for his readers a first-hand encounter that proves the “irreplaceable value of darkness.” This anecdote provides a baseline of sorts for readers to find credence with the author’s claims.

Bogard’s argument is also furthered by his use of allusion to art – Van Gogh’s “Starry Night” – and modern history – Paris’ reputation as “The City of Light”. By first referencing “Starry Night”, a painting generally considered to be undoubtedly beautiful, Bogard establishes that the natural magnificence of stars in a dark sky is definite. A world absent of excess artificial light could potentially hold the key to a grand, glorious night sky like Van Gogh’s according to the writer. This urges the readers to weigh the disadvantages of our world consumed by unnatural, vapid lighting. Furthermore, Bogard’s alludes to Paris as “the famed ‘city of light’”. He then goes on to state how Paris has taken steps to exercise more sustainable lighting practices. By doing this, Bogard creates a dichotomy between Paris’ traditionally alluded-to name and the reality of what Paris is becoming – no longer “the city of light”, but moreso “the city of light…before 2 AM”. This furthers his line of argumentation because it shows how steps can be and are being taken to preserve natural darkness. It shows that even a city that is literally famous for being constantly lit can practically address light pollution in a manner that preserves the beauty of both the city itself and the universe as a whole.

Finally, Bogard makes subtle yet efficient use of rhetorical questioning to persuade his audience that natural darkness preservation is essential. He asks the readers to consider “what the vision of the night sky might inspire in each of us, in our children or grandchildren?” in a way that brutally plays to each of our emotions. By asking this question, Bogard draws out heartfelt ponderance from his readers about the affecting power of an untainted night sky. This rhetorical question tugs at the readers’ heartstrings; while the reader may have seen an unobscured night skyline before, the possibility that their child or grandchild will never get the chance sways them to see as Bogard sees. This strategy is definitively an appeal to pathos, forcing the audience to directly face an emotionally-charged inquiry that will surely spur some kind of response. By doing this, Bogard develops his argument, adding gutthral power to the idea that the issue of maintaining natural darkness is relevant and multifaceted.

Writing as a reaction to his disappointment that artificial light has largely permeated the prescence of natural darkness, Paul Bogard argues that we must preserve true, unaffected darkness. He builds this claim by making use of a personal anecdote, allusions, and rhetorical questioning.
"""

def testContent(content):
    if len(content) > 20:
        num_features = 200
        clean_test_essays = []
        clean_test_essays.append(essay_to_wordlist( content, remove_stopwords=True ))
        testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
        testDataVecs = np.array(testDataVecs)
        testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

        preds = lstm_model.predict(testDataVecs)

        if math.isnan(preds):
            preds = 0
        else:
            preds = np.round(preds)

        if preds < 0:
            preds = 0
    else:
        preds = 0

    return preds
    
print("the SAT 1 score essay scored", testContent(contentBad))
print("the SAT 4 score essay scored", testContent(contentGood))

the SAT 1 score essay scored [[63.]]
the SAT 4 score essay scored [[78.]]


In [17]:
import pickle

# Pickle glove embeddings
with open('embeddings.pickle', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)