## Importing the Data

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Constants
DATASET_DIR = './data/'
SAVE_DIR = './'

X = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel.tsv'), sep='\t', encoding='ISO-8859-1')
y = X['domain1_score']
X = X.dropna(axis=1)
X = X.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [3]:
X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


Minimum and Maximum Scores for each essay set.

In [4]:
minimum_scores = np.array([-1, 2, 1, 0, 0, 0, 0, 0, 0])
maximum_scores = np.array([-1, 12, 6, 3, 3, 4, 4, 30, 60])

In [5]:
old_min = minimum_scores[X['essay_set']]
old_max = maximum_scores[X['essay_set']]
old_range = old_max - old_min
new_min = 0
new_max = 100
new_range = (new_max - new_min)  
X['score'] = (((X['domain1_score'] - old_min) * new_range) / old_range) + new_min

# round score to nearest integer for cohen kappa calculation
y = np.round(X['score'])

X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,score
0,1,1,"Dear local newspaper, I think effects computer...",8,60.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,70.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,50.0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,80.0
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,60.0


In [6]:
X.shape

(12976, 5)

In [7]:
X_set = []
for cnt in range(8):
    is_cnt = X['essay_set'] == (cnt+1)
    set_cnt = X[is_cnt]
    X_set.append(set_cnt)

In [8]:
X_set[0].shape

(1783, 5)

## Preprocessing the Data

We will preprocess all essays and convert them to feature vectors so that they can be fed into the RNN.

These are all helper functions used to clean the essays.

In [9]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tmax\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tmax\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
import numpy as np
import re
from nltk.corpus import stopwords

def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    for word in words:
        if word in model:
            num_words += 1
            featureVec = np.add(featureVec, model[word])       
    featureVec = np.divide(featureVec,num_words)
    return featureVec

def makeFeatureVec2(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    for word in words:
        if word in model:
            featureVec = np.add(featureVec, model[word])
    if len(words) != 0:
        featureVec = np.divide(featureVec,float(len(words)))
    return featureVec

def makeFeatureVec3(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = []
    for word in words:
        if word in model:
            featureVec.append(np.array(model[word], dtype="float32"))
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for glove model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

def getAvgFeatureVecs2(essay, model, num_features):
    """Main function to generate the word vectors for glove model."""
    essayFeatureVecs = np.zeros((len(essay),num_features),dtype="float32")
    for cnt, sentence in enumerate(essay):
        essayFeatureVecs[cnt] = makeFeatureVec2(sentence, model, num_features)
    return essayFeatureVecs

In [11]:
X

Unnamed: 0,essay_id,essay_set,essay,domain1_score,score
0,1,1,"Dear local newspaper, I think effects computer...",8,60.000000
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,70.000000
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,50.000000
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,80.000000
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,60.000000
...,...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,35,58.333333
12972,21628,8,I never understood the meaning laughter is th...,32,53.333333
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40,66.666667
12974,21630,8,Trippin' on fen...,40,66.666667


In [12]:
# load glove embeddings
embedding_dict={}
with open(os.path.join(DATASET_DIR, 'glove/glove.6B.200d.txt'),'r', encoding='UTF8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:],'float32')
        embedding_dict[word] = vectors


Now we train the model on the dataset.

We will use 5-Fold Cross Validation and measure the Quadratic Weighted Kappa for each fold.
We will then calculate Average Kappa for all the folds.

In [26]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True)
num_features = 200

trainData_sent = []
testData_sent = []
y_trainData_sent = []
y_testData_sent = []
for traincv, testcv in cv.split(X):
    print('##Fold Started')
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    y_trainData_sent.append(y_train)
    y_testData_sent.append(y_test)
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']

    trainDataVecs = []
    testDataVecs = []
    
    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        sentences = essay_to_sentences(essay, remove_stopwords = True)
        trainDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
        trainDataVecs.append(np.array(trainDataVec, dtype="float32"))

    for essay in test_essays:
        # Obtaining all sentences from the training essays.
        sentences = essay_to_sentences(essay, remove_stopwords = True)
        testDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
        testDataVecs.append(np.array(testDataVec, dtype="float32"))
        
    trainDataVecs = pad_sequences(trainDataVecs, maxlen=128, padding='pre', dtype='float')
    testDataVecs = pad_sequences(testDataVecs, maxlen=128, padding='pre', dtype='float')
    trainData_sent.append(np.array(trainDataVecs, dtype="float32"))
    testData_sent.append(np.array(testDataVecs, dtype="float32"))
    print(len(trainDataVecs))
    print(len(testDataVecs))

##Fold Started
10380
2596
##Fold Started
10381
2595
##Fold Started
10381
2595
##Fold Started
10381
2595
##Fold Started
10381
2595


## Defining the model 

Here we define a 2-Layer LSTM Model. 

Note that instead of using sigmoid activation in the output layer we will use
Relu since we are not normalising training labels.

In [66]:
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Lambda, Flatten
from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K

def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(200, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 200], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

def get_sentence_model():
    """Define the model."""
    model = Sequential()
    model.add(GRU(128, dropout=0.1, recurrent_dropout=0.1, input_shape=[128, 200], return_sequences=True))
    model.add(GRU(64, recurrent_dropout=0.1))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

def get_word_model():
    """Define the model."""
    model = Sequential()
    model.add(GRU(128, dropout=0.4, recurrent_dropout=0.4, input_shape=[512, 200], return_sequences=True))
    model.add(GRU(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [36]:
trainData_sent[0][0].shape

(128, 200)

In [54]:
for ix, dx in enumerate(trainData_sent):
    for iy, dy in enumerate(dx):
        if dy.mean() < 0.0000001:
            print (ix, iy)

0 3218
0 5179
0 8603
0 8800
0 9034
0 9395
1 3930
1 5121
1 8764
1 8999
1 9385
1 9669
2 3180
2 3900
2 5130
2 8601
2 8800
2 8805
2 9677
3 3262
3 3970
3 5199
3 8594
3 8793
3 8796
3 9035
3 9407
3 9688
4 3208
4 3897
4 8603
4 8803
4 8807
4 9028
4 9403
4 9689


In [63]:
tttt = X['essay']
for ix, essay in enumerate(tttt):
    # Obtaining all sentences from the training essays.
    sentences = essay_to_sentences(essay, remove_stopwords = True)
    if len(sentences) < 3:
        print(ix)
        print(X['essay'][ix])

21
Dear local Newspaper @CAPS1 a take all your computer and given to the people around the world for the can stay in their houses chating with their family and friend. Computers help people around the world to connect with other people computer help kids do their homework and look up staff that happen around the world.
124
Computers dont have any affect on kids we just love going on cause we use it for help and this persuade the readers of the local newspaper cause we need to be able to communicate also do writing essays and doing social studies or science homework my ideas are let us go computers cause were not bothering u can just leave us alone and let us do what you need to do cause what computers are what give us information for we have to do and were to do wat we gotta do and u people can just leave us alone cause arent addicting to me or anyone and if we were it still would it matter cause a computers a computer u dont punish it because just punish us from the computer punish u

2468
I dont think there should be any censorship good books and music shouldint be removed just becaus one person finds it offensive. Becaus the opinion's of one @MONTH1 be diffrent with another.I beleave music is cencord the most becaus  not many people get offendid by books but censorship in music i think is destroying it.Becaus its an artists way of expresion and if people dont wana listen to what there saying then they should just not listen to it.Books i think shouldint be taken offensivly forthe same reason music shouldint its a writers oppinion and expresion its like telling someone they cant voice there oppinion on anything like racesim or use curse words if they want.
2478
At my point of viwe i think if materials such as books is offensive to the reader the reader shuld just leve the book allown.Because other readers might just wont to read it.Same thing for music,movies,and magazines, or any thing else just because you fide some offensive it wont be right to take it away from

3603
The features of the setting that affect the cyclist are the lack of water, out of date information, and rolling hills. If he had up to date information then he would probably have got water from a town thats not empty or almost empty like a ghost town as the cyclist said.
3604
The cyclist has a couple things if he racing and the wind is blowing then theres a problem and he has to follow traffic law and ride on the right side or the road.
3609
Well I would put it like this is that the weather had everything to do with it because the not weather make you tired and upset and easy to angre so now it make you upset and and then stress.
3615
In the essay the mshan writes of rough rodes timit the cyclists had to overcome to reach @CAPS1 gole.
3617
The setting of the story affected the cyclist by making him dehydrated which could lead to a heatstroke. I could drop from a heatstroke this shows that it was hot out and he was feeling bad if he could have dropped from a heatstroke.
3629

3936
The features of the setting affect the cyclist in many ways. It made him tired thirsty and he was near exaustion.
3939
In the story Do Not Exceed Posted Speed Limitmany features of the setting challenge the cyclists.The cyclist accepts information from foreign old people which made his first mistake.After realizing this he,I had been hitting my water bottles pretty regularly,and i was traveling through the high deserts of California,in Junethe lack of water plus the heat in California,in june would have turned to heatstroke for most people.Even further into the journey,even with all those things against him it states,flat rode was replaced by short,rolling hills,"@CAPS1 not only did he have excessive heat,and no water,but @CAPS1 he was handed troublesome rodes too.Although the cyclist must have been near to complete exhaustion,he continued through.Fighting all adds,and elements,many features in the setting of this story affected the cyclist,but he overcame them honorably.
396

4305
The setting effects the rider because if its hot you get thirsty and you have no water.This kind of question is hard to answer
4313
The cyclist thought it was hard. Also he was upset that the old timers gave bad directions.
4318
The setting effects the cyclist very much. By the very dry climates
4332
The features of the setting affected the cyclist because where they were.It was like in te middle of nowhere and every person they asked are having all times.They go the wrong directions to the nearest town.The old guy told him it was @NUM1 miles down the road when he went @NUM2 miles to the next town and said to him self he will never take directions from a gut that has all timers and hasnt left the confines of their porches since carter was in office.
4334
I think that the setting made him find more courage inside him self to save his water and go to the next town with out any water.
4345
In  the   cyclist   story  the  setting   took  a  huge  roll on  how  the  story    thined  

4639
The setting affected the cyclist because of the bad condition roads. The was also hills so they made it harder for the cyclists.
4648
The features of the setting in this story gives the cyclist determination. Since the desert
4669
The features of the setting affected the cyclist by making his way to the Yosemite National Park a longer and more difficult and dangerous way to go. He takes the awee of going that way by some old folks that he says havent gotten off there porches since carter was in office. I think tha tthere would be a reason to not listen to them though they @MONTH1 have wisdom.
4679
The setting affects the cyclist because he is faced with roadways he is unsure of the weather is hot and dry and there are wild animals. The text states,
4683
The feature of the setting affected the cyclist because at first he had to watch out for the snakes that usually crossed the roads, then the streets were crooked and bumpy, so the road structure throw him off a bit, and he was r

4984
The features of the setting affected the cyclist because he saw short, rolling hills, tumbleweeds and a ridiculously large shake, and he thought that he was not getting anywhere.
4991
I think that the features of the setting affected the cyclist because of the sharp turns and the lonelyness it say in the text About forty miles into the pedal. I arrived at the first town.
4999
In the towns that the cyclist arrived at were all ghost towns. In the text, the author said, it fit the traditional definition of a ghost town. It affected the cyclist because, with the towns being deserted, the cyclist couldnt refill hi water.
5000
The cyclist was motivated to get the job done. When they   said only     eighteen miles down the road. He thought it was going to be forever but he made it because he conserved his water ... and last kept on going, when the road   turned into a rocky gravel road, he still kept on going, but it was harder on his legs and water supply.
5023
My point of

5353
The author concludes the story with this paragraph because the author wrote the same thing that it had in the paragraph and he wrote it in his story but added a little more details to it and problems and things.
5359
I think the author concludes the story with the paragraph because Saeng is saying when the good time came back around she will try for the test again because She had failed it. for example on page @NUM1 at the top She " I-I falled the test" thats why i think the author concludes this with the Story because it was important to Saeng to past the test in the good time of season Spring. 
5363
The author concludes the story with that statement because it shows Saeng passion for gardening. How shes willing to take the test is to see if her plants can grow when spring comes around and the hibiscus starts to bud she will attempt to grow it which shows how anxious Saeng is for gardening and to try and get the hibiscus to grow during the spring.
5366
I think the author wanted 

5536
The author ends the story with this paragraph show is that Saeng is willing to try again to try to get a plant that will remind her of her grand mother and  remind her of her old home where she @MONTH1 go back one day . So, she can see the near flower she loves and not the take   one.
5538
What l think the author was talking abount snow and it was add outside for the bays matter and snow started to melted.
5540
Saeng says this in the paragraph because she wants to make her mother proud of her so she is going to take the test again after winter.
5547
In the short story Winter hibiscus The author concludes the story with this paragraph because the author felt as thought Saeng would be ready to take thee test at that particular time.
5553
The author concludes the story with this paragraph to show that while this particular tale is over Saengs life will continue. The paragraph provides closure for the reader as well as explains that in time Saeng will take the test again, once she 

5717
They author concludes the story with this paragraph because he want to see what people would say.
5719
In the story I think the author concludes  that  paragraph  with  the  story  because  Saeng  is  saying  that  when  she  comes  back  or when  she next  sees  the  hibiscus  bloom  again  she  will  then  take  the  test  but  until then  she  feels  out  of  place  and  she  is no use to the  arrangements  she is living in  now  and  she  feels  uncomforatable  but  until  then  she  thinks  that she will not  be able  to do  will  until  she  is in  her  bloms  tow  a  where  it  is  horman   for  her.
5720
The author concludes with that paragraph because show that she will go back to the one place and will not break down like that and grif over what she has lost and have to deal with change.
5741
The authors concludes the passage with this paragraph phrase to show that after all of that hear tacke and confusion she knows it will take time to adjust and she realizeses that sh

5915
The author concludes this paragraph because he wants to make it more clearer so the readers would understand It more. And so he could, make it better to stand out. 
5918
I think the author concludes the story with this paragraph When they come back, Saeng vowed silently to herself, iN the spring, wheN the @CAPS1 melt and the geese RetuRN and this hibiscus is budding, then I will take that test again means to the author kinda like the meaning to life. That if you mess up its okay just try again and do better.
5921
The author concludes the story with this paragraph because he/she is trying to put across the message that the plant shows a symbol of new beginings which is a symbol of saengs vowe. Also in the story caring for the plant teaches saeng responsablity in obtaining her license by care for the plant it will teach her patients and that she needs to learn from mistakes made.
5924
I think that the concluding paragraph means after everything   else   important and what is mean

6112
when they come back Saeng vowed silently to herself In the spring when the snows melts and the geese return and this hibicus budding then I will take that test again
6119
He author concludes this paragraph in this story because It sounds good and to show the people he is writing it to that no matter what happend s to you in life you should Just keep on doing what you think is best for you. Just like the flower she keeps coming back for more after going through all thats tuff. 
6121
The author concludes the story with this paragraph because it shows that Saeng is starting to take responsibility for her school work now. For example in the story she made it obvious she cared more about the garden then school.
6122
Saeng was talking about testing the plants seeing if the geese liked where she planted them. I know this because she mensioned geese in the last paragraph and in the geese said yes| when she planted the hibiscus.
6124
I think the author concludes the story with that parag

6297
The author concludes the story with this paragraph because he wants to end it on a good note and send a good message there not going to quit because they failed the first time.
6300
The author concludes the story with this paragraph because this is what the story was all about. Saeng saw what happens in the winter and when it comes back around she trys to memorize it so that she can no what will happen.
6301
The author ends the story this way because by the time winter ends she will have adapted to her new society, just like the hibiscus.
6305
The reason why the author used this paragraph is because he was sag that the snow go away in spring time that all.
6309
She concluded the paragraph with that paragraph to show that she is serious about her gardening and that she wont garden until geese come back.
6312
She is saying here that she will start a new leaf when spring comes around she is going to start a new leaf and she is going to do better on her driving test. I think she conc

6494
the author concludes the story with this paragraph, because it tells that if you fail the first their is always a second time just like the @CAPS1 I dies in the winter and it come back alive in the spring.
6498
The author chose to conclude the story like that because sheâs saying that it isnât over and sheâs going to take the test again. In the story Saeng says âI- I failed the testâ so shes determined to go back when the flower blooms again to get things right.
6499
I think what the author is trying to say is that she will take her driving test again in the spring when the flowers are out because it will make her feel more relaxed. The flowers put the author in a relaxed state of mind where she can focus.
6500
Because she saying when the @CAPS1 grow back she will be @CAPS2 to take the test again.
6504
The author concludes the story with  the last paragraph of the story. When they come back, saeng vowed silently to herself, in the spring, when the snows melt and the gees

6672
I believe the author concluded the story with this paragraph because in the story, the author says through the quiet repitition of season upon season which means that seasons will come a seasons will go but shell always be there, ready to take the test
6673
The author is probably trying to say in the spring when the seasons change and the geese return and the hibiscus is budding he will take the test again.
6674
The author concludes that failing the test is not the worst thing. She relizes that times change and you can always get another chance, which is explaned in the last paragh that once the hibiscus buds she will take the test again.
6675
The author conclude the story with her saying in the spring, when the snows melt and the geese return and this hibiscus is budding then I will take that test again. to say the she will not gust out like that.
6679
The author concludes the story with this paragraph or quote to get the readers something to think about at the end of the re

6877
The author concludes the story with this paragraph for a couple of reasons.
6878
Saeng is @CAPS1 that it Will be a longtime before she will worn about school work again. thats Why the author concluds the story with their paragraph.
6879
The author concludes the story with that paragraph, because he wants the emphasize that the girl is not completly dissatisified with leaving her home. He ended with that because he wanted to she that their is hope for her to learn to love her new home.
6882
Well, I think the author concludes the story with that sentence because maybe Saeng likes the spring and can concentrate better with her favorite flower that she thinks relates to her life.
6883
The author ends the story like that to show the theme working hard, Some other examples of working hard are the winter hibiscus." Another example is her vow to pass the test, so that is why the author ends the story that way.
6888
I think the author concludes this story wit this ending paragraph because

7057
the author concludes the story with that paragraph because it sounds like a interesiting ending paragraph that have something to do with the story
7060
The author concludes the story with that paragraph because Saeng is going to start over and do better the second time around because she doesnt want to disappoint herself. That is why the author concludes the story with this paragraph.
7065
The reason the author chose this paragraph is because the paragraph means that when the snow melt and all the geese returned, the flowers @CAPS1. That one flower she had to @CAPS2 tests on.
7068
The author concludes the story with that paragraph so the reader has something to think about.When the author concludes with something like that he/she wants the reader to make there own ending or pridicshon. And some times it means a sequel.
7069
The author concludes the story with this paragraph because it makes the story a happy ending how she doesnt give up and is taking the test again. Also she do

7541
The mood created by the author in the memoir is a mood of happiness and thankfulness. In the memoir the author talks about how greatful he is to his parents and how happy he was when he lived with them.
7558
The mood created by the author, @PERSON1, is a upbeat & happy mood. I know this because he states that he is very grateful for his parents to pass down the cooking recipes and skills, and the Cuban music that they would listen to.This is the mood @PERSON1 creates in the memoir.
7563
The mood the other Created was how happy narciso was because her parents gave up so much for her to have a better life and education in america.
7568
The mood created by the author was very greatful and outgoing.The person was very greatful to grow up and be able to live in @LOCATION1. He loved how his parents gave up everything and were selfless so their kids could live normal lives and do what they wanted to do.The kids parents were very outgoing because they gave up everything they had in their 

7986
The mood created by the author is that he is realy happy that his parents moved to america to live close to friends and family because in paragraph @NUM1 it says" countless extended family members came and went -- and there was alway someone temporarily staying with us."
8010
The mood is grateful, because it shows how grateful he is for his house and all the people that come over and be his family.
8026
The @CAPS1 @CAPS2 in the story is happy This is because he got to move to the @LOCATION1 and be born hear for a better life. He Also loved growing up hear with the cuban traditions and music as stated on paragraph @NUM1 "And for what I am eternally grateful," wich I adore to this day."
8040
The mood the author created in the memoir was emotional because he describes his childhood and how he grew up.
8044
I think the mood was happy @CAPS2 the @CAPS1 talk about a @PERSON2 named narciso Rodriguez and that he lived in Cuba and he's parents moved to the United States so the family could

8426
The mood created by the @CAPS1 is a very good and happy mood. You can tell by how he is always talk about both happy, family, and his cuba harrtage and works, music.
8453
The mood of the auther might have been in a good mood. the auther might have been doing some of us own family, history and saw that his family did the same things as what they did in the story.
8480
the author in the memoir's mood is like saying it's good and bad because he talks like he and his family take in people that need help geting back on thare feet, there hopeing that the people will do the same for them if thay need it.the boy is happy he lives in the @LOCATION1 but his parents gave up so much for him. this is the mood of the author.
8490
The mood created in the memoir was peacefulness, @CAPS1 and gratefulness because Narciso Rodriguez is grateful for hisparentsshowinghim what family really means, and he says his whole neighborhood came together in @CAPS1 to createpeace.
8491
In the memoir, The author m

8855
It creates a mood of happiness "but Cuban music filled the air as it mixed with the aromas of the kitchen." this shows what his home was like
8875
The mood created by the author in the memoir is happiness and grateful because the author is grateful of his parents for having a roof over his head, cooking food for him, and for having such courageous parents. That's the mood of the author in the memoir.
8880
The mood was created by the author in the memoir and specic imformationI @CAPS1 the mood is like something in the sky that's why I @CAPS1 about the mood.
8924
Based on the story it all started when they had build the empire state building and the workers should have knowned that the building was too big for the wind climate, up in the sky and plus when you make a building like that you have to make sure it's strong enough to keep balance.                         They faced problem's like when the U.S. tried to make a deal with them if they would let them use the top of the buildi

9873
They could not put a thousand - foot dirigible moored at the top of the building held by a single cable tether, it would add stress to the building's frame.
9883
In the reading comprehension "The Mooring Mast" by Marcia Amidon Lüsted, the obstacles the builders of the Empire State Building faced in attempting to allow dirigibles to dock because they wanted to add the mooring mast on the top of the Empire State Building's flat roof and the dirigibles would be able to take the mooring mast up to the top of the building. The architects found out that if they put the mooring mast on the top of the building its gonna bring stress to the building's foundation.
9900
one of the obstacles that had to face was they had to make it taller because it was sapoosed to be the tallest building. They also had problems with it so they had to keep fixing it.
9925
Based on the excerpt, the obstacles the builders of the empire state building faced attempting to allow derigibles to dock there are the st

10473
The builders of the Empire State built two floors just for the passengers to drop off their luggage. They had a view in those floors, and was the idea didn't work, they wasted money on those extra rooms.They started working on the build and attempt to make dirigibles dock on the top, but they wasted alot of time and money, and it was just a waste altogether.
10491
The obstacles the builders of the Empire State Building faced in attempting to allow dirigibles to dock there was a existing law. In paragraph @NUM1 saying the reason why dirigibles could not moor at the empire state Building cause of an existing law against airship flying too low under urban area and the law would make it illegal for a ship to even tie or even approach the mooring mast.
10522
The obstaclesthat the builders of the Empire state building faced were those of: 1) it was at a height of @NUM1 feet, @NUM2) how would the passengers be able to get on and off, @NUM3) takes a long time to slow down and @NUM4) the 

One day when I was going to @LOCATION1  we drove for @NUM1 whole day because we went in the wrong direaction and my mom got made at night we were trying to find a hotel about @NUM1:30am I was the only one aware because my three brothers @PERSON1, @LOCATION2, caman were sleping in the back of the van my sister @CAPS1 and me were in the front and I got really angry because a guy. Look the last room they had but I was really angry patient and really we a room the next day we went to the @ORGANIZATION1 to stay we were going to the water park/musment park there was a new ride" the scoping tail butt we had to wait for @NUM1 hour to go on that ride and I was really patient but my mom was getting tired and mad because  it took so long then the next day it rained and rained so we went to the indoor fool at our hotel but we couldnt go in because there were in many people so we had to wait but I wasnt patient because I wanted to go swimming  the almost a halve hour we got to go swimming then the 

11255
One time I was patient when I let a stranger cut infront of me to get some food. And when me and my brother both had to use the bath room I let him go first even though I was first in line.
11266
I am the type of person who you would call inpaient but I will try to explan it in the best way I can the only time I am patient is when something existing on tv comes on my frinds are always paient for everything @CAPS1 @CAPS2 and other things but I am never paient
11274
Patience is sometimes hard to do if you are already exsited. But some time you are doing like if one of your family members is talking & you are waiting for something else that is patience
11276
One time me and my dad were at a store.There was a line about @NUM1 people my dad and I were starting to get impatient so I told myself to calm down and wait. But I was still not calming down then the very first person left and then was starting to calm down and just wait for the next person to go.
11286
When I was patient I was

11822
My story in own way about being patience is when there is like @NUM1 people in front of a line a ceter point. You have to be patient in the lines because other were before you so you have to wait your turn in line before you go because thats called cutting and you will get kicked out of places if u do that you dont want that to happen because like say that you want something to eat you have wait your turn in line so you dont get kick out of the rides you have to wait your turn so you dont get the secrety on you.
11852
One day I had a doctors appointment and I had to check- in at the desk and after I did that I had to sit and be patient till the doctors called me back to get my check up done  and I to wait till the real doctor came to see how I was feeling and to see if I was eating good to and then I had to go to the check out desk to see what was my next check up was and then I had to get my pills refilled and then I went home.
11853
One time I was pacient was when my mum sai

12154
The time iwas patent was when I was in schol I was in math and I waswoting to setant so i could go to my friends @ORGANIZATION1 so I was waiting and whatins and whatins and watins and whatins @CAPS1 the clock was not tennin so starten doins my class work and @CAPS2 the @ORGANIZATION2 rang I was @CAPS3 agen @CAPS1 thare is one @CAPS5 @CAPS1 it was not a @CAPS7 @CAPS8 cons we hade the meap so it was only a @CAPS9 @CAPS10 in all classes so we only @CAPS11 @NUM1 minits left till the time moms out so i @CAPS12  some want and @NUM2 minits past and then me and m fend. Thats good @NUM2 minits to so @NUM4 minits @NUM5 minits @NUM6 @NUM7 @NUM8 @NUM9 @NUM10 @NUM11 @NUM12 @NUM12 left now @NUM1 secents rims thens gos the @ORGANIZATION2 and that what @CAPS13 when was esent
12162
One time my friend was waitin for I stay in there a hole hour I was takin my time he wait though he couldnt to I got out the bathroom I said thankz he said anytime I was surprised at her.
12169
I was patience the whol

## Training Phase - Word

GPU가 생기면 돌릴 코드...data 단어 단위로 쪼개기 + 모델 돌리기

In [229]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

cv3 = KFold(n_splits=5, shuffle=True)
num_features = 200

word_model_cnt = 0
early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
word_models = []
word_results = []
for traincv, testcv in cv3.split(X):
    print("\n--------Fold {}--------\n".format(word_model_cnt))
    
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]

    
    train_essays = X_train['essay']
    test_essays = X_test['essay']

    trainDataVecs = []
    testDataVecs = []
    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        words = essay_to_wordlist(essay, remove_stopwords=True)
        trainDataVec = makeFeatureVec3(words, embedding_dict, num_features)
        trainDataVecs.append(np.array(trainDataVec, dtype="float32"))

    for essay in test_essays:
        # Obtaining all sentences from the training essays.
        words = essay_to_wordlist(essay, remove_stopwords=True)
        testDataVec = makeFeatureVec3(words, embedding_dict, num_features)
        testDataVecs.append(np.array(testDataVec, dtype="float32"))
        
    trainDataVecs = pad_sequences(trainDataVecs, maxlen=512, padding='pre', dtype='float')
    testDataVecs = pad_sequences(testDataVecs, maxlen=512, padding='pre', dtype='float')
    
    word_model = get_word_model()
    word_model.fit(trainDataVecs, testDataVecs, batch_size=64, epochs=50, callbacks=[early_stopping])
    
    y_pred_word = word_model.predict(testDataVecs)

    # Round y_pred to the nearest integer.
    y_pred_word = np.round(y_pred_word)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_test.values, y_pred_word, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(result))
    word_models.append(word_model)
    word_results.append(result)


--------Fold 0--------



MemoryError: Unable to allocate 1.09 GiB for an array with shape (1426, 512, 200) and data type float64

In [None]:
for cnt in range(5):

    y_pred = sentence_model.predict(testData[cnt])

    # Round y_pred to the nearest integer.
    y_pred = np.round(y_pred)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_testData[cnt].values, y_pred, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(result))
    results.append(result)

print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(results).mean(),decimals=4))

sentence_models[results.index(max(results))].save('./final_gru.h5')

In [239]:
X_set[0]['score'].iloc[0]

60.0

In [25]:
for traincv, testcv in cv.split(X):
    print(len(traincv), len(testcv))

10380 2596
10381 2595
10381 2595
10381 2595
10381 2595


## Training Phase - Sentence

문장 단위, 모델 돌리는 부분만 (전처리는 위에서)

In [67]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

sentence_models = []
sentence_results = []
for cnt in range(5):
    
    print("\n--------Fold {}--------\n".format(cnt))
    sentence_model = get_sentence_model()
    sentence_model.fit(trainData_sent[cnt], y_trainData_sent[cnt], batch_size=64, epochs=5)

    y_sent_pred = sentence_model.predict(testData_sent[cnt])

    # Round y_pred to the nearest integer.
    y_sent_pred = np.round(y_sent_pred)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    sentence_result = cohen_kappa_score(y_testData_sent[cnt].values, y_sent_pred, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(sentence_result))
    sentence_results.append(sentence_result)
    sentence_models.append(sentence_model)

print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(sentence_results).mean(),decimals=4))

if np.round(np.array(sentence_results).mean(),decimals=4) > 0.75:
    sentence_models[sentence_results.index(max(sentence_results))].save('./final_gru_sent.h5')


--------Fold 0--------

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_28 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_29 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_22 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Kappa Score 0 : 0.0

--------Fold 1--------

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output 

에세이 프롬프트 별로 문장단위 전처리 + 모델 돌리기

In [44]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
sentence_models_set = []
sentence_results_set = []
for essay_set in range(8):
    cv2 = KFold(n_splits=5, shuffle=True)
    num_features = 200

    trainData_set = []
    testData_set = []
    Y_trainData_set = []
    Y_testData_set = []
    for traincv, testcv in cv2.split(X_set[essay_set]):
        print('##Preprocessing Started')
        
        X_train_set, X_test_set = X_set[essay_set]['essay'].iloc[traincv], X_set[essay_set]['essay'].iloc[testcv]
        Y_train_set, Y_test_set = X_set[essay_set]['score'].iloc[traincv], X_set[essay_set]['score'].iloc[testcv]

        Y_trainData_set.append(Y_train_set)
        Y_testData_set.append(Y_test_set)

        trainDataVecs = []
        testDataVecs = []

        for essay in X_train_set:
            # Obtaining all sentences from the training essays.
            sentences = essay_to_sentences(essay, remove_stopwords = True)
            trainDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
            trainDataVecs.append(np.array(trainDataVec, dtype="float32"))

        for essay in X_test_set:
            # Obtaining all sentences from the training essays.
            sentences = essay_to_sentences(essay, remove_stopwords = True)
            testDataVec = getAvgFeatureVecs2(sentences, embedding_dict, num_features)
            testDataVecs.append(np.array(testDataVec, dtype="float32"))

        trainDataVecs = pad_sequences(trainDataVecs, maxlen=128, padding='pre', dtype='float')
        testDataVecs = pad_sequences(testDataVecs, maxlen=128, padding='pre', dtype='float')
        trainData_set.append(np.array(trainDataVecs, dtype="float32"))
        testData_set.append(np.array(testDataVecs, dtype="float32"))
        print(len(trainDataVecs))
        print(len(testDataVecs))
    print(trainData_set[0][0])
    print(Y_trainData_set[0][:5])
    print(testData_set[0][0])
    print(Y_testData_set[0][:5])
    for cnt in range(5):
        print("\n--------Fold {}--------\n".format(cnt))
        sentence_model2 = get_sentence_model()
        sentence_model2.fit(trainData_set[cnt], Y_trainData_set[cnt], batch_size=64, epochs=50, callbacks=[early_stopping])

        y_pred = sentence_model2.predict(testData_set[cnt])
        
        # Round y_pred to the nearest integer.
        y_pred = np.round(y_pred)
        # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
        result = cohen_kappa_score(Y_testData_set[cnt].values, y_pred, weights='quadratic')
        print("Kappa Score", cnt, ": {}".format(result))
        sentence_models_set.append(sentence_model2)
        sentence_results_set.append(result)
    print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(sentence_results_set).mean(),decimals=4))

    sentence_models_set[sentence_results_set.index(max(sentence_results_set))].save('./final_gru_sentence_set.h5')
    print("Essay set {} model completed".format(essay_set))
    break

##Preprocessing Started
1426
357
##Preprocessing Started
1426
357
##Preprocessing Started
1426
357
##Preprocessing Started
1427
356
##Preprocessing Started
1427
356
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.14205515  0.2916316   0.005404   ...  0.08443499  0.10570675
   0.09313588]
 [ 0.17094718  0.18547271  0.04998897 ...  0.15472499 -0.05646118
   0.17926295]
 [ 0.2557258   0.26412618  0.02941292 ...  0.16695438 -0.07444575
   0.14252858]]
1    70.0
2    50.0
3    80.0
4    60.0
5    60.0
Name: score, dtype: float64
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.24193178  0.22323759 -0.10006917 ... -0.04419891 -0.08458567

Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Kappa Score 1 : 0.0

--------Fold 2--------

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_18 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_19 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)                0         
__

Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Kappa Score 2 : 0.0

--------Fold 3--------

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_20 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_21 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_13 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epo

Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Kappa Score 4 : 0.0
Average Kappa score after a 5-fold cross validation:  0.1397
Essay set 0 model completed


이 때만 해도 문제가 없었지...

In [170]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score

early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
sentence_models = []
results = []
for cnt in range(5):
    
    print("\n--------Fold {}--------\n".format(cnt))
    sentence_model = get_sentence_model()
    sentence_model.fit(trainData[cnt], y_trainData[cnt], batch_size=64, epochs=50, callbacks=[early_stopping])

    y_pred = sentence_model.predict(testData[cnt])

    # Round y_pred to the nearest integer.
    y_pred = np.round(y_pred)
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_testData[cnt].values, y_pred, weights='quadratic')
    print("Kappa Score", cnt, ": {}".format(result))
    results.append(result)

print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(results).mean(),decimals=4))

sentence_models[results.index(max(results))].save('./final_gru.h5')


--------Fold 0--------

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_6 (GRU)                  (None, 128, 128)          126720    
_________________________________________________________________
gru_7 (GRU)                  (None, 64)                37248     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Kappa Score 0 : 0.0

--------Fold 1--------

Model: "sequential_6"
_______

Kappa Score 1 : 0.7615001829652579

--------Fold 2--------

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_10 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_11 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epo

Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Kappa Score 3 : 0.7672508889309195

--------Fold 4--------

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_14 (GRU)                 (None, 128, 128)          126720    
_________________________________________________________________
gru_15 (GRU)                 (None, 64)                37248     
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         


Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Kappa Score 4 : 0.7593355934631674
Average Kappa score after a 5-fold cross validation:  0.6075


IndexError: list index out of range

In [181]:
sentence_model.save('./final_gru.h5')

In [161]:
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
sentence_model = get_sentence_model()
sentence_model.fit(trainData[0], y_trainData[0], batch_size=64, epochs=50, callbacks=[early_stopping])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_2 (GRU)                  (None, 128, 128)          126720    
_________________________________________________________________
gru_3 (GRU)                  (None, 64)                37248     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 164,033
Trainable params: 164,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epo

<tensorflow.python.keras.callbacks.History at 0x1d9b23aebb0>

In [162]:
trainData[0][-1]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.05037698,  0.02560705, -0.02863991, ...,  0.20011078,
        -0.17437738,  0.115073  ],
       [ 0.11637801,  0.1543995 , -0.05205584, ...,  0.15015085,
        -0.22297533,  0.01466028],
       [ 0.26673853,  0.27861378,  0.17775764, ...,  0.10933428,
        -0.08046336,  0.09950728]], dtype=float32)

In [177]:
y_trainData[0]

2        50.0
3        80.0
4        60.0
6        80.0
7        80.0
         ... 
12966    17.0
12969    58.0
12970    50.0
12972    53.0
12973    67.0
Name: score, Length: 10380, dtype: float64

In [163]:
from sklearn.metrics import cohen_kappa_score

y_pred = sentence_model.predict(testData[0])
# Save any one of the 8 models.
#if count == 5:
#     sentence_model.save('./final_lstm.h5')

# Round y_pred to the nearest integer.
y_pred = np.round(y_pred)
# Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
result = cohen_kappa_score(y_testData[0].values, y_pred, weights='quadratic')
print("Kappa Score: {}".format(result))
#results.append(result)

#count += 1

Kappa Score: 0.7695417293933671


## Original Traning Phase

In [65]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

cv = KFold(n_splits=5, shuffle=True)
results = []
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    
    sentences = []
    
    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        sentences += essay_to_sentences(essay, remove_stopwords = True)

    num_features = 200 
    
    model = embedding_dict
    
    # Generate training and testing data word vectors.
    clean_train_essays = []
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    
    # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=5)
    y_pred = lstm_model.predict(testDataVecs)
    
    # Save any one of the 8 models.
    if count == 5:
         lstm_model.save('./final_lstm.h5')
            
    # Round y_pred to the nearest integer.
    y_pred = np.round(y_pred)
    
    # Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1


--------Fold 1--------

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 1, 200)            320800    
_________________________________________________________________
lstm_7 (LSTM)                (None, 64)                67840     
_________________________________________________________________
dropout_17 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 388,705
Trainable params: 388,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Kappa Score: 0.0012404740275122617

--------Fold 2--------

Model: "sequential_18"
_________________________________________________________________
Layer (type)         

In [15]:
print("Average Kappa score after a 5-fold cross validation: ", np.round(np.array(results).mean(),decimals=4))

Average Kappa score after a 5-fold cross validation:  0.6427


In [16]:
import math
from gensim.test.utils import datapath

contentBad = """
    In “Let there be dark,” Paul Bogard talks about the importance of darkness.

Darkness is essential to humans. Bogard states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (Bogard 2). Here, Bogard talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy.

Animals also need darkness. Bogard states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles that come ashore to lay their eggs—and some are not, such as the bats that save American farmers billions in pest control and the moths that pollinate 80% of the world’s flora. Ecological light pollution is like the bulldozer of the night, wrecking habitat and disrupting ecosystems several billion years in the making. Simply put, without darkness, Earth’s ecology would collapse...” (Bogard 2). Here Bogard explains that animals, too, need darkness to survive.
""" 

contentGood = """
    In response to our world’s growing reliance on artificial light, writer Paul Bogard argues that natural darkness should be preserved in his article “Let There be dark”. He effectively builds his argument by using a personal anecdote, allusions to art and history, and rhetorical questions.

Bogard starts his article off by recounting a personal story – a summer spent on a Minnesota lake where there was “woods so dark that [his] hands disappeared before [his] eyes.” In telling this brief anecdote, Bogard challenges the audience to remember a time where they could fully amass themselves in natural darkness void of artificial light. By drawing in his readers with a personal encounter about night darkness, the author means to establish the potential for beauty, glamour, and awe-inspiring mystery that genuine darkness can possess. He builds his argument for the preservation of natural darkness by reminiscing for his readers a first-hand encounter that proves the “irreplaceable value of darkness.” This anecdote provides a baseline of sorts for readers to find credence with the author’s claims.

Bogard’s argument is also furthered by his use of allusion to art – Van Gogh’s “Starry Night” – and modern history – Paris’ reputation as “The City of Light”. By first referencing “Starry Night”, a painting generally considered to be undoubtedly beautiful, Bogard establishes that the natural magnificence of stars in a dark sky is definite. A world absent of excess artificial light could potentially hold the key to a grand, glorious night sky like Van Gogh’s according to the writer. This urges the readers to weigh the disadvantages of our world consumed by unnatural, vapid lighting. Furthermore, Bogard’s alludes to Paris as “the famed ‘city of light’”. He then goes on to state how Paris has taken steps to exercise more sustainable lighting practices. By doing this, Bogard creates a dichotomy between Paris’ traditionally alluded-to name and the reality of what Paris is becoming – no longer “the city of light”, but moreso “the city of light…before 2 AM”. This furthers his line of argumentation because it shows how steps can be and are being taken to preserve natural darkness. It shows that even a city that is literally famous for being constantly lit can practically address light pollution in a manner that preserves the beauty of both the city itself and the universe as a whole.

Finally, Bogard makes subtle yet efficient use of rhetorical questioning to persuade his audience that natural darkness preservation is essential. He asks the readers to consider “what the vision of the night sky might inspire in each of us, in our children or grandchildren?” in a way that brutally plays to each of our emotions. By asking this question, Bogard draws out heartfelt ponderance from his readers about the affecting power of an untainted night sky. This rhetorical question tugs at the readers’ heartstrings; while the reader may have seen an unobscured night skyline before, the possibility that their child or grandchild will never get the chance sways them to see as Bogard sees. This strategy is definitively an appeal to pathos, forcing the audience to directly face an emotionally-charged inquiry that will surely spur some kind of response. By doing this, Bogard develops his argument, adding gutthral power to the idea that the issue of maintaining natural darkness is relevant and multifaceted.

Writing as a reaction to his disappointment that artificial light has largely permeated the prescence of natural darkness, Paul Bogard argues that we must preserve true, unaffected darkness. He builds this claim by making use of a personal anecdote, allusions, and rhetorical questioning.
"""

def testContent(content):
    if len(content) > 20:
        num_features = 200
        clean_test_essays = []
        clean_test_essays.append(essay_to_wordlist( content, remove_stopwords=True ))
        testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
        testDataVecs = np.array(testDataVecs)
        testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

        preds = lstm_model.predict(testDataVecs)

        if math.isnan(preds):
            preds = 0
        else:
            preds = np.round(preds)

        if preds < 0:
            preds = 0
    else:
        preds = 0

    return preds
    
print("the SAT 1 score essay scored", testContent(contentBad))
print("the SAT 4 score essay scored", testContent(contentGood))

the SAT 1 score essay scored [[63.]]
the SAT 4 score essay scored [[78.]]


In [17]:
import pickle

# Pickle glove embeddings
with open('embeddings.pickle', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)