In [1]:
import os 
import codecs
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

#Used to generate the same set of random numbers when random function is called
np.random.seed(7)

#Read The TSV Data File
data = pd.read_table('/home/monark/LEARNINGS/Projects/Automated_Grading_System/train.tsv')
data1 = pd.read_table('train_rel_2.tsv')

#Because of small size of the dataset O merged both the files given for training
data = [data, data1]
data = pd.concat(data)


essay_text = data['EssayText']
essay_score = data['Score1']
essay_set = data['EssaySet']

In [2]:
## 
# RAW DATA ENCODING
# Format the text samples and labels into tensors that can be fed into a neural network
##
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

sets = [0,1,2,3] # The 4 different classes of Scores

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

for label in sets:
	label_id = len(labels_index)
	labels_index[label] = label_id
	for t in essay_text[data['Score1']==label]:
		texts.append(t)
		labels.append(label_id)


print('Found %s texts.' % len(texts))

top_words = 5000 #top most-frequent words extracted from the dataset


tokenizer = Tokenizer(nb_words=top_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

max_response_length = 500
data = pad_sequences(sequences, maxlen=max_response_length)

#Convert class vector to binary class matrix, for use with categorical_crossentropy.
#Or in simple words to convert numbers into ONE-HOT Vector for MultiClass classification
labels = to_categorical(np.asarray(labels))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


#Shuffle the dataset 
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]


Using Theano backend.
Using gpu device 0: GeForce 820M (CNMeM is enabled with initial size: 85.0% of memory, cuDNN not available)


Found 34250 texts.
Found 16729 unique tokens.
Shape of data tensor: (34250, 500)
Shape of label tensor: (34250, 4)


In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

#Set of stopwords from nltk.corpus package
stop = set(stopwords.words('english'))

word_counts_X = []  # List of word Counts in each example response
for x in essay_text:
    word_counts_X.append(len([i for i in word_tokenize(x) if i not in stop]))    

In [4]:
sent_counts = []  # Set of sentence counts in each example response
for x in essay_text:
    sent_counts.append(len(sent_tokenize(x)))

In [5]:
#Shuffle all of the data
word_counts_X = np.array(word_counts_X)
word_counts_X = word_counts_X[indices]

sent_counts = np.array(sent_counts)
sent_counts = sent_counts[indices]

essay_set = np.array(essay_set)
essay_set = essay_set[indices]

In [6]:
#To check whether the dataset is balanced or not
from collections import Counter
Counter(essay_set)


Counter({1: 3344,
         2: 2556,
         3: 3699,
         4: 3395,
         5: 3590,
         6: 3594,
         7: 3598,
         8: 3598,
         9: 3596,
         10: 3280})

In [18]:
#Size for train-test split
train_size = int(0.85 * len(data))

X_train = data[:train_size]
X_test = data[train_size:]

set_train = essay_set[:train_size]
set_test = essay_set[train_size:]

sent_count_train = sent_counts[:train_size]
sent_count_test = sent_counts[train_size:]

word_count_test = word_counts_X[train_size:]
word_count_train = word_counts_X[:train_size]

#Concatenate the three data-vectors into a single matrix or feature-set
features_train = np.column_stack((set_train,sent_count_train,word_count_train))
features_test = np.column_stack((set_test,sent_count_test,word_count_test))

y_train = labels[:train_size]
y_test = labels[train_size:]

print(X_train.shape)
print(features_train.shape)

(29112, 500)
(29112, 3)


In [23]:
#Model Design
#Functional Model

from keras.layers import Dense, Dropout, Activation, Input
from keras.layers import LSTM,merge
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.layers.advanced_activations import ELU
from keras.regularizers import l2

#Embedding-Output Vector Length
embedding_vector_length = 32

#Define Text-Input
text_in = Input(shape=(500,), name='text')

#Embeddings
#Used to convert the encoded data i.e the matrix of indices into a form compatible with LSTM
embedding = Embedding(output_dim=embedding_vector_length, input_dim=top_words, input_length=500)(text_in)

#LSTM
lstm_1= LSTM(100, return_sequences = True)(embedding)
lstm_2 = LSTM(150)(lstm_1)

#Features Inputs (essay_set, word_counts, sent_counts)
features_in = Input(shape=(3,), name='features')

#Merge layer to merge the output of LSTM and the feature inputs
x = merge([lstm_2, features_in], mode='concat')

#Dropout for the hidden_units to be independent from each other
dropout = Dropout(0.2)(x)

#Hidden Dense Layer
D1 = Dense(200,W_regularizer=l2(0.01),b_regularizer = l2(0.01))(dropout) # try 150 dense after this
ED1 = ELU()(D1)

# Final Dense Output-Layer
score = Dense(4, activation='softmax', name='score',W_regularizer=l2(0.01),b_regularizer = l2(0.01))(ED1) 

#model
model = Model(input=[text_in, features_in], output=[score])

#optimizer
adam = Adam(lr = 0.001)#, decay = 1e-4) 
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
text (InputLayer)                (None, 500)           0                                            
____________________________________________________________________________________________________
embedding_7 (Embedding)          (None, 500, 32)       160000      text[0][0]                       
____________________________________________________________________________________________________
lstm_13 (LSTM)                   (None, 500, 100)      53200       embedding_7[0][0]                
____________________________________________________________________________________________________
lstm_14 (LSTM)                   (None, 150)           150600      lstm_13[0][0]                    
___________________________________________________________________________________________

In [26]:
#configure the learning process
model.compile(optimizer=adam, loss='categorical_crossentropy',metrics=['accuracy'])

#Start Training
model.fit([X_train,features_train], y_train, nb_epoch = 20, batch_size=128, validation_split=0.1)  # we pass one data array per model input

Train on 26200 samples, validate on 2912 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa36b53a2b0>

In [27]:
#Save The Trained Model 
model.save('functional_model.h5')

#Predict Score For the test set for calculating accuracy
scores = model.evaluate([X_test,features_test], y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.61%
