In [1]:
# !git clone https://github.com/Saiteja-Reddy/Automatic-Text-Scoring.git
import  keras.layers  as  klayers 
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Input, Embedding, GlobalAveragePooling1D, Concatenate, Activation, Lambda, BatchNormalization, Convolution1D, Dropout
from keras.preprocessing.text import Tokenizer
import numpy as np
import nltk
from quadratic_weighted_kappa import QWK
from sklearn.metrics import cohen_kappa_score
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras import initializers
from scipy import stats

Using TensorFlow backend.


In [2]:
class Neural_Tensor_layer(Layer):
	def __init__(self,output_dim,input_dim=None, **kwargs):
		self.output_dim=output_dim
		self.input_dim=input_dim
		if self.input_dim:
			kwargs['input_shape']=(self.input_dim,)
# 		print("YAYY", input_dim, output_dim)
		super(Neural_Tensor_layer,self).__init__(**kwargs)

	def call(self,inputs,mask=None):
		e1=inputs[0]
		e2=inputs[1]
		batch_size=K.shape(e1)[0]
		k=self.output_dim
		

		feed_forward=K.dot(K.concatenate([e1,e2]),self.V)

		bilinear_tensor_products = [ K.sum((e2 * K.dot(e1, self.W[0])) + self.b, axis=1) ]

		for i in range(k)[1:]:	
			btp=K.sum((e2*K.dot(e1,self.W[i]))+self.b,axis=1)
			bilinear_tensor_products.append(btp)

		result=K.tanh(K.reshape(K.concatenate(bilinear_tensor_products,axis=0),(batch_size,k))+feed_forward)

		return result
    
	def build(self,input_shape):
		mean=0.0
		std=1.0
		k=self.output_dim
		d=self.input_dim
		##truncnorm generate continuous random numbers in given range
		W_val=stats.truncnorm.rvs(-2 * std, 2 * std, loc=mean, scale=std, size=(k,d,d))
		V_val=stats.truncnorm.rvs(-2 * std, 2 * std, loc=mean, scale=std, size=(2*d,k))
		self.W=K.variable(W_val)
		self.V=K.variable(V_val)
		self.b=K.zeros((self.input_dim,))
		self.trainable_weights=[self.W,self.V,self.b]    

	def compute_output_shape(self, input_shape):
		batch_size=input_shape[0][0]
		return(batch_size,self.output_dim)

In [3]:
class Temporal_Mean_Pooling(Layer): # conversion from (samples,timesteps,features) to (samples,features)
	def __init__(self, **kwargs):
		super(Temporal_Mean_Pooling,self).__init__(**kwargs)
		# masked values in x (number_of_samples,time)
		self.supports_masking=True
		# Specifies number of dimensions to each layer
		self.input_spec=InputSpec(ndim=3)
        
	def call(self,x,mask=None):
		if mask is None:
			mask=K.mean(K.ones_like(x),axis=-1)

		mask=K.cast(mask,K.floatx())
				#dimension size single vec/number of samples
		return K.sum(x,axis=-2)/K.sum(mask,axis=-1,keepdims=True)        

	def compute_mask(self,input,mask):
		return None
    
    
	def compute_output_shape(self,input_shape):
		return (input_shape[0],input_shape[2])

In [27]:
EMBEDDING_DIM=300
MAX_NB_WORDS=4000

MAX_SEQUENCE_LENGTH=500
VALIDATION_SPLIT=0.20
DELTA=20

texts=[]
labels=[]
sentences=[]

originals = []

fp1=open("glove.6B.300d.txt","r", encoding="utf-8")
glove_emb={}
for line in fp1:
	temp=line.split(" ")
	glove_emb[temp[0]]=np.asarray([float(i) for i in temp[1:]])

print("Embedding done")

Embedding done


In [28]:
essay_type = '10'

fp=open("data/train_rel_2.tsv",'r', encoding="ascii", errors="ignore")
fp.readline()
originals = []
for line in fp:
    temp=line.split("\t")
    if(temp[1]==essay_type): ## why only 4 ?? - evals in prompt specific fashion
        originals.append(float(temp[2]))
# print(originals)
fp.close()
# print(originals)
print("range min - ", min(originals) , " ; range max - ", max(originals))

range_min = min(originals)
range_max = max(originals)

# range_min = 1
# range_max = 6

fp=open("data/train_rel_2.tsv",'r', encoding="ascii", errors="ignore")
fp.readline()
sentences=[]
for line in fp:
    temp=line.split("\t")
    if(temp[1]==essay_type): ## why only 4 ?? - evals in prompt specific fashion
        texts.append(temp[4])
        labels.append((float(temp[2])-range_min)/(range_max-range_min)) ## why ??  - normalize to range [0-1]
        line=temp[4].strip()
#         print(line)
        sentences.append(nltk.tokenize.word_tokenize(line))

fp.close()

range min -  0.0  ; range max -  2.0


In [29]:
labels
print(len(labels))
print(len(sentences))
print(len(texts))

1640
1640
1640


In [30]:
print("text labels appended %s" %len(texts))

labels=np.asarray(labels)
print(labels)
print(len(labels))

text labels appended 1640
[1.  0.5 1.  ... 0.5 0.5 0. ]
1640


In [31]:
for i in sentences:
	temp1=np.zeros((1, EMBEDDING_DIM))
	for w in i:
		if(w in glove_emb):
			temp1+=glove_emb[w]
	temp1/=len(i)

In [32]:
tokenizer=Tokenizer(num_words = MAX_NB_WORDS) #num_words=MAX_NB_WORDS) #limits vocabulory size
tokenizer.fit_on_texts(texts) #encoding the text
sequences=tokenizer.texts_to_sequences(texts) #returns list of sequences
word_index=tokenizer.word_index #dictionary mapping, word and specific token for that word...
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) #padding to max_length


print('Shape of data tensor:', data.shape)

Found 2560 unique tokens.
Shape of data tensor: (1640, 500)


In [33]:
indices=np.arange(data.shape[0]) #with one argument, start=0, step =1
print(data.shape)
np.random.shuffle(indices)
data=data[indices]
# print(data.shape)
labels=labels[indices]
# np.reshape(labels, ())
print(labels.shape)
validation_size=int(VALIDATION_SPLIT*data.shape[0])
print(validation_size)

(1640, 500)
(1640,)
328


In [34]:
x_train=data[:-validation_size] #data-validation data
print(x_train.shape)
# print(x_train)
# print(labels)
y_train=labels[:-validation_size]
# print(y_train.transpose)
print(y_train.shape)
# y_train = np.reshape(y_train, (1427, 1))
# print(y_train_new)
# print(y_train)
x_val=data[-validation_size:]
print(x_val.shape)
y_val=labels[-validation_size:]

(1312, 500)
(1312,)
(328, 500)


In [35]:
embedding_matrix = np.zeros((len(word_index), EMBEDDING_DIM))
print(embedding_matrix.shape)

(2560, 300)


In [36]:
for word,i in word_index.items():
	if(i>=len(word_index)):
		continue
	if word in glove_emb:
			embedding_matrix[i]=glove_emb[word]
vocab_size=len(word_index)
print(vocab_size)

2560


In [37]:
embedding_layer=Embedding(vocab_size,EMBEDDING_DIM,weights=[embedding_matrix],
							input_length=MAX_SEQUENCE_LENGTH,
							mask_zero=True,
							trainable=False)
# print(embedding_layer.shape)
side_embedding_layer=Embedding(vocab_size,EMBEDDING_DIM,weights=[embedding_matrix],
							input_length=MAX_SEQUENCE_LENGTH,
							mask_zero=False,
							trainable=False)

In [38]:
def SKIPFLOW(lstm_dim=50, lr=1e-4, lr_decay=1e-6, k=4, eta=3, delta=50, activation="relu", maxlen=MAX_SEQUENCE_LENGTH, seed=None):
    e = Input(name='essay',shape=(maxlen,))
    print("e", e)
#     trad_feats=Input(shape=(7,))
#     print("trad_feats", trad_feats)
    embed = embedding_layer(e)
    print(embed.shape)
    lstm_layer=LSTM(lstm_dim,return_sequences=True)
    print(lstm_layer)
    hidden_states=lstm_layer(embed)
    htm=Temporal_Mean_Pooling()(hidden_states)    
    side_embed = side_embedding_layer(e)
    side_hidden_states=lstm_layer(side_embed)    
    tensor_layer=Neural_Tensor_layer(output_dim=k,input_dim=lstm_dim)
#     print(input_dim, output_dim)
    pairs = [((eta + i * delta) % maxlen, (eta + i * delta + delta) % maxlen) for i in range(maxlen // delta)]
    hidden_pairs = [ (Lambda(lambda t: t[:, p[0], :])(side_hidden_states), Lambda(lambda t: t[:, p[1], :])(side_hidden_states)) for p in pairs]
    sigmoid = Dense(1, activation="sigmoid", kernel_initializer=initializers.glorot_normal(seed=seed))
    coherence = [sigmoid(tensor_layer([hp[0], hp[1]])) for hp in hidden_pairs]
    co_tm=Concatenate()(coherence[:]+[htm])
    dense = Dense(256, activation=activation,kernel_initializer=initializers.glorot_normal(seed=seed))(co_tm)
    dense = Dense(128, activation=activation,kernel_initializer=initializers.glorot_normal(seed=seed))(dense)
    dense = Dense(64, activation=activation,kernel_initializer=initializers.glorot_normal(seed=seed))(dense)
    out = Dense(1, activation="sigmoid")(dense)
    model = Model(inputs=[e], outputs=[out])
    print("input", [e])
    print("outputs", out)
    adam = Adam(lr=lr, decay=lr_decay)
    model.compile(loss="mean_squared_error", optimizer=adam, metrics=["MSE"])
    return model

In [39]:
# from keras.utils.vis_utils import plot_model
earlystopping = EarlyStopping(monitor="val_mean_squared_error", patience=5)
sf_1 = SKIPFLOW(lstm_dim=500, lr=2e-4, lr_decay=2e-6, k=4, eta=13, delta=50, activation="relu", seed=None)
sf_1.summary()
# plot_model(sf_1)

e Tensor("essay_1:0", shape=(?, 500), dtype=float32)
(?, 500, 300)
<keras.layers.recurrent.LSTM object at 0x7f05c24db198>
input [<tf.Tensor 'essay_1:0' shape=(?, 500) dtype=float32>]
outputs Tensor("dense_10/Sigmoid:0", shape=(?, 1), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
essay (InputLayer)              (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 500, 300)     768000      essay[0][0]                      
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 500, 500)     1602000     embedding_3[0][0]                
                                                          

In [40]:
# print(sf_1)
epochs = 100
# epochs = 1000
print(type(x_train))
# y_train = np.asarray(y_train)
print(type(y_train))

sf_1.fit(x_train, y_train, batch_size=32, epochs=epochs, validation_data=([x_val], y_val), callbacks=[earlystopping])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Train on 1312 samples, validate on 328 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


<keras.callbacks.History at 0x7f05c3372f60>

In [41]:
y_pred=sf_1.predict([x_val])
y_pred

array([[0.51576495],
       [0.9658915 ],
       [0.47671583],
       [0.91740537],
       [0.42550012],
       [0.43072212],
       [0.9679698 ],
       [0.93470377],
       [0.08579108],
       [0.46050602],
       [0.50039715],
       [0.62147593],
       [0.19804919],
       [0.36389017],
       [0.8850249 ],
       [0.5059503 ],
       [0.05699658],
       [0.95597506],
       [0.9810964 ],
       [0.4625445 ],
       [0.96305126],
       [0.963956  ],
       [0.9697412 ],
       [0.49433717],
       [0.47930503],
       [0.4753928 ],
       [0.50283283],
       [0.02202386],
       [0.57478154],
       [0.9709096 ],
       [0.50503075],
       [0.52067983],
       [0.9449668 ],
       [0.08881131],
       [0.4514292 ],
       [0.44479707],
       [0.48355597],
       [0.48702008],
       [0.47781163],
       [0.9753257 ],
       [0.4284686 ],
       [0.97958887],
       [0.42631036],
       [0.3769688 ],
       [0.9657771 ],
       [0.89152646],
       [0.1464966 ],
       [0.423

In [42]:
y_val_fin = [int(round(a*(range_max-range_min)+range_min)) for a in y_val]
print(y_val_fin)

[1, 2, 0, 2, 1, 1, 2, 1, 0, 1, 0, 0, 1, 0, 2, 1, 0, 2, 0, 1, 1, 2, 2, 1, 1, 1, 1, 0, 1, 0, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 2, 1, 1, 1, 2, 0, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 0, 2, 1, 1, 1, 1, 1, 0, 2, 1, 2, 2, 2, 1, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 1, 2, 1, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 0, 1, 2, 1, 1, 2, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 1, 2, 1, 0, 1, 1, 1, 0, 2, 1, 2, 1, 2, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 0, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 0, 2, 0, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 0, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 1, 1, 2, 0, 2, 1, 2, 2, 2, 1, 2, 1, 1, 0, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0, 2, 0, 2, 0, 1, 1, 0, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 1, 1, 1, 0, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 2, 2, 0, 1, 0, 0, 2, 1, 1, 1, 1, 0, 2, 1, 2, 2, 1, 1]


In [44]:
y_pred_fin =[int(round(a*(range_max-range_min)+range_min)) for a in y_pred.reshape(328).tolist()]
print(y_pred_fin)

[1, 2, 1, 2, 1, 1, 2, 2, 0, 1, 1, 1, 0, 1, 2, 1, 0, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0, 1, 2, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 0, 2, 1, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 0, 1, 1, 2, 1, 2, 1, 2, 0, 2, 2, 1, 2, 2, 1, 0, 1, 2, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1, 2, 2, 1, 1, 2, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 0, 2, 2, 0, 1, 1, 2, 2, 1, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 0, 1, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 0, 1, 2, 2, 2, 0, 1, 2, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 1, 1, 0, 1, 0, 2, 0, 2, 2, 1, 1, 0, 1, 2, 2, 2, 1, 2, 2, 1, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 1, 0, 2, 1, 2, 1, 0, 2, 0, 0, 2, 1, 1, 1, 1, 0, 2, 1, 2, 2, 0, 1]


In [45]:
print(cohen_kappa_score(y_val_fin,y_pred_fin,weights="quadratic"))

0.6933831897252479


In [46]:
def Cmatrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def QWK_new(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = Cmatrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

In [47]:
QWK_new(y_val_fin, y_pred_fin)

0.6933831897252478

In [48]:
sf_1.save('model_final/10_model.h5')

In [49]:
y_pred*(range_max-range_min)+range_min

array([[1.0315299 ],
       [1.931783  ],
       [0.95343167],
       [1.8348107 ],
       [0.85100025],
       [0.86144423],
       [1.9359396 ],
       [1.8694075 ],
       [0.17158216],
       [0.92101204],
       [1.0007943 ],
       [1.2429519 ],
       [0.39609838],
       [0.72778034],
       [1.7700498 ],
       [1.0119005 ],
       [0.11399317],
       [1.9119501 ],
       [1.9621928 ],
       [0.925089  ],
       [1.9261025 ],
       [1.927912  ],
       [1.9394825 ],
       [0.98867434],
       [0.95861006],
       [0.9507856 ],
       [1.0056657 ],
       [0.04404771],
       [1.1495631 ],
       [1.9418192 ],
       [1.0100615 ],
       [1.0413597 ],
       [1.8899336 ],
       [0.17762262],
       [0.9028584 ],
       [0.88959414],
       [0.96711195],
       [0.97404015],
       [0.95562327],
       [1.9506514 ],
       [0.8569372 ],
       [1.9591777 ],
       [0.8526207 ],
       [0.7539376 ],
       [1.9315542 ],
       [1.7830529 ],
       [0.2929932 ],
       [0.847

In [50]:
sf_1.save_weights('weights_final/10_weights.h5')

In [31]:
!pip install pydot
# !pip install graphviz
from keras.utils import plot_model
plot_model(sf_1)

Collecting pyparsing>=2.1.4
  Using cached pyparsing-2.4.6-py2.py3-none-any.whl (67 kB)
Installing collected packages: pyparsing
Successfully installed pyparsing-2.4.6


ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

Embedding done


In [18]:
0.7132881562982345

0.7132881562982345

In [None]:
0.7132881562982345 1
0.5567800670819358 2
0.5590659472768016 3
0.6475944737402457 4
0.705901006844428  5
0.7511181679935593 6
0.5626685892575236 7
0.5539630085049697 8

0.6933831897252478 10