In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from scipy import spatial
from textblob import Word
from textblob import TextBlob
import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim
# model = gensim.models.KeyedVectors.load_word2vec_format('~/downloads/rnd/data/GoogleNews-vectors-negative300.bin.gz', binary=True)

from scipy import spatial
from sklearn.preprocessing import MinMaxScaler

In [None]:
def word_embed(word, embeddings=None, dim=300, fill_value=0.01):
    """Look up the embedding of a single word as a one-row matrix.

    Parameters
    ----------
    word : str
        Token to look up.
    embeddings : mapping-like, optional
        Object supporting ``embeddings[word]`` that raises ``KeyError`` for
        unknown words. Defaults to the notebook-level ``model``.
    dim : int, optional
        Width of the fallback vector used for unknown words (default 300).
    fill_value : float, optional
        Constant that fills the fallback vector (default 0.01).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, d)``: the stored vector reshaped to a single
        row, or ``fill_value`` repeated ``dim`` times if the word is unknown.
    """
    if embeddings is None:
        # Resolved at call time so this cell can run before `model` is loaded.
        embeddings = model
    try:
        vec = np.asarray(embeddings[word])
    except KeyError:
        # Out-of-vocabulary word: fall back to a small constant vector.
        # Only KeyError is handled so that real problems (e.g. `model` not
        # being loaded) are not silently turned into constant embeddings.
        return np.full((1, dim), fill_value)
    return vec.reshape(1, vec.shape[0])

def _sum_word_embeddings(sentence):
    """Sum the ``word_embed`` vectors of every whitespace-separated token.

    Returns whatever ``np.sum(..., axis=0)`` yields for the list of per-word
    vectors; for a sentence without tokens that is the scalar ``0.0``.
    """
    word_vectors = [word_embed(token) for token in sentence.split()]
    return np.sum(word_vectors, axis=0)


def model_embed_demoted_ref(data):
    """Sentence embedding of ``data['ref_demoted']`` (sum of its word vectors)."""
    return _sum_word_embeddings(data['ref_demoted'])


def model_embed_demoted_stud(data):
    """Sentence embedding of ``data['student_demoted']`` (sum of its word vectors)."""
    return _sum_word_embeddings(data['student_demoted'])


def model_embed_ref(data):
    """Sentence embedding of ``data['ref_modified']`` (sum of its word vectors)."""
    return _sum_word_embeddings(data['ref_modified'])


def model_embed_stud(data):
    """Sentence embedding of ``data['student_modified']`` (sum of its word vectors)."""
    return _sum_word_embeddings(data['student_modified'])

## Question demoting functions

def student_demoting(data):
    """Remove the question's words from the student answer (question demoting).

    Parameters
    ----------
    data : mapping
        Row providing the strings ``'student_modified'`` and ``'qn_modified'``.

    Returns
    -------
    str
        The tokens of ``data['student_modified']`` that do not occur as a
        whole token in ``data['qn_modified']``, joined by single spaces.
    """
    # Compare against whole question tokens; testing `word in question_string`
    # would be a substring check and also drop e.g. "net" because of "network".
    question_words = set(data['qn_modified'].split())
    return " ".join(word for word in data['student_modified'].split() if word not in question_words)

def ref_demoting(data):
    """Remove the question's words from the reference answer (question demoting).

    Parameters
    ----------
    data : mapping
        Row providing the strings ``'ref_modified'`` and ``'qn_modified'``.

    Returns
    -------
    str
        The tokens of ``data['ref_modified']`` that do not occur as a whole
        token in ``data['qn_modified']``, joined by single spaces.
    """
    # Compare against whole question tokens; testing `word in question_string`
    # would be a substring check and also drop e.g. "net" because of "network".
    question_words = set(data['qn_modified'].split())
    return " ".join(word for word in data['ref_modified'].split() if word not in question_words)


## Preprocessing

In [None]:
# df = pd.read_pickle("./nn_final_wt_ref.pkl")
# NOTE: `df` must already hold the raw 'question', 'ref_answer' and
# 'student_answer' columns before this cell runs (the load above is disabled).

# Raw column -> cleaned column produced by the steps below.
cleaned_from_raw = {
    'question': 'qn_modified',
    'ref_answer': 'ref_modified',
    'student_answer': 'student_modified',
}
cleaned_columns = list(cleaned_from_raw.values())

#converting to lower case
for raw_col, col in cleaned_from_raw.items():
    df[col] = df[raw_col].apply(lambda text: " ".join(tok.lower() for tok in text.split()))

#punctuation removal
# regex=True is explicit on purpose: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave punctuation in place.
# The raw string avoids invalid-escape warnings for \w and \s.
for col in cleaned_columns:
    df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)

#stop word removal
stop = stopwords.words('english')
stop_words = set(stop)  # set membership instead of scanning the list per token
for col in cleaned_columns:
    df[col] = df[col].apply(lambda text: " ".join(tok for tok in text.split() if tok not in stop_words))

#lemmatisation
for col in cleaned_columns:
    df[col] = df[col].apply(lambda text: " ".join([Word(tok).lemmatize() for tok in word_tokenize(text)]))

#question demoting
df['student_demoted'] = df.apply(student_demoting,axis=1)
df['ref_demoted'] = df.apply(ref_demoting,axis=1)

#length ratio
# Ratio of string (character) lengths of the cleaned answers, not token counts.
# An empty cleaned reference answer gives a zero denominator (inf/NaN result).
df['length_ratio'] = df['student_modified'].str.len() / df['ref_modified'].str.len()

#getting the word embeddings
df['embed_ref'] = df.apply(model_embed_ref,axis = 1)
df['embed_stud'] = df.apply(model_embed_stud,axis = 1)

df['embed_ref_demoted'] = df.apply(model_embed_demoted_ref,axis = 1)
df['embed_stud_demoted'] = df.apply(model_embed_demoted_stud,axis = 1)


# df.to_pickle('nn_final_wt_ref_embed.pkl')

## Numeric removal

In [None]:

##numeric removal
# Drop every token made up solely of digits, first from the cleaned answers
# and then from their question-demoted versions.
for column in ('ref_modified', 'student_modified', 'ref_demoted', 'student_demoted'):
    df[column] = df[column].apply(
        lambda text: " ".join(token for token in text.split() if not token.isdigit())
    )


## Ramesh's code for the word aligner
Put the following script under
`~/Semantic-Textual-Similarity/monolingualWordAligner`

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from wordAligner import *
import sys
import pickle
import numpy as np
import pandas as pd
#sentence1 = "Four people died in accident. Well, United Arab Emirates is one of powerful country"
#sentence2 = "Seven men are dead due to collisions."



#print "sentence1 = ", sentence1
#print "sentence2 = ", sentence2


def align_sentence_demoted(data):
    """Align the demoted reference answer with the demoted student answer.

    Both ``data['ref_demoted']`` and ``data['student_demoted']`` are coerced
    with ``str``. If either string is empty an empty list is returned;
    otherwise the result of ``Aligner(flag).align_sentences`` is returned.
    ``flag`` is a global that is not defined in this cell.
    """
    reference = str(data['ref_demoted'])
    answer = str(data['student_demoted'])
    if not reference or not answer:
        return []
    return Aligner(flag).align_sentences(reference, answer)

def align_sentence(data):
    """Align the cleaned reference answer with the cleaned student answer.

    Both ``data['ref_modified']`` and ``data['student_modified']`` are coerced
    with ``str``. If either string is empty an empty list is returned;
    otherwise the result of ``Aligner(flag).align_sentences`` is returned.
    ``flag`` is a global that is not defined in this cell.
    """
    reference = str(data['ref_modified'])
    answer = str(data['student_modified'])
    if not reference or not answer:
        return []
    return Aligner(flag).align_sentences(reference, answer)


## Computing the cosine similarity and alignment ratio

In [5]:
def cos_similarity_demo(data):
    """Cosine similarity of the question-demoted reference and student embeddings.

    Parameters
    ----------
    data : mapping
        Row providing ``'embed_ref_demoted'`` and ``'embed_stud_demoted'``.

    Returns
    -------
    float
        ``1 - cosine distance`` of the two flattened embeddings, or NaN when
        the similarity is undefined (shapes differ or a vector is all zeros).
    """
    # Flatten first: the embeddings are stored as (1, d) rows, and recent
    # SciPy releases only accept 1-D vectors in distance.cosine.
    ref_vec = np.ravel(data['embed_ref_demoted'])
    stud_vec = np.ravel(data['embed_stud_demoted'])
    # An empty sentence sums to the scalar 0.0, i.e. a mismatching/zero vector;
    # report NaN explicitly instead of relying on a 0/0 runtime warning.
    if ref_vec.shape != stud_vec.shape or not ref_vec.any() or not stud_vec.any():
        return np.nan
    return 1 - spatial.distance.cosine(ref_vec, stud_vec)

def cos_similarity(data):
    """Cosine similarity of the reference and student sentence embeddings.

    Parameters
    ----------
    data : mapping
        Row providing ``'embed_ref'`` and ``'embed_stud'``.

    Returns
    -------
    float
        ``1 - cosine distance`` of the two flattened embeddings, or NaN when
        the similarity is undefined (shapes differ or a vector is all zeros).
    """
    # Flatten first: the embeddings are stored as (1, d) rows, and recent
    # SciPy releases only accept 1-D vectors in distance.cosine.
    ref_vec = np.ravel(data['embed_ref'])
    stud_vec = np.ravel(data['embed_stud'])
    # An empty sentence sums to the scalar 0.0, i.e. a mismatching/zero vector;
    # report NaN explicitly instead of relying on a 0/0 runtime warning.
    if ref_vec.shape != stud_vec.shape or not ref_vec.any() or not stud_vec.any():
        return np.nan
    return 1 - spatial.distance.cosine(ref_vec, stud_vec)

def align_ratio(data):
    """Share of words covered by the alignment of reference and student answer.

    Parameters
    ----------
    data : mapping
        Row providing ``'aligned'`` (a sized collection of aligned pairs) and
        the strings ``'ref_answer'`` and ``'student_answer'``.

    Returns
    -------
    float
        ``2 * len(aligned) / (words in ref_answer + words in student_answer)``,
        or ``0.0`` when both answers contain no words.
    """
    total_words = len(data['ref_answer'].split()) + len(data['student_answer'].split())
    if total_words == 0:
        # Nothing to align; avoid a ZeroDivisionError on two empty answers.
        return 0.0
    return (2 * len(data['aligned'])) / total_words

def align_ratio_demo(data, eps=1e-5):
    """Share of words covered by the alignment of the question-demoted answers.

    Parameters
    ----------
    data : mapping
        Row providing ``'aligned_demoted'`` (a sized collection of aligned
        pairs) and the strings ``'ref_demoted'`` and ``'student_demoted'``.
    eps : float, optional
        Small constant added to the denominator so that two empty demoted
        answers give ``0.0`` instead of a division by zero.

    Returns
    -------
    float
        ``2 * len(aligned_demoted) / (eps + total number of demoted words)``.
    """
    total_words = len(data['ref_demoted'].split()) + len(data['student_demoted'].split())
    # The smoothing term belongs inside the denominator; dividing by it alone
    # and then adding the word count yields a meaningless score.
    return (2 * len(data['aligned_demoted'])) / (eps + total_words)

In [None]:
#df['cos_similarity'] = df.apply(cos_similarity, axis=1)
#df['cos_similarity_demoted'] = df.apply(cos_similarity_demo, axis=1)

#getting aligned scores
#df['aligned_score'] = df.apply(align_ratio, axis=1)
#df['aligned_score_demo'] = df.apply(align_ratio_demo, axis=1)


## Handling zeros and NaN

In [None]:
# Replace missing values with the sentinel -1 before scaling.
# NOTE(review): -1 becomes the minimum seen by the scaler for any column that
# had missing values, which shifts the scaled range of the real values —
# confirm this is intended.
df.fillna(-1, inplace=True)
mms = MinMaxScaler()
# Feature columns under the names used by the diagnostics below and by the
# saved frames ('aligned_score*', 'cos_similarity_demo'); the previous names
# ('align_ratio', 'align_ratio_demoted', 'cos_similarity_demoted') are never
# created and raise a KeyError.
feature_columns = ['length_ratio', 'aligned_score', 'aligned_score_demo', 'cos_similarity', 'cos_similarity_demo']
df[feature_columns] = mms.fit_transform(df[feature_columns])


# Sanity checks: number of negative and of missing similarity values.
print (np.sum(df['cos_similarity']<0))
print (np.sum(df['cos_similarity'].isnull()))
print (np.sum(df['cos_similarity_demo']<0))
print (np.sum(df['cos_similarity_demo'].isnull()))

## Normalisation


## Modifying the preprocessing (skipping numeric removal)

In [None]:
#converting to lower case
df['qn_modified'] = df['question'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['ref_modified'] = df['ref_answer'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['student_modified'] = df['student_answer'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#punctuation removal
df['qn_modified'] = df['qn_modified'].str.replace('[^\w\s]','')
df['ref_modified'] = df['ref_modified'].str.replace('[^\w\s]','')
df['student_modified'] = df['student_modified'].str.replace('[^\w\s]','')

#stop word removal
stop = stopwords.words('english')
df['qn_modified'] = df['qn_modified'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['ref_modified'] = df['ref_modified'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['student_modified'] = df['student_modified'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#lemmatisation
df['qn_modified'] = df['qn_modified'].apply(lambda x: " ".join([Word(word).lemmatize() for word in word_tokenize(x)]))
df['ref_modified'] = df['ref_modified'].apply(lambda x: " ".join([Word(word).lemmatize() for word in word_tokenize(x)]))
df['student_modified'] = df['student_modified'].apply(lambda x: " ".join([Word(word).lemmatize() for word in word_tokenize(x)]))

## Rough

In [18]:
# Load the pickled frame from the final-dataset folder and show its row count.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle("../../dataset/final_dataset/mohler_final.pkl")
len(df)

2442

In [4]:
#punctuation removal
df['qn_modified'] = df['qn_modified'].str.replace('[^\w\s]','')
df['ref_modified'] = df['ref_modified'].str.replace('[^\w\s]','')
df['student_modified'] = df['student_modified'].str.replace('[^\w\s]','')

In [10]:
# Strip underscores from the cleaned, the demoted and the raw student text.
for column in ('ref_modified', 'student_modified', 'student_demoted', 'ref_demoted', 'student_answer'):
    df[column] = df[column].str.replace('_','')

In [43]:
# Reload the pickled frame; the functions below read its `embed_*` and
# `aligned*` columns, so it presumably already carries them — verify.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle("../../../../../nn_final_wt_ref_embed.pkl")
def cos_similarity_demo(data):
    """Cosine similarity of the question-demoted reference and student embeddings.

    Parameters
    ----------
    data : mapping
        Row providing ``'embed_ref_demoted'`` and ``'embed_stud_demoted'``.

    Returns
    -------
    float
        ``1 - cosine distance`` of the two flattened embeddings, or NaN when
        the similarity is undefined (shapes differ or a vector is all zeros).
    """
    # Flatten first: the embeddings are stored as (1, d) rows, and recent
    # SciPy releases only accept 1-D vectors in distance.cosine.
    ref_vec = np.ravel(data['embed_ref_demoted'])
    stud_vec = np.ravel(data['embed_stud_demoted'])
    # An empty sentence sums to the scalar 0.0, i.e. a mismatching/zero vector;
    # report NaN explicitly instead of relying on a 0/0 runtime warning.
    if ref_vec.shape != stud_vec.shape or not ref_vec.any() or not stud_vec.any():
        return np.nan
    return 1 - spatial.distance.cosine(ref_vec, stud_vec)

def cos_similarity(data):
    """Cosine similarity of the reference and student sentence embeddings.

    Parameters
    ----------
    data : mapping
        Row providing ``'embed_ref'`` and ``'embed_stud'``.

    Returns
    -------
    float
        ``1 - cosine distance`` of the two flattened embeddings, or NaN when
        the similarity is undefined (shapes differ or a vector is all zeros).
    """
    # Flatten first: the embeddings are stored as (1, d) rows, and recent
    # SciPy releases only accept 1-D vectors in distance.cosine.
    ref_vec = np.ravel(data['embed_ref'])
    stud_vec = np.ravel(data['embed_stud'])
    # An empty sentence sums to the scalar 0.0, i.e. a mismatching/zero vector;
    # report NaN explicitly instead of relying on a 0/0 runtime warning.
    if ref_vec.shape != stud_vec.shape or not ref_vec.any() or not stud_vec.any():
        return np.nan
    return 1 - spatial.distance.cosine(ref_vec, stud_vec)

def align_ratio(data):
    """Share of words covered by the alignment of reference and student answer.

    Parameters
    ----------
    data : mapping
        Row providing ``'aligned'`` (a sized collection of aligned pairs) and
        the strings ``'ref_answer'`` and ``'student_answer'``.

    Returns
    -------
    float
        ``2 * len(aligned) / (words in ref_answer + words in student_answer)``,
        or ``0.0`` when both answers contain no words.
    """
    total_words = len(data['ref_answer'].split()) + len(data['student_answer'].split())
    if total_words == 0:
        # Nothing to align; avoid a ZeroDivisionError on two empty answers.
        return 0.0
    return (2 * len(data['aligned'])) / total_words

def align_ratio_demo(data):
    """Alignment ratio computed on the question-demoted answers.

    Returns twice the number of entries in ``data['aligned_demoted']`` divided
    by the combined word count of ``data['ref_demoted']`` and
    ``data['student_demoted']``; 1e-5 is added to the denominator so that two
    empty strings do not cause a division by zero.
    """
    ref_word_count = len(data['ref_demoted'].split())
    student_word_count = len(data['student_demoted'].split())
    aligned_twice = 2*len(data['aligned_demoted'])
    denominator = 1e-5+(ref_word_count + student_word_count)
    return aligned_twice / denominator


# Row-wise features: cosine similarities first, then the aligned scores.
row_features = [
    ('cos_similarity', cos_similarity),
    ('cos_similarity_demo', cos_similarity_demo),
    ('aligned_score', align_ratio),
    ('aligned_score_demo', align_ratio_demo),
]
for feature_name, feature_fn in row_features:
    df[feature_name] = df.apply(feature_fn, axis=1)

# Missing values become 0, then the numeric features are min-max scaled.
df.fillna(0, inplace=True)
mms = MinMaxScaler()
scaled_columns = ['length_ratio', 'aligned_score', 'aligned_score_demo', 'cos_similarity', 'cos_similarity_demo']
df[scaled_columns] = mms.fit_transform(df[scaled_columns])

# Sanity checks: count of negative and of missing values per similarity column.
for checked_column in ('cos_similarity', 'cos_similarity_demo'):
    print (np.sum(df[checked_column]<0))
    print (np.sum(df[checked_column].isnull()))

0
0
0
0


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [45]:
# Save the current frame to the final-dataset folder as nn_final.pkl.
df.to_pickle("../../dataset/final_dataset/nn_final.pkl")

In [46]:
# Display the first rows of the frame as a quick visual check.
df.head()

Unnamed: 0,question,student_answer,grades_round,student_modified,ref_answer,qn_modified,ref_modified,student_demoted,ref_demoted,length_ratio,embed_ref,embed_stud,embed_ref_demoted,embed_stud_demoted,aligned,aligned_demoted,cos_similarity,cos_similarity_demo,aligned_score,aligned_score_demo
0,"Give a definition for the term ""artificial ne...",An artificial neural network is a massively pa...,2,artificial neural network massively parallel d...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,massively parallel distributed processor simpl...,massively parallel distributed processor made ...,0.251889,"[[1.5640869, 1.7378178, -0.1736145, 2.0961304,...","[[2.2006836, 0.86382484, 0.27182007, 2.5562744...","[[1.6300049, 1.5985355, -0.1282959, 1.0488892,...","[[2.0412598, 0.49321938, 0.10058594, 1.2648926...","[[neural, neural], [network, network], [massiv...","[[simple, simple], [processing, processing], [...",0.947867,0.933466,0.969697,0.950888
1,"Give a definition for the term ""artificial ne...",Artificial neural network consists of: . Large...,2,artificial neural network consists largely par...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,consists largely parallel distributed processo...,massively parallel distributed processor made ...,0.232759,"[[1.5640869, 1.7378178, -0.1736145, 2.0961304,...","[[1.335439453125, 1.0990445709228516, 0.529989...","[[1.6300049, 1.5985355, -0.1282959, 1.0488892,...","[[1.1956689453125, 0.7539517974853516, 0.13561...","[[knowledge, knowledge], [parallel, parallel],...","[[knowledge, knowledge], [knowledge, knowledge...",0.964398,0.951182,0.883259,0.818713
2,"Give a definition for the term ""artificial ne...",An artificial neural network is a massive dist...,1,artificial neural network massive distributed ...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,massive distributed processor consists several...,massively parallel distributed processor made ...,0.102828,"[[1.5640869, 1.7378178, -0.1736145, 2.0961304,...","[[0.41577148, -0.37836266, 0.22351074, 0.95300...","[[1.6300049, 1.5985355, -0.1282959, 1.0488892,...","[[0.38427734, -0.48944664, 0.17224121, 0.55065...","[[knowledge, knowledge], [neural, neural], [ne...","[[knowledge, knowledge], [distributed, distrib...",0.854767,0.775333,0.498039,0.465632
3,"Give a definition for the term ""artificial ne...",An ANN is a layered graphical model containing...,2,ann layered graphical model containing neuron ...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,ann layered graphical model containing neuron ...,massively parallel distributed processor made ...,0.327616,"[[1.5640869, 1.7378178, -0.1736145, 2.0961304,...","[[2.1478271, 1.4641495, -0.3640442, 0.5910034,...","[[1.6300049, 1.5985355, -0.1282959, 1.0488892,...","[[1.9754639, 1.1296768, -0.6564026, 0.30181885...","[[resemble, resembling], [neural, neuron], [le...","[[environment, environment], [learning, traini...",0.788166,0.735229,0.32295,0.220386
4,"Give a definition for the term ""artificial ne...",Artificial Neural Networks are large parallel ...,2,artificial neural network large parallel proce...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,large parallel processing unit natural ability...,massively parallel distributed processor made ...,0.286963,"[[1.5640869, 1.7378178, -0.1736145, 2.0961304,...","[[0.8804833984375, 1.3045060729980469, -0.4420...","[[1.6300049, 1.5985355, -0.1282959, 1.0488892,...","[[0.687978515625, 0.7240617370605469, -0.85735...","[[knowledge, knowledge], [processing, processi...","[[knowledge, knowledge], [processing, processi...",0.894408,0.828665,0.585639,0.482094


In [37]:


# Count the negative and the missing values in the 'cos_similarity' column.
print (np.sum(df['cos_similarity']<0))
print (np.sum(df['cos_similarity'].isnull()))
# Same checks for 'cos_similarity_demo' (currently disabled).
# print (np.sum(df['cos_similarity_demo']<0))
# print (np.sum(df['cos_similarity_demo'].isnull()))

0
40


In [2]:
# Load the pickled frame into a separate variable and list its columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df1 = pd.read_pickle("../../dataset/final_dataset/mohler_final.pkl")
df1.columns

Index(['question_id', 'question', 'ref_answer', 'student_answer', 'grade',
       'qn_modified', 'ref_modified', 'student_modified', 'student_demoted',
       'ref_demoted', 'length_ratio', 'grades_round', 'embed_ref',
       'embed_stud', 'aligned', 'aligned_demoted', 'embed_ref_demoted',
       'embed_stud_demoted', 'cos_similarity', 'cos_similarity_demo',
       'aligned_score', 'aligned_score_demo', 'status'],
      dtype='object')

In [8]:
# Load the pickled frame (this rebinds `df`) and list its columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle("../../dataset/final_dataset/sem_eval_train.pkl")
df.columns

Index(['Unnamed: 0', 'question', 'question_id', 'ref_answer', 'student_answer',
       'result', 'grades_round', 'student_modified', 'qn_modified',
       'ref_modified', 'student_demoted', 'ref_demoted', 'length_ratio',
       'embed_ref', 'embed_stud', 'embed_ref_demoted', 'embed_stud_demoted',
       'aligned', 'aligned_demoted', 'cos_similarity', 'cos_similarity_demo',
       'aligned_score', 'aligned_score_demo', 'status'],
      dtype='object')

In [5]:
# Rename 'ques_id' -> 'question_id' and 'reference_answer' -> 'ref_answer',
# modifying `df` in place.
df.rename(columns={"ques_id":"question_id","reference_answer":"ref_answer"},inplace=True)

In [7]:
# Write the current frame to sem_eval_train.pkl in the final-dataset folder.
df.to_pickle("../../dataset/final_dataset/sem_eval_train.pkl")