In [1]:
# Imports
import json, os, re, shutil, sys, time
import seaborn as sns
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML
from collections import defaultdict
import xmltodict
import untangle
import xml.etree.ElementTree as ET
# NLTK for NLP utils and corpora
import nltk
from nltk.corpus import treebank
from nltk.text import Text
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

# Import spacy
import spacy

# NumPy, Pandas and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf

from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

from numpy.random import seed
from pandas import read_csv, DataFrame
from sklearn.preprocessing import minmax_scale

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Dropout, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.layers.embeddings import Embedding

## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score

Using TensorFlow backend.


In [2]:
# Parse the AESW training corpus (an XML document) and grab its root element.
tree = ET.parse(
    '../data-DNC/AESW/aesw2016_v1.2_train.xml'
)
root = tree.getroot()

In [6]:
# Map each sentence ID (the `sid` attribute) to the `.text` of its <sentence> element.
sent_dict = {
    sentence.attrib['sid']: sentence.text
    for sentence in root.iter('sentence')
}

In [5]:
# Sanity check: number of distinct sentence IDs collected from the XML.
len(sent_dict)

1189412

In [7]:
# Load the tab-separated .tok file; it has no header row, so columns are
# numbered 0..2 until they are renamed.
aesw_train_labels = pd.read_csv(
    "../data-DNC/AESW/aesw2016_v1.2_train.tok",
    sep='\t',
    encoding="latin1",
    header=None,
)
aesw_train_labels.head(5)

Unnamed: 0,0,1,2
0,1,1.0,To facilitate an easier notation throughout t...
1,-1,1.0,To facilitate an easier notation throughout t...
2,0,1.1,Therefore MATH defines a special order of tim...
3,0,1.2,This is important since only MATH is the real...
4,0,1.3,Note that in all contour time-integrals we es...


It seems that the token file is sufficient for us, as it contains the sentences and their corrections. We do not need the XML file anymore.

In [8]:
# Give the three unnamed columns descriptive names.
aesw_train_labels.columns = ['Key_ID', 'Sent_ID', 'Text']

# Key_ID == 1 marks a corrected sentence, -1 the same sentence before
# correction, and 0 a sentence with no grammatical error.
# The corrected versions are dropped: we only need a binary
# "contains an error / does not" label, not the particulars of the fix.
#
# .copy() makes the filtered frame independent of the original, so the
# in-place relabelling done afterwards with .loc does not raise
# SettingWithCopyWarning (or silently write to a temporary).
aesw_train_labels = aesw_train_labels[aesw_train_labels.Key_ID != 1].copy()
len(aesw_train_labels)

1196903

In [9]:
# Relabel the erroneous (pre-correction) sentences from -1 to 1 so that the
# label column is binary: 0 = no error, 1 = error.
column_name = 'Key_ID'
mask = aesw_train_labels[column_name].eq(-1)
aesw_train_labels.loc[mask, column_name] = 1

In [10]:
# Peek at the first rows: only labels 0 (no error) and 1 (error) should remain.
aesw_train_labels.head(n=10)

Unnamed: 0,Key_ID,Sent_ID,Text
1,1,1.0,To facilitate an easier notation throughout t...
2,0,1.1,Therefore MATH defines a special order of tim...
3,0,1.2,This is important since only MATH is the real...
4,0,1.3,Note that in all contour time-integrals we es...
5,0,2.0,Theorem REF proves the equivalence of ensembl...
6,0,2.1,"Set the magnetic field MATH , call MATH the f..."
8,1,2.2,"We claim that MATHDISP where , as before , MA..."
10,1,2.3,In fact by the general theory after the limit...
12,1,3.0,"Inspired by Clegg -LRB- 2002 -RRB- , the benc..."
13,0,3.1,"It can be concluded that , usually , calculat..."


In [11]:
# Split the training frame into inputs (raw sentences) and labels (0/1).
# Series.as_matrix() was deprecated and later removed from pandas;
# .values returns the same underlying NumPy array on old and new versions.
x_train = aesw_train_labels.Text.values
y_train = aesw_train_labels.Key_ID.values

In [12]:
# Repeat the train-set preparation for the dev set:
# load -> name columns -> drop corrected sentences -> relabel -1 as 1.
aesw_dev_labels = pd.read_csv("../data-DNC/AESW/aesw2016_v1.2_dev.tok",sep='\t',encoding = "latin1",header=None)
aesw_dev_labels.columns = ['Key_ID','Sent_ID','Text']
# .copy() detaches the filtered frame so the .loc assignment below modifies
# it directly instead of raising SettingWithCopyWarning.
aesw_dev_labels = aesw_dev_labels[aesw_dev_labels.Key_ID!=1].copy()
mask = aesw_dev_labels.Key_ID == -1
column_name = 'Key_ID'
aesw_dev_labels.loc[mask, column_name] = 1
# Series.as_matrix() was removed from pandas; .values is the portable equivalent.
x_dev = aesw_dev_labels.Text.values
y_dev = aesw_dev_labels.Key_ID.values
aesw_dev_labels.head(5)

Unnamed: 0,Key_ID,Sent_ID,Text
0,0,1.0,The team of robots in Chapter REF and CITE cr...
2,1,1.1,The Hamiltonian -LRB- Lyapunov function -RRB-...
4,1,1.2,"Therefore , for MATH robots the time derivati..."
5,0,1.3,The estimator/guidance algorithm for finding ...
7,1,1.4,The feedback controller is MATHDISP and the s...


In [13]:
# Build the word index from the training sentences with the Keras Tokenizer;
# `num_words` caps the vocabulary used when texts are turned into sequences.
vocabulary_size = 50000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(texts=x_train)

In [14]:
# Convert each training sentence to a sequence of word indices, then pad all
# sequences to the length of the longest one (no `maxlen` is given).
# Later on, we will try limiting the padded length to 50.
train_seq = tokenizer.texts_to_sequences(x_train)
train_data = pad_sequences(sequences=train_seq)

In [15]:
# The padded width of the training matrix is the sequence length the models see.
max_seq_len = train_data.shape[-1]
print(max_seq_len)

271


In [16]:
# Encode the dev sentences with the tokenizer fitted on the training data and
# pad them to the same width as the training matrix.
dev_seq = tokenizer.texts_to_sequences(x_dev)
dev_data = pad_sequences(sequences=dev_seq, maxlen=max_seq_len)
dev_data.shape

(148409, 271)

In [None]:
# Baseline: a simple CNN binary classifier with an embedding layer trained
# from scratch (100-dimensional vectors).
seed(2017)
conv = Sequential([
    Embedding(vocabulary_size, 100, input_length=max_seq_len),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(1, activation='sigmoid'),  # single sigmoid unit: score for the error class
])
sgd = SGD(lr=0.1, momentum=0.9, decay=0, nesterov=False)
conv.compile(
    loss='binary_crossentropy',
    optimizer=sgd,
    metrics=['accuracy'],
)
conv.fit(
    train_data,
    y_train,
    batch_size=500,
    epochs=8,
    verbose=0,
)

In [None]:
# Score the dev set with the baseline CNN and threshold the sigmoid output at 0.5.
pred = conv.predict(dev_data)
# `np.int` was a deprecated alias of the builtin `int` and has been removed
# from NumPy; the builtin gives the same dtype. ravel() flattens the
# (n_samples, 1) predictions to (n_samples,) to match y_dev.
pred_labels = (pred > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_dev,pred_labels)
print("Accuracy: ",accuracy)

In [None]:
# Use GloVe embeddings instead of a trained embedding layer.
# Each line of the GloVe text file is "<word> <v1> <v2> ... <vN>".
# NOTE(review): this is a hard-coded absolute path on one machine — consider
# moving it to a configurable data directory.
embeddings_index = dict()
# `with` guarantees the file is closed even if parsing a line fails;
# the encoding is stated explicitly instead of relying on the locale default.
with open('/Users/kurapati/W266/data/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Build the (vocabulary_size x 300) weight matrix for the Embedding layer:
# row i holds the GloVe vector of the word whose tokenizer index is i.
# Words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    # Skip (rather than stop at) out-of-range indices: `break` is only correct
    # if word_index happens to iterate in ascending index order, which is not
    # guaranteed on every Python version.
    if index >= vocabulary_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Repeat the CNN, but with frozen pre-trained GloVe embeddings (300-d).
seed(2018)
conv_glove = Sequential()
# input_length must match the padded width of the data actually fed to the
# model. train_data and dev_data are both max_seq_len wide, so a hard-coded 50
# would make fit/predict fail on a shape mismatch.
conv_glove.add(Embedding(vocabulary_size, 300, input_length=max_seq_len, weights=[embedding_matrix], trainable=False))
conv_glove.add(Conv1D(64, 5, activation = 'relu'))
conv_glove.add(MaxPooling1D(4))
conv_glove.add(Flatten())
conv_glove.add(Dense(1, activation = 'sigmoid'))
sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
conv_glove.compile(loss = 'binary_crossentropy', optimizer = sgd, metrics = ['accuracy'])
# Train on train_data: the previously referenced `data` is not defined
# anywhere in the notebook and raises NameError on a fresh kernel.
conv_glove.fit(train_data, y_train, batch_size = 500, epochs = 8, verbose = 0)

In [None]:
# Score the dev set with the GloVe CNN and threshold the sigmoid output at 0.5.
pred = conv_glove.predict(dev_data)
# The builtin `int` replaces the removed `np.int` alias. ravel() flattens the
# (n_samples, 1) output without hard-coding the dev-set size.
pred_labels = (pred > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_dev,pred_labels)
print("Accuracy: ",accuracy)

In [None]:
# CNN followed by an LSTM, on top of the frozen GloVe embeddings.
# NOTE: this rebinds `conv_glove`, replacing the CNN-only model built earlier.
seed(2018)
conv_glove = Sequential()
# input_length must match the padded width of train_data/dev_data
# (max_seq_len); a hard-coded 50 does not match the data fed below.
conv_glove.add(Embedding(vocabulary_size, 300, input_length=max_seq_len, weights=[embedding_matrix], trainable=False))
conv_glove.add(Conv1D(64, 5, activation = 'relu'))
conv_glove.add(MaxPooling1D(4))
# The LSTM consumes the pooled feature sequence directly, so no Flatten here.
conv_glove.add(LSTM(100))
conv_glove.add(Dense(1, activation = 'sigmoid'))
sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
conv_glove.compile(loss = 'binary_crossentropy', optimizer = sgd, metrics = ['accuracy'])
# Train on train_data: `data` is never defined in the notebook (NameError on
# a fresh kernel).
conv_glove.fit(train_data, y_train, batch_size = 500, epochs = 8, verbose = 0)

In [None]:
# Score the dev set with the CNN+LSTM model and threshold the sigmoid output at 0.5.
pred = conv_glove.predict(dev_data)
# The builtin `int` replaces the removed `np.int` alias. ravel() flattens the
# (n_samples, 1) output without hard-coding the dev-set size.
pred_labels = (pred > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_dev,pred_labels)
print("Accuracy: ",accuracy)

In [None]:
# F1 of the positive (error) class for the most recent dev-set predictions.
cnn_glove_f1 = f1_score(y_true=y_dev, y_pred=pred_labels)
print("F1: ", cnn_glove_f1)

In [None]:
# Average precision summarises the precision-recall curve, so it must be
# computed from the model's continuous scores. Thresholded 0/1 labels collapse
# the curve to a single operating point and give a misleading value.
# `pred` holds the sigmoid outputs of the most recent predict() call;
# ravel() flattens (n_samples, 1) to (n_samples,).
average_precision = average_precision_score(y_dev, pred.ravel())

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))