In [1]:
from __future__ import print_function
import re
import string
import collections
import math
import numpy as np
import os
import nltk
import random
from nltk.tokenize import RegexpTokenizer
import json
import pandas as pd

from keras.preprocessing import sequence
from keras.constraints import maxnorm
from keras.models import model_from_json
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten, Activation, Merge, Highway
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
import keras.optimizers
from keras.regularizers import l2, l1
from sklearn.cross_validation import train_test_split, KFold

Using Theano backend.
Using gpu device 0: GeForce 840M (CNMeM is disabled, cuDNN Version is too old. Update to v5, was 2000.)


### Loading pretrained character embeddings

In [2]:
# Column names for the embeddings table: the character itself, followed by
# one column per embedding component, named X1 through X64.
header = ['Char'] + ['X' + str(component) for component in range(1, 65)]

In [4]:
# Load the pretrained character embeddings. Column names are supplied through
# `names` ('Char' followed by 'X1'..'X64'), so the CSV is presumably stored
# without a header row -- TODO confirm.
embeddings = pd.read_csv('char_embeddings_d150_tr1e6_w2_softmax_adagrad_spaces.csv', names=header)
# Filled in a later cell with one entry per character: character -> list of
# its 64 embedding components.
embeddings_dictionary = {}

In [8]:
# Turn the embeddings table into a plain dict: each row contributes one entry
# whose key is the row's character (decoded from UTF-8) and whose value is the
# list of that row's X1..X64 components, in column order.
for row in xrange(len(embeddings)):
    components = [embeddings['X' + str(col)][row] for col in xrange(1, 65)]
    char_key = unicode(embeddings['Char'][row], 'utf8')
    embeddings_dictionary[char_key] = components

# A literal space reuses the vector stored under the '_' key.
embeddings_dictionary[' '] = embeddings_dictionary['_']

class Embeddings_Reader(dict):
    """Character -> embedding-vector mapping with an unknown-character fallback.

    Looking up a character that has no entry returns the vector stored under
    the u'UNK' key of this same mapping. The missing key is not inserted.

    Raises:
        KeyError: if the requested key is absent and the mapping has no
            u'UNK' entry to fall back on.
    """

    def __missing__(self, key):
        # The fallback is read from this instance rather than from the
        # module-level dictionary, so the reader behaves the same for any
        # mapping it was constructed from.
        if u'UNK' not in self:
            # Without this guard a missing u'UNK' would re-enter __missing__.
            raise KeyError(key)
        return dict.__getitem__(self, u'UNK')
        
# Lookup table used by the feature-extraction cells: the entries of
# embeddings_dictionary wrapped in Embeddings_Reader, whose __missing__ hook
# supplies a vector for characters that have no entry.
embeddings_lookup = Embeddings_Reader(embeddings_dictionary)

### Loading training data and extracting features and labels

In [9]:
# Compiled once instead of on every call. Inside a character class '?', '!'
# and '.' are literal, so no escaping (and no invalid string escapes) is needed.
_STOP_CHAR_RE = re.compile(r'^[?!.]$')


def stops(char):
    """Tell whether `char` is a sentence-terminating punctuation mark.

    Args:
        char: the string to test; callers pass a single character.

    Returns:
        True if `char` is exactly '?', '!' or '.'; False for anything else,
        e.g. a letter, the empty string or '..'. (As before, `$` also
        tolerates one trailing newline after the mark.)
    """
    return _STOP_CHAR_RE.search(char) is not None

In [10]:
# Load the training corpus into a single 'sentence' column.
# '##%##' is presumably a marker that never occurs in the text, so that every
# line ends up whole in that one column -- TODO confirm.
# A separator longer than one character is only supported by pandas' python
# parsing engine; requesting it explicitly avoids the fallback warning that the
# default C engine emits before switching.
yandex_corpus = pd.read_csv('./1mcorpus/corpus.en_ru.1m.ru', sep='##%##', names=['sentence'],
                            engine='python')

  if __name__ == '__main__':


In [13]:
# Build one (label, features) sample for every stop character ('?', '!', '.')
# in the corpus. All sentences are pushed, character by character, through one
# fixed-size sliding window that persists across sentences; whenever the
# character in the centre of the window is a stop character, the `radius`
# characters on either side of it become the sample's features.
first_sentences = list(yandex_corpus['sentence'])
stops_data = collections.deque([])
# Counts characters pushed while the window is being filled for the first time.
pointer = 0
# Number of context characters taken on each side of the centre.
radius = 7
window_size = 2*radius+1
# With maxlen set, appending to a full deque drops its oldest entry.
sliding_window = collections.deque([], maxlen = window_size)
dot_features = []

for i in xrange(len(first_sentences)):
    
    # Index in `sentence` at which the main loop below resumes after warm-up.
    initial_pointer = 0    
    # Each sentence is prefixed with a space before entering the stream.
    sentence = [' '] + list(unicode (first_sentences[i], 'utf8'))    
    
    # Warm-up: runs only until the window holds window_size entries.
    if len(sliding_window) < window_size:
        for charnum in range(len(sentence)):
            # A stop character that is the last character of its sentence is
            # stored with a trailing '#', making it a two-character entry.
            # NOTE(review): `&` is the bitwise operator, so stops() is always
            # evaluated; for two booleans the result equals `and`.
            if (charnum == len(sentence) - 1) & stops(sentence[charnum]):
                sliding_window.append(sentence[charnum] + u'#')
            else:
                sliding_window.append(sentence[charnum])
            pointer += 1
            initial_pointer += 1
            if pointer == window_size:
                break
    
    # Sentence consumed without filling the window: move on to the next one.
    if pointer < window_size:
        continue
    
    for charnum in range (initial_pointer, len(sentence)):
        # Inspect the centre entry before shifting the window; [0] ignores a
        # possible '#' tag.
        if stops(sliding_window[radius][0]):                        
            # Features are the `radius` entries left and right of the centre;
            # the centre itself is excluded.
            dot_features = list(sliding_window)[:radius] + list(sliding_window)[-radius:]
            # Two-character centre == '#'-tagged stop at the end of a sentence.
            # Label 0: sentence-final stop; label 1: any other stop.
            if (len (sliding_window[radius]) == 2):
                label = 0
            else:                
                label = 1
            # x[0] drops any '#' tag so the lookup uses the bare character.
            # NOTE(review): relies on map() returning a list (Python 2).
            vec_features = map (lambda x: embeddings_lookup[x[0]], dot_features)            
            stops_data.append((label, vec_features))
        # Shift the window by one character, tagging a sentence-final stop.
        if (charnum == len(sentence) - 1) & stops(sentence[charnum]):
            sliding_window.append(sentence[charnum] + u'#')
        else:
            sliding_window.append(sentence[charnum])    
    # NOTE(review): the centre is checked before each append and nothing
    # flushes the window after the final sentence, so stop characters in the
    # last few characters of the stream never produce a sample.
    # Progress report every 100000 sentences.
    if i % 100000 == 0:        
        print('Iteration %d : Length of the data set is %d' % (i, len(stops_data)))        

Iteration 0 : Length of the data set is 0
Iteration 100000 : Length of the data set is 107473
Iteration 200000 : Length of the data set is 215137
Iteration 300000 : Length of the data set is 322575
Iteration 400000 : Length of the data set is 433742
Iteration 500000 : Length of the data set is 557080
Iteration 600000 : Length of the data set is 674213
Iteration 700000 : Length of the data set is 801662
Iteration 800000 : Length of the data set is 914944
Iteration 900000 : Length of the data set is 1029297


In [55]:
# Number of nonbreaking stop characters in the dataset (samples with label 1).
# stops_data is built as a collections.deque, where indexing by position is
# O(n) away from the ends; iterating over the samples directly keeps the count
# linear instead of quadratic.
counter = sum(1 for sample in stops_data if sample[0] == 1)
print (counter)

239398


### Initial Validation run with random train/test split

In [31]:
# Unzip the (label, features) samples into two parallel sequences.
labels, features = zip (*stops_data)

In [38]:
# Hold out 10% of the samples for validation; the fixed random_state makes the
# split repeatable.
data_train, data_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.10, random_state=42)

In [42]:
# Stack the per-sample feature lists into float32 arrays.
# Presumably shape (n_samples, 14, 64): 2*radius context characters, each a
# 64-component embedding -- TODO confirm.
X_train = np.array(data_train, dtype='float32')
X_test = np.array(data_test, dtype='float32')

# Labels keep numpy's inferred dtype (the samples carry integer 0/1 labels).
y_train = np.array(labels_train)
y_test = np.array(labels_test)

In [52]:
# Feed-forward classifier: the context window is flattened, passed through two
# ReLU hidden layers (40 and 10 units) and reduced to one sigmoid output.
model = Sequential()
for layer in (
    Flatten(input_shape = X_train[0].shape),
    Dense(40),
    Activation('relu'),
    Dense(10),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
):
    model.add(layer)

In [53]:
batch_size = 100
# Early stopping watches validation accuracy with a patience of 5 epochs.
stop = keras.callbacks.EarlyStopping(monitor='val_acc', patience=5, verbose=0, mode='auto')
# Binary cross-entropy pairs with the model's single sigmoid output; accuracy
# is tracked as an additional metric.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [54]:
# Train for at most 200 epochs; the early-stopping callback may end the run
# sooner.
# NOTE(review): the same held-out split is passed as validation_data (which
# drives early stopping) and to evaluate() below, so the reported score is not
# measured on data that was unseen during model selection.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=200, callbacks= [stop], shuffle=True,
          validation_data=(X_test, y_test))

# evaluate() yields the loss followed by the accuracy metric.
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print ('\n')
print('Validation score :', score)
print('Validation accuracy :', acc)

Train on 1028512 samples, validate on 114280 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200


Validation score : 0.0528505300126
Validation accuracy : 0.981484084664


Note: the early-stopping monitor should be changed to the validation loss (`val_loss`), and the patience should be decreased.

### 10-fold Cross-Validation

In [56]:
# Fold boundaries for 10-fold cross-validation: fold k covers
# stops_data[splitpoints[k]:splitpoints[k+1]].
# The boundaries are computed explicitly so that there are always exactly
# n_folds + 1 of them and the last one is len(stops_data). A strided range()
# produced only 9 folds when the length was a multiple of 10 and left the
# trailing len % 10 samples out (or spread them over extra folds) otherwise.
# `//` keeps the fold size an integer under both Python 2 and Python 3.
n_folds = 10
fold_size = len(stops_data) // n_folds
splitpoints = [fold * fold_size for fold in range(n_folds)] + [len(stops_data)]

In [72]:
# Materialise the samples as a list and shuffle them in place, then cut the
# shuffled list into consecutive folds. The first fold starts at index 0;
# every later fold starts where the previous one ended, and fold ends are the
# split points after the first.
stops_data = list (stops_data)
random.shuffle(stops_data)
fold_starts = [0] + list(splitpoints[1:-1])
fold_ends = splitpoints[1:]
batches = [stops_data[lo:hi] for lo, hi in zip(fold_starts, fold_ends)]

In [89]:
# Pair every fold with its complement: validation_test[i] is fold i itself and
# validation_training[i] is the concatenation, in fold order, of all the other
# folds.
indices = range (len(batches))
validation_test = [batches[i] for i in indices]
validation_training = [
    [sample for j in indices if j != i for sample in batches[j]]
    for i in indices
]

In [96]:
# Cross-validation: for each fold, train a fresh network on the other folds
# and record its accuracy on the held-out fold; the mean over folds is reported
# at the end.
# NOTE(review): the held-out fold is also passed as validation_data, so it
# drives early stopping as well as the reported accuracy.
cv_results = []
for i, (train_data, test_data) in enumerate(zip(validation_training, validation_test)):
    print ('Training step:', i, '...')

    # Split the (label, features) samples into parallel sequences.
    labels_train, features_train = zip (*train_data)
    labels_test, features_test = zip (*test_data)

    X_train, y_train = np.array(features_train, dtype='float32'), np.array(labels_train)
    X_test, y_test = np.array(features_test, dtype='float32'), np.array(labels_test)

    batch_size = 100
    stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

    # Same architecture as the initial validation run, rebuilt from scratch so
    # that no weights carry over between folds.
    model = Sequential()
    for layer in (
        Flatten(input_shape = X_train[0].shape),
        Dense(40),
        Activation('relu'),
        Dense(10),
        Activation('relu'),
        Dense(1),
        Activation('sigmoid'),
    ):
        model.add(layer)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, y_train,
              batch_size=batch_size,
              nb_epoch=200,
              callbacks= [stop],
              shuffle=True,
              verbose=0,
              validation_data=(X_test, y_test))

    print ('Done.')

    # evaluate() yields (loss, accuracy); only the accuracy is kept.
    fold_metrics = model.evaluate(X_test, y_test, verbose=0, batch_size=batch_size)
    _, acc = fold_metrics

    cv_results.append(acc)
    print ('Step', i, 'accuracy:', acc)
    print ('-----------------------------------\n')
    del model

cross_val = np.mean(cv_results)
print ('10-Fold Cross-Validation accuracy is:', cross_val)

Training step: 0 ...
Done.
Step 0 accuracy: 0.981728937187
-----------------------------------

Training step: 1 ...
Done.
Step 1 accuracy: 0.981466421936
-----------------------------------

Training step: 2 ...
Done.
Step 2 accuracy: 0.981947700092
-----------------------------------

Training step: 3 ...
Done.
Step 3 accuracy: 0.982726496423
-----------------------------------

Training step: 4 ...
Done.
Step 4 accuracy: 0.981501424507
-----------------------------------

Training step: 5 ...
Done.
Step 5 accuracy: 0.981562678376
-----------------------------------

Training step: 6 ...
Done.
Step 6 accuracy: 0.982280219482
-----------------------------------

Training step: 7 ...
Done.
Step 7 accuracy: 0.982096458585
-----------------------------------

Training step: 8 ...
Done.
Step 8 accuracy: 0.981396418083
-----------------------------------

Training step: 9 ...
Done.
Step 9 accuracy: 0.981720186765
-----------------------------------

10-Fold Cross-Validation accuracy is: 0.

### Final model training

In [97]:
# Final training set-up: reshuffle all samples in place and keep only 1% of
# them aside for validation, so almost all data is used for training.
random.shuffle(stops_data)
labels, features = zip (*stops_data)
data_train, data_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.01, random_state=42)
# Features become float32 arrays; labels keep numpy's inferred dtype.
X_train = np.array(data_train, dtype='float32')
X_test = np.array(data_test, dtype='float32')

y_train = np.array(labels_train)
y_test = np.array(labels_test)

In [98]:
# Final model: flattened context window, two ReLU hidden layers (40 and 10
# units) and a single sigmoid output unit.
model = Sequential()
for layer in (
    Flatten(input_shape = X_train[0].shape),
    Dense(40),
    Activation('relu'),
    Dense(10),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
):
    model.add(layer)

In [99]:
batch_size = 100
# Early stopping watches the validation loss with a patience of 3 epochs.
stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
# Binary cross-entropy pairs with the model's single sigmoid output; accuracy
# is tracked as an additional metric.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [100]:
# Train the final model for at most 200 epochs; early stopping on the small
# validation split may end the run sooner.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=200, callbacks= [stop], shuffle=True,
          validation_data=(X_test, y_test))

# evaluate() yields the loss followed by the accuracy metric; the split scored
# here is the same one used as validation_data above.
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print ('\n')
print('Validation score :', score)
print('Validation accuracy :', acc)

Train on 1131364 samples, validate on 11428 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200


Validation score : 0.0473378441138
Validation accuracy : 0.982324126282


In [101]:
# Persist the trained model under a common base name: the architecture as
# JSON and the weights as a separate HDF5 file.
json_string = model.to_json()
name_ = './Models/Keras_boundary_nn_model_r7_l40_l10_l1'
model_name = name_ + '.json'
# The context manager flushes and closes the file deterministically instead of
# leaving the handle to the garbage collector.
with open (model_name, 'w') as model_file:
    model_file.write(json_string)
weights_name = name_ + '_weights.h5'
model.save_weights(weights_name)

### Testing on the Opencorpora data

In [102]:
# Load the OpenCorpora evaluation data; the following cell reads its
# 'sentence' column.
corpus = pd.read_csv('opencorpora.csv')

In [104]:
# Build (label, features) samples from the OpenCorpora sentences with the same
# sliding-window procedure that produced the training data: the sentences form
# one continuous character stream, and every stop character ('?', '!', '.')
# that reaches the centre of the window yields one sample.
first_sentences = list(corpus['sentence'])
stops_opencorp_data = collections.deque([])
# Counts characters pushed while the window is being filled for the first time.
pointer = 0
# Number of context characters taken on each side of the centre.
radius = 7
window_size = 2*radius+1
# With maxlen set, appending to a full deque drops its oldest entry.
sliding_window = collections.deque([], maxlen = window_size)
dot_features = []

# NOTE(review): the range stops at len(first_sentences)-1, so the last row is
# never processed, unlike the otherwise identical training-data loop -- confirm
# this is intentional.
for i in xrange(len(first_sentences)-1):
    
    # Index in `sentence` at which the main loop below resumes after warm-up.
    initial_pointer = 0    
    # Each sentence is prefixed with a space before entering the stream.
    sentence = [' '] + list(unicode (first_sentences[i], 'utf8'))    
    
    # Warm-up: runs only until the window holds window_size entries.
    if len(sliding_window) < window_size:
        for charnum in range(len(sentence)):
            # A stop character that is the last character of its sentence is
            # stored with a trailing '#', making it a two-character entry.
            if (charnum == len(sentence) - 1) & stops(sentence[charnum]):
                sliding_window.append(sentence[charnum] + u'#')
            else:
                sliding_window.append(sentence[charnum])
            pointer += 1
            initial_pointer += 1
            if pointer == window_size:
                break
    
    # Sentence consumed without filling the window: move on to the next one.
    if pointer < window_size:
        continue
    
    for charnum in range (initial_pointer, len(sentence)):
        # Inspect the centre entry before shifting the window; [0] ignores a
        # possible '#' tag.
        if stops(sliding_window[radius][0]):
            # Features are the `radius` entries left and right of the centre;
            # the centre itself is excluded.
            dot_features = list(sliding_window)[:radius] + list(sliding_window)[-radius:]
            # Label 0: '#'-tagged (sentence-final) stop; label 1: any other stop.
            if (len (sliding_window[radius]) == 2):
                label = 0
            else:
                label = 1
            # x[0] drops any '#' tag so the lookup uses the bare character.
            # NOTE(review): relies on map() returning a list (Python 2).
            vec_features = map (lambda x: embeddings_lookup[x[0]], dot_features)                            
            stops_opencorp_data.append((label, vec_features))
        # Shift the window by one character, tagging a sentence-final stop.
        if (charnum == len(sentence) - 1) & stops(sentence[charnum]):
            sliding_window.append(sentence[charnum] + u'#')
        else:
            sliding_window.append(sentence[charnum])    
    # Progress report every 10000 sentences.
    if i % 10000 == 0:
        print('Iteration %d : Length of the data set is %d' % (i, len(stops_opencorp_data)))

Iteration 0 : Length of the data set is 0
Iteration 10000 : Length of the data set is 10148
Iteration 20000 : Length of the data set is 20753
Iteration 30000 : Length of the data set is 31276
Iteration 40000 : Length of the data set is 42022
Iteration 50000 : Length of the data set is 52769
Iteration 60000 : Length of the data set is 63280
Iteration 70000 : Length of the data set is 74780
Iteration 80000 : Length of the data set is 85107
Iteration 90000 : Length of the data set is 95527


In [106]:
# Number of samples with label 1 (stop characters that do not end a sentence)
# in the OpenCorpora set. The samples are iterated directly because
# stops_opencorp_data is a collections.deque, where positional indexing is
# O(n) away from the ends.
counter = sum(1 for sample in stops_opencorp_data if sample[0] == 1)
print (counter)

18750


In [107]:
# Unzip the OpenCorpora (label, features) samples into two parallel sequences.
labels_op, features_op = zip (*stops_opencorp_data)

In [108]:
# Features as a float32 array, matching the dtype used for the training data.
X_test_op = np.array(features_op, dtype='float32')

# Labels keep numpy's inferred dtype.
y_test_op = np.array(labels_op)

In [111]:
# Score the trained model on the OpenCorpora samples; evaluate() yields the
# loss followed by the accuracy metric.
score_op, acc_op = model.evaluate(X_test_op, y_test_op, batch_size=1000)
print ('\n')
print('Test score :', score_op)
print('Test accuracy :', acc_op)



Test score : 0.129229684986
Test accuracy : 0.961725388473


In [2]:
# Rebuild the saved model: architecture from the JSON file, then the weights
# from the HDF5 file.
# NOTE(review): these absolute paths are machine-specific, while the model was
# saved under the relative './Models/' directory -- consider a configurable
# base directory.
model_json_path = '/home/mithfin/anaconda2/docs/Wikiproject/Models/Keras_boundary_nn_model_r7_l40_l10_l1.json'
# The context manager closes the file as soon as its contents have been read.
with open(model_json_path) as model_file:
    model = model_from_json(model_file.read())
model.load_weights('/home/mithfin/anaconda2/docs/Wikiproject/Models/Keras_boundary_nn_model_r7_l40_l10_l1_weights.h5')

In [3]:
# Compile the reloaded model with the same loss, optimizer and metric that
# were used when it was trained.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

14