In [1]:
import numpy as np 
import pandas as pd
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import BatchNormalization, add, merge, Concatenate, Dropout, Conv1D, MaxPooling1D, LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Flatten, Dropout
from sklearn.model_selection import train_test_split
from keras.metrics import categorical_accuracy
from keras import backend as K
from keras.regularizers import l2
import tensorflow as tf


Using TensorFlow backend.


In [33]:
# Colab-only setup: mount Google Drive so the CSV files stored there can be read.
import pandas as pd
import io

from google.colab import drive 
# Makes the Drive contents available under /content/gdrive.
drive.mount('/content/gdrive')

import pandas as pd 
# The path below is relative, so this read depends on the kernel's working
# directory containing the `gdrive` mount point.
# NOTE(review): presumably the working directory is /content - confirm.
# NOTE(review): `df` is not referenced by any other visible cell; the same file is
# loaded again as `train_df` further down.
df=pd.read_csv('gdrive/My Drive/train.csv')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# The custom accuracy metric used for this task
def accuracy(y_true, y_pred):
    """Per-position accuracy that skips positions whose true class is 0.

    Both arguments are reduced to class ids with an argmax over the last axis.
    Positions where the true class id is 0 are masked out before comparing.
    (Class 0 is what zero-padding of the targets turns into after one-hot
    encoding.)

    Args:
        y_true: one-hot encoded targets, classes on the last axis.
        y_pred: predicted class scores, same layout as ``y_true``.

    Returns:
        A flat tensor of 0.0 / 1.0 values (dtype ``K.floatx()``), one entry per
        unmasked position. No averaging is done here.
    """
    true_labels = tf.argmax(y_true, axis=-1)
    pred_labels = tf.argmax(y_pred, axis=-1)

    # True wherever the target is a real label rather than class 0.
    keep = tf.greater(true_labels, 0)

    hits = K.equal(
        tf.boolean_mask(true_labels, keep),
        tf.boolean_mask(pred_labels, keep),
    )
    return K.cast(hits, K.floatx())

# Decodes a one-hot (or per-class score) encoded sequence back into a string
def onehot_to_seq(oh_seq, index):
    """Turn a sequence of per-position class vectors into a string.

    Each element of ``oh_seq`` is reduced to a class id with ``np.argmax`` and
    looked up in ``index``. Decoding stops at the first position whose class id
    is 0; nothing after that position is decoded.

    Args:
        oh_seq: iterable of per-position class vectors (one-hot or scores).
        index: mapping from class id to the string emitted for that class.

    Returns:
        The concatenation of the looked-up strings, ``''`` if the first
        position already decodes to class 0 or ``oh_seq`` is empty.
    """
    pieces = []
    for step in oh_seq:
        label = np.argmax(step)
        if label == 0:
            # Class 0 terminates the sequence.
            break
        pieces.append(index[label])
    return ''.join(pieces)

# prints the results
def print_results(x, y_, revsere_decoder_index):
    """Print the decoded, upper-cased prediction for one sample.

    Args:
        x: the input belonging to the prediction; accepted but not used.
        y_: per-position class vectors predicted for the sample.
        revsere_decoder_index: mapping from class id to label character.
    """
    decoded = onehot_to_seq(y_, revsere_decoder_index)
    print(str(decoded.upper()))

# Computes and returns the n-grams of a particular sequence, defaults to trigrams
def seq2ngrams(seqs, n = 3):
    """Split every sequence into overlapping n-grams, one per start position.

    For a sequence of length L the result has exactly L entries: the slice
    ``seq[i : i + n]`` for every ``i`` in ``range(L)``. The last ``n - 1``
    entries are therefore shorter than ``n``.

    Args:
        seqs: iterable of sliceable sequences (e.g. strings).
        n: n-gram length, defaults to 3.

    Returns:
        A 1-D ``numpy`` array of dtype ``object`` with one Python list of
        n-grams per input sequence.
    """
    grams = [[seq[i : i + n] for i in range(len(seq))] for seq in seqs]

    # Sequences generally differ in length, so the nested list is ragged.
    # np.array() on ragged input raises ValueError on recent NumPy releases
    # (and only warned before that), so fill an object array explicitly.
    # Assigning element by element also keeps each entry a plain list when all
    # sequences happen to share one length.
    out = np.empty(len(grams), dtype = object)
    for k, gram_list in enumerate(grams):
        out[k] = gram_list
    return out

In [0]:
# Load the training and test tables from the mounted Drive folder.
# Later cells read the columns 'input', 'expected' and 'len' from train_df and
# 'input' and 'id' from test_df.
train_df = pd.read_csv('gdrive/My Drive/train.csv')
test_df = pd.read_csv('gdrive/My Drive/test.csv')

In [0]:
# Maximum sequence length: longer training rows are dropped, and every encoded
# input / target is padded to exactly this many positions.
maxlen_seq = 700

# Loading the training pairs and splitting each input into n-grams.
# Rows whose 'len' column exceeds maxlen_seq are filtered out first.
# n is 1 here, i.e. every character becomes its own token.
train_input_seqs, train_target_seqs = train_df[['input', 'expected']][(train_df.len <= maxlen_seq)].values.T
train_input_grams = seq2ngrams(train_input_seqs,1) # n-gram size: 1

# Same for test (no length filter is applied to the test rows)
test_input_seqs = test_df['input'].values.T
test_input_grams = seq2ngrams(test_input_seqs,1) # n-gram size: 1

# Initializing and defining the tokenizer encoders and decoders based on the train set
# The encoder works on the n-gram lists, the decoder on the characters of the targets.
tokenizer_encoder = Tokenizer()
tokenizer_encoder.fit_on_texts(train_input_grams)
tokenizer_decoder = Tokenizer(char_level = True)
tokenizer_decoder.fit_on_texts(train_target_seqs)

# Using the tokenizers to turn the sequences into padded integer arrays for training
# Inputs
train_input_data = tokenizer_encoder.texts_to_sequences(train_input_grams)
train_input_data = sequence.pad_sequences(train_input_data, maxlen = maxlen_seq, padding = 'post')

# Targets
# Padded with zeros at the end, then one-hot encoded, so padding becomes class 0.
train_target_data = tokenizer_decoder.texts_to_sequences(train_target_seqs)
train_target_data = sequence.pad_sequences(train_target_data, maxlen = maxlen_seq, padding = 'post')
train_target_data = to_categorical(train_target_data)

# Use the same tokenizer defined on train for tokenization of test
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre'), so a test sequence longer than maxlen_seq would lose its
# first tokens rather than its last - confirm that this is intended.
test_input_data = tokenizer_encoder.texts_to_sequences(test_input_grams)
test_input_data = sequence.pad_sequences(test_input_data, maxlen = maxlen_seq, padding = 'post')

# Computing the number of words and number of tags to be passed as parameters to the keras model
# The + 1 accounts for index 0, which the tokenizers never assign and which is used for padding.
n_words = len(tokenizer_encoder.word_index) + 1
n_tags = len(tokenizer_decoder.word_index) + 1

# Model 

In [6]:
# Network overview: a shared embedding feeds three parallel branches
#   d: a bidirectional LSTM directly on the embedding,
#   f: three stacked convolution blocks (kernel sizes 11 -> 7 -> 3) followed by a bidirectional LSTM,
#   e: three parallel convolution blocks (kernel sizes 3, 7, 11) summed, followed by a bidirectional LSTM.
# The three branch outputs are summed, passed through one more bidirectional
# LSTM and classified per position.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the notebook.
input = Input(shape = (maxlen_seq,)) 

# Token ids -> 128-dimensional vectors
x = Embedding(input_dim = n_words, output_dim = 128, input_length = maxlen_seq)(input)

# dropout on the embedding output
x=Dropout(0.1)(x)

# branch d: recurrent layer straight from the embedding
d = Bidirectional(LSTM(units = 128, return_sequences = True, recurrent_dropout = 0.1))(x)

# branch f ("simplest model"): sequential conv blocks, each conv -> pooling -> batch norm -> dropout
# Pooling uses stride 1 with 'same' padding, so the sequence length is not reduced.
A = Conv1D(64, kernel_size=11, strides=1, padding='same', activation='relu', kernel_regularizer=l2(0.001))(x)
A = MaxPooling1D(pool_size=2, strides=1, padding='same')(A)
A= BatchNormalization()(A)

# dropout on the block output
A= Dropout(0.1)(A)

B = Conv1D(64, kernel_size=7, strides=1, padding='same', activation='relu', kernel_regularizer=l2(0.001))(A)
B = MaxPooling1D(pool_size=2, strides=1, padding='same')(B)
B= BatchNormalization()(B)

# dropout on the block output
B= Dropout(0.1)(B)

C = Conv1D(64, kernel_size=3, strides=1, padding='same', activation='relu', kernel_regularizer=l2(0.001))(B)
C = MaxPooling1D(pool_size=2, strides=1, padding='same')(C)
C= BatchNormalization()(C)

# dropout on the block output
C=Dropout(0.1)(C)

f=Bidirectional(LSTM(units = 128, return_sequences = True, recurrent_dropout = 0.1))(C)


# branch e ("complex model"): three conv blocks applied in parallel to the embedding
a = Conv1D(64, kernel_size=3, strides=1, padding='same', activation='relu', kernel_regularizer=l2(0.001))(x)
a = MaxPooling1D(pool_size=2, strides=1, padding='same')(a)
a= BatchNormalization()(a)

b = Conv1D(64, kernel_size=7, strides=1, padding='same', activation='relu', kernel_regularizer=l2(0.001))(x)
b = MaxPooling1D(pool_size=2, strides=1, padding='same')(b)
b= BatchNormalization()(b)

c = Conv1D(64, kernel_size=11, strides=1, padding='same', activation='relu', kernel_regularizer=l2(0.001))(x)
c = MaxPooling1D(pool_size=2, strides=1, padding='same')(c)
c= BatchNormalization()(c)


# element-wise sum of the three parallel conv outputs
e = add([a,b,c])

e= Dropout(0.1)(e)
e = Bidirectional(LSTM(units = 128, return_sequences = True, recurrent_dropout = 0.1))(e)


# dropout on each branch output before merging
d= Dropout(0.1)(d)

e= Dropout(0.1)(e)

f= Dropout(0.1)(f)

# merge the three branches by element-wise sum, then one more recurrent layer
g = add([d,e,f])
h = Bidirectional(LSTM(units = 128, return_sequences = True, recurrent_dropout = 0.1))(g)

h= Dropout (0.25)(h)
# per-position dense layer, applied independently at every time step
i = TimeDistributed(Dense((256), activation='relu', kernel_regularizer=l2(0.001)))(h)

# per-position softmax over the n_tags classes (class 0 is the padding class)
output = TimeDistributed(Dense(n_tags, activation = "softmax"))(i)

model = Model(inputs=[input], outputs=[output])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 700)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 700, 128)     2816        input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 700, 128)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 700, 64)      90176       dropout_1[0][0]                  
__________________________________________________________________________________________________
max_poolin

In [0]:
# Setting up the model with categorical cross-entropy loss and two metrics:
# Keras' built-in "accuracy" and the custom `accuracy` function defined above.
# The metrics_names output further down lists them as 'acc' and 'accuracy'
# respectively, so 'accuracy' / 'val_accuracy' refer to the custom metric.
model.compile(optimizer = "nadam", loss = "categorical_crossentropy", metrics = ["accuracy", accuracy])

# Training

In [11]:
filepath="weights_model3_128_LSTM_1gram.best.hdf5"

# Keep only the weights of the epoch with the highest 'val_accuracy'. With the
# metric names shown by model.metrics_names in this notebook, that is the custom
# accuracy function; the built-in one is reported as 'acc'.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Stop when the monitored value (EarlyStopping's default monitor) has not
# improved for 10 epochs. The callback used to be created but never handed to
# fit(), so it had no effect; it is now part of the callback list.
early_stopping_monitor = EarlyStopping(patience=10)
callbacks_list = [checkpoint, early_stopping_monitor]

# Splitting the data for train and validation sets (fixed seed, 10% validation)
X_train, X_val, y_train, y_val = train_test_split(train_input_data, train_target_data, test_size = .1, random_state = 0)

# Training the model on the training data and validating using the validation set
model.fit(X_train, y_train, batch_size = 210, epochs = 50, validation_data = (X_val, y_val), callbacks=callbacks_list, verbose = 1)


Train on 5072 samples, validate on 564 samples
Epoch 1/50

Epoch 00001: val_accuracy improved from -inf to 0.52239, saving model to weights_model3_128_LSTM_1gram.best.hdf5
Epoch 2/50

Epoch 00002: val_accuracy improved from 0.52239 to 0.59697, saving model to weights_model3_128_LSTM_1gram.best.hdf5
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.59697
Epoch 4/50

Epoch 00004: val_accuracy did not improve from 0.59697
Epoch 5/50

Epoch 00005: val_accuracy did not improve from 0.59697
Epoch 6/50

Epoch 00006: val_accuracy did not improve from 0.59697
Epoch 7/50

Epoch 00007: val_accuracy did not improve from 0.59697
Epoch 8/50

Epoch 00008: val_accuracy did not improve from 0.59697
Epoch 9/50

Epoch 00009: val_accuracy did not improve from 0.59697
Epoch 10/50

Epoch 00010: val_accuracy did not improve from 0.59697
Epoch 11/50

Epoch 00011: val_accuracy did not improve from 0.59697
Epoch 12/50

Epoch 00012: val_accuracy improved from 0.59697 to 0.59743, saving model to weight

<keras.callbacks.History at 0x7f742450f668>

In [0]:
# `files` is needed for the download at the end of this cell. It used to be
# imported only in a later cell, which made this cell fail with a NameError on a
# fresh kernel.
from google.colab import files

# The existing `model` object is compiled again and trained further; its current
# weights are kept, so this run continues from whatever was trained before.
model.compile(optimizer = "nadam", loss = "categorical_crossentropy", metrics = ["accuracy", accuracy])


filepath="weights_model_128_LSTM_1gram.best.hdf5"

# Keep only the weights of the epoch with the highest 'val_accuracy' (the custom
# accuracy function, given the metric names shown by model.metrics_names).
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Stop when the monitored value (EarlyStopping's default monitor) has not
# improved for 10 epochs. The callback used to be created but never handed to
# fit(), so it had no effect; it is now part of the callback list.
early_stopping_monitor = EarlyStopping(patience=10)
callbacks_list = [checkpoint, early_stopping_monitor]

# Splitting the data for train and validation sets (fixed seed, 10% validation)
X_train, X_val, y_train, y_val = train_test_split(train_input_data, train_target_data, test_size = .1, random_state = 0)
model.fit(X_train, y_train, batch_size = 210, epochs = 50, validation_data = (X_val, y_val), callbacks=callbacks_list, verbose = 1)

# Send the best checkpoint to the browser as a download.
files.download(filepath)

Train on 5072 samples, validate on 564 samples
Epoch 1/50

Epoch 00001: val_accuracy improved from -inf to 0.58211, saving model to weights_model_128_LSTM_1gram.best.hdf5
Epoch 2/50

Epoch 00002: val_accuracy did not improve from 0.58211
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.58211
Epoch 4/50

Epoch 00004: val_accuracy did not improve from 0.58211
Epoch 5/50

Epoch 00005: val_accuracy did not improve from 0.58211
Epoch 6/50

Epoch 00006: val_accuracy improved from 0.58211 to 0.58832, saving model to weights_model_128_LSTM_1gram.best.hdf5
Epoch 7/50

Epoch 00007: val_accuracy did not improve from 0.58832
Epoch 8/50

Epoch 00008: val_accuracy did not improve from 0.58832
Epoch 9/50

Epoch 00009: val_accuracy did not improve from 0.58832
Epoch 10/50

Epoch 00010: val_accuracy did not improve from 0.58832
Epoch 11/50

Epoch 00011: val_accuracy improved from 0.58832 to 0.58896, saving model to weights_model_128_LSTM_1gram.best.hdf5
Epoch 12/50

Epoch 00012: val_accurac

In [11]:
from google.colab import files

# The existing `model` object is compiled again and trained further; its current
# weights are kept, so this run continues from whatever was trained before.
model.compile(optimizer = "nadam", loss = "categorical_crossentropy", metrics = ["accuracy", accuracy])


filepath="weights_model3_128_LSTM_1gram.best_one.hdf5"

# Keep only the weights of the epoch with the highest 'val_accuracy' (the custom
# accuracy function, given the metric names shown by model.metrics_names).
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Stop when the monitored value (EarlyStopping's default monitor) has not
# improved for 10 epochs. The callback used to be created but never handed to
# fit(), so it had no effect; it is now part of the callback list.
early_stopping_monitor = EarlyStopping(patience=10)
callbacks_list = [checkpoint, early_stopping_monitor]

# Splitting the data for train and validation sets (fixed seed, 10% validation)
X_train, X_val, y_train, y_val = train_test_split(train_input_data, train_target_data, test_size = .1, random_state = 0)
model.fit(X_train, y_train, batch_size = 210, epochs = 50, validation_data = (X_val, y_val), callbacks=callbacks_list, verbose = 1)

# Send the best checkpoint to the browser as a download. The recorded run of
# this cell ended with a MessageError at this point; the checkpoint file itself
# has already been written to the runtime's disk by the ModelCheckpoint callback.
files.download(filepath)

Train on 5072 samples, validate on 564 samples
Epoch 1/50

Epoch 00001: val_accuracy improved from -inf to 0.57903, saving model to weights_model3_128_LSTM_1gram.best_one.hdf5
Epoch 2/50

Epoch 00002: val_accuracy did not improve from 0.57903
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.57903
Epoch 4/50

Epoch 00004: val_accuracy improved from 0.57903 to 0.58302, saving model to weights_model3_128_LSTM_1gram.best_one.hdf5
Epoch 5/50

Epoch 00005: val_accuracy improved from 0.58302 to 0.59244, saving model to weights_model3_128_LSTM_1gram.best_one.hdf5
Epoch 6/50

Epoch 00006: val_accuracy did not improve from 0.59244
Epoch 7/50

Epoch 00007: val_accuracy did not improve from 0.59244
Epoch 8/50

Epoch 00008: val_accuracy did not improve from 0.59244
Epoch 9/50

Epoch 00009: val_accuracy did not improve from 0.59244
Epoch 10/50

Epoch 00010: val_accuracy did not improve from 0.59244
Epoch 11/50

Epoch 00011: val_accuracy did not improve from 0.59244
Epoch 12/50

Epoch 000

MessageError: ignored

# Update to the best model

In [0]:
# Replace the in-memory weights with a checkpoint stored on the mounted Drive.
# NOTE(review): none of the visible training cells writes a file with this
# ".final" name (they save "*.best.hdf5" / "*.best_one.hdf5"), so it was
# presumably copied to Drive by hand - confirm it exists before running.
model.load_weights("gdrive/My Drive/weights_model_128_LSTM_1gram.final.hdf5")

# Output the result 

In [0]:
# Inverse vocabularies (integer id -> token), so that class ids and encoded
# inputs can be mapped back to their characters.
revsere_decoder_index = dict(zip(tokenizer_decoder.word_index.values(), tokenizer_decoder.word_index.keys()))
revsere_encoder_index = dict(zip(tokenizer_encoder.word_index.values(), tokenizer_encoder.word_index.keys()))

In [25]:
# Predict the per-position class scores for every test sequence and print the
# decoded label string of each one.
y_test_pred = model.predict(test_input_data)
print(len(test_input_data))
for i in range(len(test_input_data)):
    print_results(test_input_seqs[i], y_test_pred[i], revsere_decoder_index)

# The kaggle_csv(...) call that used to end this cell has been dropped: the
# function is defined further down, so calling it here raised a NameError on a
# fresh kernel. The dedicated cell after its definition still writes the CSV.

119
----EEEEEE-GGG-TEEE-TTSEEEEEEEETTTEEEEEETTTEEEEE-TT-TT----EEEEHHHHHHHHHTT--HHHHHHHH-TTSEEEEEETT---S-HHHHHEEETTEEEETTEEEEEEHHHH-HEEEETTTEEEEEEE-HHHHHHHHHHHHHHHTT---HHHHHHHHHHHHHHHHHHHHHHHHH-
-EEEEEEEE--TTHHHHHHHHHHHHHHHHHHHHHTT-EEEEEEEEETTTEEEEEE-TT-EEEEEE-SHHHHHHHHHHSHHHHHHHHHHTT-EEEEEEEEE---
-EEEEETTT-EEEEEE-TTT----SEEEEE-HHHHHHHH----GGEEEEEES----EEEEEEEEEEEEEEEEEE-----EEEEEEEEETEEEETT-EEEEEEEEEEEGEEEEEEEEEETTTEEEEEEEEETTEEE-
--EEEEE--HHHHHHHHHTT--EEEEEE--GTTT--HHHHHHHHHHHH----EE--EEE---TTT--HHHHHHHHHHHHHHHHHT--EEEEEEEE--STT-HHHHHHHHHHHSSEEEEEE--TTTTS-HHHHHHHHHHTT-EEEEEES---THHHHHHHHHHHHHHTTS-EEEEEETT-TTHHHHHHHTT-EEEEE-----------G----EEEEEHHHHHHHHHHHHHT----EEEEE-SSHHHHHHHHHTT-SEEEEE---TTTT--HHHHHHHHHHHH--------EEE---TTT--HHHHHHHHHHHHHHHHTT--EEEEEEEE--S-T-HHHHHHHHHHHSSEEEEEE--TTTTS-HHHHHHHHHHTT-EEEEEES---THHHHHHHHHHHHHHTTS-EEEEEETT-TTHHHHHHHTT-EEEEE-----------GGGHHHH---HHHHHHHHHHHHHHHHH-
-EEEEEEETHHHHHHHHHHHHTT-EEEEEESSS-HHHHHHHHTT---TTTEEEETTEEEE-HHHHHHHHTT-SEEEEEE-TTTTHHHHHHHGGGH

# Accuracy Checking on the entire data set: after 47 epochs

In [26]:
# Labels for the values returned by model.evaluate() in the cells below, in the same order.
model.metrics_names

['loss', 'acc', 'accuracy']

In [37]:
# Loss and metrics on the training split (ordered as in model.metrics_names).
model.evaluate(X_train, y_train, batch_size=512)



[0.3048409574227378, 0.8947848136116654, 0.753533956004242]

In [38]:
# Loss and metrics on the validation split (ordered as in model.metrics_names).
model.evaluate(X_val, y_val, batch_size=512)



[0.588876522179191, 0.8264437508075795, 0.5948492833908569]

In [39]:
# Loss and metrics on all encoded training data, i.e. the training and validation
# splits together - this is not a held-out score.
model.evaluate(train_input_data, train_target_data, batch_size=512)



[0.3332646773418186, 0.8879458543423476, 0.737693671472873]

# Generate the csv file

In [0]:
def give_results(x, y_, revsere_decoder_index):
    """Return the decoded, upper-cased prediction for one sample.

    Args:
        x: the input belonging to the prediction; accepted but not used.
        y_: per-position class vectors predicted for the sample.
        revsere_decoder_index: mapping from class id to label character.

    Returns:
        The decoded label string in upper case.
    """
    decoded = onehot_to_seq(y_, revsere_decoder_index)
    return str(decoded.upper())

In [0]:
def kaggle_csv(model, test_input_data, test_df, out_path='kaggle_results_model_end.csv'):
    """Predict all test sequences and write them to a two-column CSV.

    The file has the columns ``id`` (copied from ``test_df``) and ``expected``
    (the decoded, upper-cased prediction), and is written without the index.

    Args:
        model: object with a ``predict`` method returning one array of
            per-position class vectors per row of ``test_input_data``.
        test_input_data: encoded test inputs, in the same row order as
            ``test_df``.
        test_df: DataFrame with the columns ``id`` and ``input``.
        out_path: destination of the CSV file. Defaults to the file name that
            was previously hard-coded.

    Note:
        The class-id -> character mapping is still read from the notebook-level
        ``revsere_decoder_index``, which must exist before this is called.
    """
    y_test_pred = model.predict(test_input_data)

    # Raw input strings taken from the frame that was passed in, instead of the
    # notebook-level `test_input_seqs` (which is built from this same column).
    input_seqs = test_df['input'].values

    expecteds = [
        give_results(input_seqs[i], y_test_pred[i], revsere_decoder_index)
        for i in range(len(test_input_data))
    ]

    # Build the frame in one step; .values keeps the pairing purely positional.
    submission = pd.DataFrame(
        {'id': test_df['id'].values, 'expected': expecteds},
        columns=['id', 'expected'],
    )
    submission.to_csv(out_path, header=True, index=False)
    return

In [0]:
# Write the predictions for the test set to 'kaggle_results_model_end.csv' in the working directory.
kaggle_csv(model,test_input_data,test_df)

# Upload the results CSV to Google Drive

In [31]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once in a notebook.
# NOTE(review): the install is not version-pinned, so reruns may pick up a different release.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds the name `drive`, which the mount cell at the top
# uses for the google.colab `drive` module. That cell re-imports the module
# before using it, but any other use of `drive.mount` after this point would hit
# the PyDrive client instead.
drive = GoogleDrive(gauth)

# Create & upload a file.
# The local CSV written by kaggle_csv() is uploaded under the same title.
uploaded = drive.CreateFile({'title': 'kaggle_results_model_end.csv'})
uploaded.SetContentFile('kaggle_results_model_end.csv')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

Uploaded file with ID 1P7UnhQGtaoUOkjD7bprPlRH9CVDTZbwB


# Draw the model

In [0]:
# Render the layer graph of `model` to an image file in the working directory.
from keras.utils import plot_model
plot_model(model, to_file='model.png')