In [3]:
import os
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,Conv2D,MaxPooling2D
from keras.layers import Input, Embedding, Add
from keras import layers
from keras import layers
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.utils import to_categorical
# NLTK
import nltk
from gensim.models import FastText
from gensim.models import Word2Vec
# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn import svm
from keras.layers.merge import concatenate
from keras.layers import Bidirectional
from keras.layers import K, Activation
from keras.engine import Layer
from keras.layers import Dense, Input, Embedding, Dropout, Bidirectional, GRU, Flatten, SpatialDropout1D

In [4]:
# Hyperparameters for the capsule models defined in this notebook.
# NOTE(review): in the cells visible here only Num_capsule and Dim_capsule are
# read (inside define_model). gru_len, dropout_p and rate_drop_dense are not
# referenced, and define_model passes routings=3 literally instead of Routings.
gru_len = 128
Routings = 5
Num_capsule = 10  # passed to Capsule(num_capsule=...)
Dim_capsule = 16  # passed to Capsule(dim_capsule=...)
dropout_p = 0.25
rate_drop_dense = 0.28

def squash(x, axis=-1):
    """Rescale ``x`` by the inverse of its L2 norm along ``axis``.

    The result is ``x / sqrt(sum(x ** 2) + epsilon)``, where the sum runs over
    ``axis`` (kept as a size-1 dimension so the division broadcasts) and
    ``epsilon`` is the backend fuzz factor guarding against division by zero.

    An earlier variant used ``sqrt(n) / (0.5 + n) * x`` with
    ``n = sum(x ** 2) + epsilon``; the author's note on it reads
    "s_squared_norm is really small".
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())


# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with iterative routing, built on Keras backend ops.

    The layer consumes a 3D tensor ``(batch, input_num_capsule,
    input_dim_capsule)`` and produces ``(batch, num_capsule, dim_capsule)``.

    Arguments:
        num_capsule: number of output capsules.
        dim_capsule: length of each output capsule vector.
        routings: number of routing iterations; must be at least 1.
        kernel_size: stored on the layer and reported by ``get_config`` but
            not used by ``build`` or ``call``.
        share_weights: if True, one projection matrix is shared by every
            input position; otherwise each input position gets its own.
        activation: ``'default'`` selects ``squash``; any other value is
            handed to ``keras.layers.Activation``.
        **kwargs: forwarded to ``Layer.__init__``.

    Raises:
        ValueError: if ``routings`` is smaller than 1.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        if routings < 1:
            # With zero iterations call() would never assign its result.
            raise ValueError('routings must be >= 1, got %r' % (routings,))
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        # Keep the raw constructor argument so get_config() can reproduce it.
        self.activation_id = activation
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel mapping input capsules to output capsules."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # A width-1 convolution kernel: the same linear map at every position.
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # One independent linear map per input position.
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the input capsules and run ``self.routings`` routing rounds.

        NOTE(review): the axis arguments given to ``K.batch_dot`` rely on the
        semantics of the Keras version this notebook was written for; verify
        them before upgrading Keras.
        """
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero; shape = [None, num_capsule, input_num_capsule]
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])
        for i in range(self.routings):
            # Softmax acts on the last axis, so move num_capsule there, normalise
            # across output capsules, then restore the original layout.
            c = K.softmax(K.permute_dimensions(b, (0, 2, 1)))
            c = K.permute_dimensions(c, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # Agreement between the current outputs and each prediction
                # becomes the logits for the next round.
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return the static output shape ``(None, num_capsule, dim_capsule)``."""
        return (None, self.num_capsule, self.dim_capsule)

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer.

        Without this, a model written by ``model.save`` stores only the base
        layer config, and re-creating the layer fails because ``num_capsule``
        and ``dim_capsule`` are missing. Loading still needs
        ``custom_objects={'Capsule': Capsule}``.
        """
        config = {
            'num_capsule': self.num_capsule,
            'dim_capsule': self.dim_capsule,
            'routings': self.routings,
            'kernel_size': self.kernel_size,
            'share_weights': self.share_weights,
            'activation': self.activation_id,
        }
        base_config = super(Capsule, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))




In [5]:
# Training data: drop incomplete rows, then drop every row whose text occurs
# more than once (keep=False discards all copies, not just the repeats).
df = (
    pd.read_excel('augmented.xlsx')
    .dropna()
    .drop_duplicates(subset=['text'], keep=False)
)
# Development data: only incomplete rows are removed.
devdf = pd.read_excel('pp_dev.xlsx').dropna()

# DataFrame.info() prints its report itself and returns None, which is why a
# literal "None" follows each report in the cell output.
print(df.info())
print(devdf.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39512 entries, 0 to 11998
Data columns (total 2 columns):
text         39512 non-null object
Agr_Class    39512 non-null object
dtypes: object(2)
memory usage: 926.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2938 entries, 0 to 3000
Data columns (total 3 columns):
ID           2938 non-null object
text         2938 non-null object
Agr_Class    2938 non-null object
dtypes: object(3)
memory usage: 91.8+ KB
None


In [6]:
# FastText model used as a fallback source of word vectors.
codemixed_embeddings = FastText.load('augmented_ft.bin')
print(codemixed_embeddings )

# GloVe vectors: each line is a token followed by its coefficients.
# NOTE(review): the path is absolute and machine-specific; adjust before
# running elsewhere.
embeddings_index = dict()
# The context manager closes the file even if a malformed line raises.
with open('/home/parth/research/english/new_version/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

FastText(vocab=24411, size=100, alpha=0.025)
Loaded 400000 word vectors.


In [7]:
# The vocabulary is fitted on the training texts only; the dev texts are
# encoded with that same tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])

# Texts -> integer id sequences.
sequences = tokenizer.texts_to_sequences(df['text'])
sequences_d = tokenizer.texts_to_sequences(devdf['text'])

# Pad/truncate every sequence to 150 ids.
data = pad_sequences(sequences, maxlen=150)
data_dev = pad_sequences(sequences_d, maxlen=150)
print(data.shape)
print(data_dev.shape)

# One more than the number of known words (author's note on this line: 20648).
vocabulary_size = len(tokenizer.word_index) + 1

(39512, 150)
(2938, 150)


In [8]:
# Build the embedding weight matrix: row `index` holds the 100-d vector of the
# word with that tokenizer index. GloVe is tried first and the FastText model
# is the fallback; rows with no vector from either source stay zero.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index >= vocabulary_size:
        # Out of range for the matrix; skip instead of relying on dict order.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        try:
            # Look the word up through the model's KeyedVectors (`.wv`);
            # indexing the model object itself is deprecated. The lookup
            # raises KeyError instead of returning None when no vector
            # can be produced.
            embedding_vector = codemixed_embeddings.wv[word]
        except KeyError:
            embedding_vector = None
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

  # This is added back by InteractiveShellApp.init_path()


In [9]:
# Encode class labels as integer codes. The dev labels reuse the categorical
# dtype derived from the training labels, so a given integer means the same
# class in both splits even if the dev set is missing a class.
train_labels = df['Agr_Class'].astype('category')
y = train_labels.cat.codes
y_dev = devdf['Agr_Class'].astype(train_labels.dtype).cat.codes
if (y_dev < 0).any():
    # A code of -1 marks a dev label that never occurs in the training data.
    raise ValueError('devdf contains Agr_Class labels that are absent from the training data')

In [None]:
def define_model(length, vocabulary_size, embedding_matrix):
    """Build and compile the three-branch CNN + capsule classifier.

    Each branch has its own input of shape ``(length,)`` and applies, in order:
    a frozen 100-d ``Embedding`` initialised from ``embedding_matrix``, a
    ``Conv1D`` with 32 relu filters (kernel size 3, 4 and 5 for the three
    branches), ``Dropout(0.5)``, a ``Capsule`` layer sized by the globals
    ``Num_capsule`` / ``Dim_capsule`` with 3 routing rounds, and ``Flatten``.
    The branch outputs are concatenated and passed through ``Dense(32, relu)``
    and ``Dense(3, softmax)``.

    The model is compiled with categorical cross-entropy, Adam and an accuracy
    metric; its summary is printed before it is returned.

    Note: a later cell defines another function with this same name, which
    replaces this one once that cell has been run.
    """
    def build_branch(kernel_size):
        # Returns (input tensor, flattened capsule output) for one branch.
        branch_input = Input(shape=(length,))
        embedded = Embedding(vocabulary_size, 100, weights=[embedding_matrix],
                             trainable=False)(branch_input)
        features = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
        features = Dropout(0.5)(features)
        capsules = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule,
                           routings=3, share_weights=True)(features)
        return branch_input, Flatten()(capsules)

    # Branches are created one after another, in kernel-size order 3, 4, 5.
    branches = [build_branch(size) for size in (3, 4, 5)]
    branch_inputs = [branch_input for branch_input, _ in branches]
    branch_outputs = [branch_output for _, branch_output in branches]

    merged = concatenate(branch_outputs)
    hidden = Dense(32, activation='relu')(merged)
    predictions = Dense(3, activation='softmax')(hidden)

    model = Model(inputs=branch_inputs, outputs=predictions)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# Train for 20 single-epoch rounds, evaluating on the dev set after each round
# and checkpointing whenever dev accuracy improves.
model = define_model(150, vocabulary_size,embedding_matrix)

# One-hot targets do not change between epochs, so build them once. The width
# is pinned to the model's output size so both splits get the same number of
# columns even if a split is missing a class.
num_classes = model.output_shape[-1]
y_onehot = to_categorical(y, num_classes=num_classes)
y_dev_onehot = to_categorical(y_dev, num_classes=num_classes)

# The model has three inputs; all three receive the same padded sequences.
train_inputs = [data, data, data]
dev_inputs = [data_dev, data_dev, data_dev]

maxacc = 0
for i in range(1, 21):
    model.fit(train_inputs, y=y_onehot, epochs=1, verbose=1)
    # evaluate() returns [loss, accuracy]; index 1 is the accuracy metric.
    acc = model.evaluate(x=dev_inputs, y=y_dev_onehot)[1]
    pred = model.predict(dev_inputs)
    prd = np.argmax(pred, axis=1)
    print(confusion_matrix(y_dev, prd))
    print(classification_report(y_dev, prd))
    print("Accuracy : ", accuracy_score(y_dev, prd))
    if acc > maxacc:
        print(acc, i)
        maxacc = acc
        model.save_weights('aug_caps_weights.h5')
        model.save('aug_caps.h5')


In [None]:
def define_model(length, vocabulary_size, embedding_matrix):
    """Build and compile the three-branch CNN + capsule + BiLSTM classifier.

    Each branch has its own input of shape ``(length,)`` and applies, in order:
    a frozen 100-d ``Embedding`` initialised from ``embedding_matrix``, a
    ``Conv1D`` with 32 relu filters (kernel size 3, 4 and 5 for the three
    branches), ``Dropout(0.5)``, a ``Capsule`` layer sized by the globals
    ``Num_capsule`` / ``Dim_capsule`` with 3 routing rounds, and a
    ``Bidirectional(LSTM(300))`` that reads the capsule outputs (this takes
    the place of a plain ``Flatten``). The branch outputs are concatenated and
    passed through ``Dense(32, relu)`` and ``Dense(3, softmax)``.

    The model is compiled with categorical cross-entropy, Adam and an accuracy
    metric; its summary is printed before it is returned.

    Note: this definition shares its name with the function defined in an
    earlier cell and replaces it once this cell has been run.
    """
    def build_branch(kernel_size):
        # Returns (input tensor, BiLSTM summary of the capsule outputs).
        branch_input = Input(shape=(length,))
        embedded = Embedding(vocabulary_size, 100, weights=[embedding_matrix],
                             trainable=False)(branch_input)
        features = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
        features = Dropout(0.5)(features)
        capsules = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule,
                           routings=3, share_weights=True)(features)
        return branch_input, Bidirectional(LSTM(300))(capsules)

    # Branches are created one after another, in kernel-size order 3, 4, 5.
    branches = [build_branch(size) for size in (3, 4, 5)]
    branch_inputs = [branch_input for branch_input, _ in branches]
    branch_outputs = [branch_output for _, branch_output in branches]

    merged = concatenate(branch_outputs)
    hidden = Dense(32, activation='relu')(merged)
    predictions = Dense(3, activation='softmax')(hidden)

    model = Model(inputs=branch_inputs, outputs=predictions)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# Train for 20 single-epoch rounds, evaluating on the dev set after each round
# and checkpointing whenever dev accuracy beats the best so far (the starting
# bar of 0.50 means nothing is saved until accuracy exceeds 50%).
model = define_model(150, vocabulary_size,embedding_matrix)

# One-hot targets do not change between epochs, so build them once. The width
# is pinned to the model's output size so both splits get the same number of
# columns even if a split is missing a class.
num_classes = model.output_shape[-1]
y_onehot = to_categorical(y, num_classes=num_classes)
y_dev_onehot = to_categorical(y_dev, num_classes=num_classes)

# The model has three inputs; all three receive the same padded sequences.
train_inputs = [data, data, data]
dev_inputs = [data_dev, data_dev, data_dev]

maxacc = .50
for i in range(1, 21):
    model.fit(train_inputs, y=y_onehot, epochs=1, verbose=1)
    # evaluate() returns [loss, accuracy]; index 1 is the accuracy metric.
    acc = model.evaluate(x=dev_inputs, y=y_dev_onehot)[1]
    print('val: ', acc)
    pred = model.predict(dev_inputs)
    prd = np.argmax(pred, axis=1)
    print(confusion_matrix(y_dev, prd))
    print(classification_report(y_dev, prd))
    if acc > maxacc:
        print(acc, i)
        maxacc = acc
        model.save('reproduce_caps_bi.h5')
        # Weights go to their own file: writing them to 'reproduce_caps_bi.h5'
        # as well would overwrite the full model that was just saved there.
        model.save_weights('reproduce_caps_bi_weights.h5')