**Importing Libraries**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
from keras.layers import Embedding
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.callbacks import CSVLogger
from keras import metrics

#FOR TA TO LOAD MODEL :)
from keras.models import load_model

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

Using TensorFlow backend.


Declaring performance evaluation metrics

Reading Google WordtoVec Model. It is 1.5 GB and takes some time (about 5 minutes)

In [2]:
word_vectors = KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


**config Cell**

In [0]:
conf = {
    'EMB_DIM':300,
    'WordsTake':20000,
    'activationType':'relu',
    'epochNum':200
}

**Loading train.tsv**
Loading the entire dataset into rawdata variable.
Drop the first two columns such as ids and sentimentid which aren't sueful for our model

In [21]:
rawdata = pd.read_csv('/content/drive/My Drive/Desktop/NLP Assignment 2/train.tsv', sep='\t', header=0)

dataset = pd.DataFrame(rawdata,columns=['Phrase','Sentiment'])

dataset.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


Spliting the dataset into x and y that are phrase and sentiments.
Using sklearn to split the data into traning and testing sets.

In [0]:
dataset = dataset.dropna()

x = pd.DataFrame(rawdata,columns=['Phrase'])
y = pd.DataFrame(rawdata,columns=['Sentiment'])

Processing the dataset. Vercotization and removal of stop words

In [0]:
polarities=dataset.Sentiment.unique()
# dic={'very negative','negative','neutral','positive','very positive'}
dic={}
for i,polarity in enumerate(polarities):
    dic[polarity]=i
labels=dataset.Sentiment.apply(lambda x:dic[x])

Splliting the dataset using sklearn.model_selection

In [0]:
#Spliting
_x_train, _x_test, _y_train, _y_test = train_test_split(x, y, test_size=0.3, random_state=2003)

In [8]:

tokenizer = Tokenizer(num_words=conf['WordsTake'],filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer.fit_on_texts(_x_train.Phrase)

sequences_train = tokenizer.texts_to_sequences(_x_train.Phrase)
sequences_valid=tokenizer.texts_to_sequences(_x_test.Phrase)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 15252 unique tokens.


In [9]:
X_train = pad_sequences(sequences_train)
X_val = pad_sequences(sequences_valid,maxlen=X_train.shape[1])
y_train = to_categorical(np.asarray(labels[_x_train.index]))
y_val = to_categorical(np.asarray(labels[_x_test.index]))
print('Shape of X train and X validation tensor:', X_train.shape,X_val.shape)
print('Shape of label train and validation tensor:', y_train.shape,y_val.shape)

Shape of X train and X validation tensor: (109242, 48) (46818, 48)
Shape of label train and validation tensor: (109242, 5) (46818, 5)


In [10]:

vocabulary_size=min(len(word_index)+1,conf['WordsTake'])
embedding_matrix = np.zeros((vocabulary_size, conf['EMB_DIM']))
for word, i in word_index.items():
    if i>=conf['WordsTake']:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),conf['EMB_DIM'])

del(word_vectors)

from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            conf['EMB_DIM'],
                            weights=[embedding_matrix],
                            trainable=True)




In [0]:
vocabulary_size=min(len(word_index)+1,conf['WordsTake'])

embedding_layer = Embedding(vocabulary_size,
                            conf['EMB_DIM'])

In [0]:
def cnnModel(act):
  sequence_length = X_train.shape[1]
  filter_sizes = [3,4,5]
  num_filters = 100
  drop = 0.5

  inputs = Input(shape=(sequence_length,))
  embedding = embedding_layer(inputs)
  reshape = Reshape((sequence_length,conf['EMB_DIM'],1))(embedding)

  conv_0 = Conv2D(num_filters, (filter_sizes[0], conf['EMB_DIM']),activation=act,kernel_regularizer=regularizers.l2(0.01))(reshape)
  conv_1 = Conv2D(num_filters, (filter_sizes[1], conf['EMB_DIM']),activation=act,kernel_regularizer=regularizers.l2(0.01))(reshape)
  conv_2 = Conv2D(num_filters, (filter_sizes[2], conf['EMB_DIM']),activation=act,kernel_regularizer=regularizers.l2(0.01))(reshape)


  maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
  maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)
  maxpool_2 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2)

  merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
  # merged_tensor = concatenate([maxpool_0, maxpool_1], axis=1)
  flatten = Flatten()(merged_tensor)
  reshape = Reshape((3*num_filters,))(flatten)
  dropout = Dropout(drop)(flatten)
  output = Dense(units=5, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
  # this creates a model that includes
  return Model(inputs, output)

In [13]:
model = cnnModel(conf['activationType'])





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [14]:
def recall_m(y_true, y_pred):
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = tp / (possible_positives + K.epsilon())
    return recall

def p_m(y_true, y_pred):
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = tp / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = p_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

adam = Adam(lr=1e-3)

model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc',f1_m,p_m, recall_m])





Traning the Model. Please check training.log for results

In [23]:
csv_logger = CSVLogger('training.log')

dependencies = {
     'f1_m': f1_m,
     'p_m': p_m,
     'recall_m': recall_m,
}

model.fit(X_train, y_train, nb_epoch=2, batch_size=900, verbose=1, callbacks=[csv_logger], shuffle=True)

model.save('my_model.h5') 

  # Remove the CWD from sys.path while we load stuff.


Epoch 1/2
Epoch 2/2


In [24]:
# evaluate the model
loss, accuracy, f1_score, precision, recall = model.evaluate(X_val, y_val, verbose=1)
print('accuracy',accuracy)
print('f1_score',f1_score)
print('precision',precision)
print('recall',recall)

accuracy 0.6439830834294502
f1_score 0.6406792494975607
precision 0.653772721210535
recall 0.6285189457046435
