# CNN Sentiment Classification 

### Data preprocessing 

In [3]:
import pandas as pd
import os
import re
import textblob
import tensorflow as tf
import numpy as np

We first import text files from train & test folders into a dataframe, on which data cleaning is done.

In [4]:
# Root folders of the IMDB corpus. The trailing slash matters: later cells
# build file paths by plain string concatenation (e.g. train_path+'pos').
train_path, test_path = 'aclImdb/train/', 'aclImdb/test/'

In [5]:
# Import the training reviews into a dataframe.
# Each sub-folder holds one review per file; reviews under 'pos' get the
# label '1' and reviews under 'neg' the label '0'. Labels stay strings
# because the evaluation cell compares predictions against '0'/'1'.
text = []
rating = []
for folder, label in (('pos', '1'), ('neg', '0')):
    folder_path = train_path + folder
    # os.listdir returns names in arbitrary order; sorting keeps the row
    # numbers (and hence the seeded train/validation split) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        # The context manager closes every file handle as soon as it is read.
        with open(folder_path + '/' + filename, 'r', encoding='ISO-8859-1') as handle:
            text.append(handle.read())
        rating.append(label)

indices = list(range(len(text)))
Dataset = list(zip(indices, text, rating))
df = pd.DataFrame(data=Dataset, columns=['row_Number', 'review', 'sentiment'])

print(df.shape)

(25000, 3)


In [6]:
# Import the test reviews into a dataframe.
# Same layout as the training folder: 'pos' -> label '1', 'neg' -> label '0'.
text = []
rating = []
for folder, label in (('pos', '1'), ('neg', '0')):
    folder_path = test_path + folder
    # Sorted so the row order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(folder_path)):
        # The context manager closes every file handle as soon as it is read.
        with open(folder_path + '/' + filename, 'r', encoding='ISO-8859-1') as handle:
            text.append(handle.read())
        rating.append(label)

indices = list(range(len(text)))
Dataset = list(zip(indices, text, rating))
dftest = pd.DataFrame(data=Dataset, columns=['row_Number', 'review', 'sentiment'])

In [7]:
# Data cleaning on the training reviews.
# NOTE: df['review'] is overwritten in place, so re-running this cell removes
# a further 10 "most frequent" words each time. Run it once per fresh load.
from nltk.corpus import stopwords
from textblob import Word

stop = stopwords.words('english')
# Set membership is O(1) per token; testing against the list was O(len(stop)).
stop_lookup = set(stop)
# NOTE(review): punctuation is stripped *before* stop-word filtering, so any
# stop word that itself contains punctuation (e.g. contractions, if the NLTK
# list includes them) can no longer match. Confirm this ordering is intended.

# 1. lower-case every token and collapse runs of whitespace
df['review'] = df['review'].apply(lambda review: " ".join(token.lower() for token in review.split()))
# 2. drop punctuation and underscores
df['review'] = df['review'].map(lambda review: re.sub(r'([^\s\w]|_)+', '', review))
# 3. drop stop words
df['review'] = df['review'].apply(lambda review: " ".join(token for token in review.split() if token not in stop_lookup))
# 4. lemmatize each remaining token
df['review'] = df['review'].apply(lambda review: " ".join([Word(token).lemmatize() for token in review.split()]))
# 5. drop the 10 most frequent tokens of the cleaned corpus
freq = pd.Series(' '.join(df['review']).split()).value_counts()[:10] #print(freq) to check the words
# `token in freq` tested the Series index (the words); make that explicit.
frequent_words = set(freq.index)
df['review'] = df['review'].apply(lambda review: " ".join(token for token in review.split() if token not in frequent_words))
# Rare-word removal was considered and left out: only the most frequent words are dropped.

In [8]:
# Data cleaning on the test reviews (same steps as for the training reviews).
# NOTE: dftest['review'] is overwritten in place, so re-running this cell
# removes a further 10 "most frequent" words each time.
from nltk.corpus import stopwords
from textblob import Word

stop = stopwords.words('english')
# Set membership is O(1) per token; testing against the list was O(len(stop)).
stop_lookup = set(stop)

# 1. lower-case every token and collapse runs of whitespace
dftest['review'] = dftest['review'].apply(lambda review: " ".join(token.lower() for token in review.split()))
# 2. drop punctuation and underscores
dftest['review'] = dftest['review'].map(lambda review: re.sub(r'([^\s\w]|_)+', '', review))
# 3. drop stop words
dftest['review'] = dftest['review'].apply(lambda review: " ".join(token for token in review.split() if token not in stop_lookup))
# 4. lemmatize each remaining token
dftest['review'] = dftest['review'].apply(lambda review: " ".join([Word(token).lemmatize() for token in review.split()]))
# 5. drop the 10 most frequent tokens
# NOTE(review): the 10 words are chosen from the test set's own counts, so
# they may differ from the 10 removed from the training reviews. Consider
# reusing the training list so both sets are preprocessed identically.
freq = pd.Series(' '.join(dftest['review']).split()).value_counts()[:10] #print(freq) to check the words
# `token in freq` tested the Series index (the words); make that explicit.
frequent_words = set(freq.index)
dftest['review'] = dftest['review'].apply(lambda review: " ".join(token for token in review.split() if token not in frequent_words))

After cleaning, we will create corresponding data for train, validation (will be denoted as 'val') and test. 

In [9]:
# Split df into train and validation data.
# `sklearn.cross_validation` was removed from scikit-learn; the same function
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

x = df.review
y = df.sentiment

SEED = 2000
# 90% train / 10% validation, reproducible through SEED.
# The former second split with test_size=0 only produced an empty test part
# (the next cell takes the test set from dftest) and is rejected by newer
# scikit-learn versions, so the hold-out is used directly as validation data.
x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=0.9, random_state=SEED)



In [10]:
# The provided test folder is the held-out test set: reviews are the inputs,
# sentiment labels ('0'/'1' strings) are the targets.
x_test, y_test = dftest['review'], dftest['sentiment']

### Train Word2Vec models

In this step, two Word2Vec models are trained: one with the Continuous Bag of Words (CBOW) architecture and one with the Skip-gram architecture.

In [11]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils



In [12]:
#label the reviews for training
def labelize_review_ug(review,label):
    """Wrap each review in a TaggedDocument.

    review: object exposing `.index` and iterating over review strings
            (the notebook passes a pandas Series).
    label:  string prefix for the tag; each tag is '<label>_<index value>'.

    Returns a list with one TaggedDocument per review, whose words are the
    whitespace-split tokens of the review and whose tags hold the single tag.
    """
    return [
        TaggedDocument(body.split(), [label + '_%s' % row_id])
        for row_id, body in zip(review.index, review)
    ]

In [13]:
# Word2Vec is trained on the text of every split (train, validation and test).
# NOTE: x_test comes from dftest, whose index restarts at 0, so 'all_<index>'
# tags can repeat across splits; the Word2Vec cells only read `.words`.
all_x = pd.concat([x_train,x_val,x_test])
all_x_w2v = labelize_review_ug(all_x, 'all')

In [14]:
# CBOW model (sg=0) with 100-dimensional vectors; one worker per CPU core.
# alpha and min_alpha start equal because the training loop adjusts the
# learning rate manually.
cores = multiprocessing.cpu_count()
cbow_corpus = [doc.words for doc in tqdm(all_x_w2v)]
model_ug_cbow = Word2Vec(
    sg=0, size=100, negative=5, window=2, min_count=2,
    workers=cores, alpha=0.065, min_alpha=0.065,
)
model_ug_cbow.build_vocab(cbow_corpus)

100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564501.74it/s]


Training Word2Vec using both methods.

In [17]:
%%time
# 30 single-epoch passes over a freshly shuffled corpus. After each pass the
# learning rate is lowered by 0.002 and min_alpha is pinned to the same value.
for _ in range(30):
    shuffled_corpus = utils.shuffle([doc.words for doc in tqdm(all_x_w2v)])
    model_ug_cbow.train(shuffled_corpus, total_examples=len(all_x_w2v), epochs=1)
    model_ug_cbow.alpha -= 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha

100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564536.75it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1788051.53it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1788112.51it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1788036.29it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1788082.02it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564595.11it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564513.41it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1788234.49it/s]
100%|███████████████████████████████████

Wall time: 3min 7s


In [18]:
# Skip-gram model (sg=1); every other hyper-parameter matches the CBOW model.
sg_corpus = [doc.words for doc in tqdm(all_x_w2v)]
model_ug_sg = Word2Vec(
    sg=1, size=100, negative=5, window=2, min_count=2,
    workers=cores, alpha=0.065, min_alpha=0.065,
)
model_ug_sg.build_vocab(sg_corpus)

100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1251687.30it/s]


In [19]:
%%time
# 30 single-epoch passes over a freshly shuffled corpus. After each pass the
# learning rate is lowered by 0.002 and min_alpha is pinned to the same value.
for _ in range(30):
    shuffled_corpus = utils.shuffle([doc.words for doc in tqdm(all_x_w2v)])
    model_ug_sg.train(shuffled_corpus, total_examples=len(all_x_w2v), epochs=1)
    model_ug_sg.alpha -= 0.002
    model_ug_sg.min_alpha = model_ug_sg.alpha

100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564245.01it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1787975.31it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1788143.01it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564606.79it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564676.83it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1788112.51it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564688.50it/s]
100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1564630.13it/s]
100%|███████████████████████████████████

Wall time: 5min 43s


In [20]:
# Persist both trained models to the working directory so the CNN section can
# reload them without repeating the Word2Vec training.
model_ug_cbow.save('w2v_model_ug_cbow.word2vec')
model_ug_sg.save('w2v_model_ug_sg.word2vec')

### Preparation for CNN
Next we prepare for CNN training: first we load the two Word2Vec models with the gensim library and concatenate their word vectors.

In [21]:
from gensim.models import KeyedVectors, Word2Vec

# The files were written with Word2Vec.save, i.e. they hold full models.
# Load them with the matching Word2Vec.load so the `.wv` attribute used in the
# next cell is guaranteed, rather than relying on KeyedVectors.load returning
# whatever object was pickled.
model_ug_cbow = Word2Vec.load('w2v_model_ug_cbow.word2vec')
model_ug_sg = Word2Vec.load('w2v_model_ug_sg.word2vec')

In [22]:
# Concatenate the vectors of the two models: for every word in the CBOW
# vocabulary, its CBOW vector is followed by its skip-gram vector.
embeddings_index = {
    word: np.append(model_ug_cbow.wv[word], model_ug_sg.wv[word])
    for word in model_ug_cbow.wv.vocab.keys()
}
print('Found %s word vectors.' % len(embeddings_index))

Found 73744 word vectors.


In [23]:
from keras.preprocessing.text import Tokenizer #to split the words in a sentence
from keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training reviews only, then turn each training
# review into a sequence of integer word ids.
tokenizer = Tokenizer(num_words=100000)#take only the 100k most frequent words
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

In [24]:
# Number of whitespace-separated words in every training review; the maximum
# is used to choose the padding length.
length = [len(review.split()) for review in x_train]

In [25]:
max(length) # longest training review in words; the padding length must be at least this

1423

In [26]:
# Bring every training sequence to a fixed length of 1425, slightly above the
# longest training review (1423 words).
x_train_seq = pad_sequences(sequences,maxlen=1425)
print('Shape of tensor is',x_train_seq.shape)

Shape of tensor is (22500, 1425)


In [27]:
# Encode the validation reviews with the tokenizer fitted on the training
# reviews and bring them to the same fixed length (1425) as the training data.
sequences_val = tokenizer.texts_to_sequences(x_val)
x_val_seq = pad_sequences(sequences_val,maxlen=1425)

In [28]:
# Build the initial weights of the embedding layer: row i holds the 200-d
# concatenated Word2Vec vector of the word with tokenizer id i. Rows of words
# without a trained vector, or with an id beyond the limit, stay all-zero.
num_words = 100000 #limitation of 100k most frequent words
embedding_matrix = np.zeros((num_words, 200))
for token, token_id in tokenizer.word_index.items():
    vector = embeddings_index.get(token) if token_id < num_words else None
    if vector is not None:
        embedding_matrix[token_id] = vector

### CNN

Two CNN models were trained and compared: a simple model with a single 1D convolution layer, and a model built with the functional API. The second model performed slightly better than the simple one, so we will go with it.

In [29]:
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Embedding,Dense,Dropout,Activation,Input

In [30]:
from keras.models import Model
from keras.layers import concatenate

# Take the dimensions from the prepared arrays instead of repeating literals,
# so the network always matches the padded inputs and the embedding matrix.
seq_len = x_train_seq.shape[1]
vocab_size, embed_dim = embedding_matrix.shape

# Shared input and embedding layer, initialised with the Word2Vec vectors and
# fine-tuned during training (trainable=True).
data = Input(shape=(seq_len,), dtype='int32')
encoder = Embedding(vocab_size, embed_dim, weights=[embedding_matrix], input_length=seq_len, trainable=True)(data)

# Bigram branch: convolution over 2 consecutive tokens, max-pooled over the sequence.
layer1 = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(encoder)
layer1 = GlobalMaxPooling1D()(layer1)

# Trigram branch: convolution over 3 consecutive tokens, max-pooled over the sequence.
layer2 = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(encoder)
layer2 = GlobalMaxPooling1D()(layer2)

# Merge both branches and classify with a dense head ending in a sigmoid unit.
merged = concatenate([layer1, layer2], axis=1)
merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)

model = Model(inputs=[data], outputs=[output])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_val), epochs=5, batch_size=32, verbose=2)
#model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 22500 samples, validate on 2500 samples
Epoch 1/5
 - 610s - loss: 0.3599 - acc: 0.8378 - val_loss: 0.3329 - val_acc: 0.8556
Epoch 2/5
 - 605s - loss: 0.1632 - acc: 0.9386 - val_loss: 0.2804 - val_acc: 0.8928
Epoch 3/5
 - 603s - loss: 0.0425 - acc: 0.9868 - val_loss: 0.3241 - val_acc: 0.8948
Epoch 4/5
 - 600s - loss: 0.0124 - acc: 0.9964 - val_loss: 0.4175 - val_acc: 0.9008
Epoch 5/5
 - 602s - loss: 0.0070 - acc: 0.9977 - val_loss: 0.4581 - val_acc: 0.8944


<keras.callbacks.History at 0x1bb9efc0cc0>

In [None]:
"""Simple CNN
cnn = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=1425, trainable=True)
cnn.add(e)
cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
cnn.add(GlobalMaxPooling1D())
cnn.add(Dropout(0.2))
cnn.add(Dense(256, activation='relu'))
cnn.add(Dense(1, activation='sigmoid'))
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_val), epochs=5, batch_size=32, verbose=2)
"""

### Testing

In [32]:
# Encode the test reviews with the tokenizer fitted on the training reviews
# and bring them to the same fixed length (1425) as the training data.
sequences_test = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences_test, maxlen=1425)

In [46]:
from sklearn.metrics import precision_recall_fscore_support

# Threshold the sigmoid outputs at 0.5. Labels are the strings '0'/'1' so they
# compare directly with y_test. Iterating over the flattened predictions
# avoids hard-coding the number of test reviews.
y_pred = model.predict(x_test_seq)
prediction = ['0' if score < 0.5 else '1' for score in y_pred.ravel()]

# Per-class precision, recall, F-score and support.
precision_recall_fscore_support(y_test, prediction)

(array([0.87001638, 0.88927194]),
 array([0.89208, 0.86672]),
 array([0.88091006, 0.87785115]),
 array([12500, 12500], dtype=int64))

In [35]:
# Evaluate on the test set; the model was compiled with binary cross-entropy
# loss and the accuracy metric, which is what the two reported numbers are.
model.evaluate(x=x_test_seq, y=y_test)



[0.542779450494945, 0.8794]

### References

Data Import - https://github.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/blob/master/driver_3.py

Word2Vec Training - https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-10-neural-network-with-a6441269aa3c