In [1]:
import pandas as pd
import numpy as np

## Using our local dataset:

In [5]:
train = pd.read_csv('./imdb_small.csv')

map_dict = {'negative':0,'positive':1}
train =train.replace({'sentiment':map_dict})  

train['review'] = train['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['review'].head()

0    i have to differ from the other comments poste...
1    i saw this movie with low expectations and was...
2    taran adarsh a reputed critic praised such a d...
3    when i first heard that the subject matter for...
4    with the release of peter jackson's famed "lor...
Name: review, dtype: object

### Tokenize only after doing all the necesarry preprocessing steps: (such as lower-casing, etc...)
- Tokenizer - Numbers for each word is assigned based on their frequency. Most occuring word takes the value of 1. '0' is used for padding.
- max_features = 10 - Only the top 9 (excluding '0' for padding) frequently occuring words are taken as features

In [138]:
# Preprocessing the dataset

import num2words
from textblob import Word
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def convertnum2words (sentence):
    new_sentence = sentence
    for i in sentence.split():
        if i.isdigit():
            sentence = sentence.replace(i,num2words.num2words(int(i)))
    return sentence

def countstopwords(sentence):
    count = 0
    for i in (sentence.split()):
        if i in stop_words:
            count +=1 
    return count

train['review'] = train['review'].apply(lambda x: x.replace('<br />','.'))
train['review'] = train['review'].str.replace('[^\w\s]','')
train['review'] = train['review'].apply(lambda x: convertnum2words(x))
train['review'] = train['review'].str.replace('-',' ')
train['review'] = train['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['review'] = train['review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
train['review'] = train['review'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [163]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_features = 10000 # vocabulary_size

tokenizer = Tokenizer(num_words=max_features, lower = True, split=' ')
tokenizer.fit_on_texts(train['review'])

x = tokenizer.texts_to_sequences(train['review'])

# By default Pre-Sequence Padding is followed - if you want to use Post Padding use padding='post'
x = pad_sequences(x) 

word_index = tokenizer.word_index
print(len(x[0]))

753


### Basic Neural Network:

In [169]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten

embedding_size=300
max_words = len(x[0])

model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(Flatten())
model.add(Dense(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

batch_size = 64
num_epochs = 3

from sklearn.model_selection import train_test_split
X_train2, X_valid, y_train2, y_valid = train_test_split(x,train['sentiment'], test_size = 0.33, random_state = 123)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 753, 300)          3000000   
_________________________________________________________________
flatten_3 (Flatten)          (None, 225900)            0         
_________________________________________________________________
dense_15 (Dense)             (None, 100)               22590100  
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 101       
Total params: 25,590,201
Trainable params: 25,590,201
Non-trainable params: 0
_________________________________________________________________
None
Train on 3350 samples, validate on 1650 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x2e854ac8d48>

### LSTM model:

In [168]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size=300
max_words = len(x[0])

model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

batch_size = 64
num_epochs = 3

from sklearn.model_selection import train_test_split
X_train2, X_valid, y_train2, y_valid = train_test_split(x,train['sentiment'], test_size = 0.33, random_state = 123)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 753, 300)          3000000   
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 101       
Total params: 3,160,501
Trainable params: 3,160,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 3350 samples, validate on 1650 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x2e85484f288>

## Using the IMDB dataset from keras:

In [174]:
from keras.datasets import imdb
vocabulary_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Loaded dataset with 25000 training samples, 25000 test samples


In [147]:
print("'Review':",X_train[0])
print("\n'Label':",y_train[0])

'Review': [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]

'Label': 1


In [148]:
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print("'Review':",[id2word.get(i, ' ') for i in X_train[0]])
print("\n'Label':",y_train[0])

'Review': ['the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their', 'becomes', 'reaching', 'had', 'journalist', 'of', 'lot', 'from', 'anyone', 'to', 'have', 'after', 'out', 'atmosphere', 'never', 'more', 'room', 'and', 'it', 'so', 'heart', 'shows', 'to', 'years', 'of', 'every', 'never', 'going', 'and', 'help', 'moments', 'or', 'of', 'every', 'chest', 'visual', 'movie', 'except', 'her', 'was', 'several', 'of', 'enough', 'more', 'with', 'is', 'now', 'current', 'film', 'as', 'you', 'of', 'mine', 'potentially', 'unfortunately', 'of', 'you', 'than', 'him', 'that', 'with', 'out', 'themselves', 'her', 'get', 'for', 'was', 'camp', 'of', 'you', 'movie', 'sometimes', 'movie', 'that', 'with', 'scary', 'but', 'and', 'to', 'story', 'wonderful', 'that', 'in', 'seeing', 'in', 'character', 'to', 'of', '70s', 'musicians', 'with', 'heart', 'had', 'shadows', 'they', 'of', 'here', 'that', 'with', 'her', 'serious', 'to', 'have', 'does', 'when', 'from', 'why', 'what', 'have', '

In [None]:
print('Maximum review length: {}'.format(len(max((X_train + X_test), key=len))))
print('Minimum review length: {}'.format(len(min((X_train + X_test), key=len))))

In [176]:
from keras.preprocessing import sequence
max_words = 1000

# By default Pre-Sequence Truncation is followed - if you want to use Post-Sequence Truncation use truncating='post'
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

### Basic Neural Network:

In [136]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten

batch_size = 64
num_epochs = 3
embedding_size=300

model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(Flatten())
model.add(Dense(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 1000, 300)         3000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 300000)            0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)               30000100  
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 101       
Total params: 33,000,201
Trainable params: 33,000,201
Non-trainable params: 0
_________________________________________________________________
None
Train on 16750 samples, validate on 8250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x2e72af5ac08>

### LSTM model:

In [177]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size=300

model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

batch_size = 64
num_epochs = 3

from sklearn.model_selection import train_test_split
X_train2, X_valid, y_train2, y_valid = train_test_split(X_train,y_train, test_size = 0.33, random_state = 123)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 1000, 300)         3000000   
_________________________________________________________________
lstm_12 (LSTM)               (None, 100)               160400    
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 101       
Total params: 3,160,501
Trainable params: 3,160,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 16750 samples, validate on 8250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x2e713e0c948>