# EMBEDDINGS + CONV1D

In [1]:
import re
import string
import pandas as pd
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('IMDB_Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df['sentiment'] = df['sentiment'].map( {'negative':0, 'positive':1})

### PRE-PROCESSING

In [5]:
def clean_process(review):
    
    # Removing <br><br />
    html = re.compile(r"<.*?>")
    review = html.sub(r" ", review)
    
    # Converting into lowercase
    review = review.lower()
    
    # Removing Stopwords
    stop = set(stopwords.words("english"))
    
    review = [word.lower() for word in review.split() if word.lower() not in stop]
    review = " ".join(review)
    
    # Removing punctuation
    table = str.maketrans("", "", string.punctuation)
    review = review.translate(table)
    
    return review

In [6]:
df['review'] = df['review'].map(lambda x: clean_process(x))

### TEXT ENCODING

In [7]:
from tensorflow.keras.preprocessing import text, sequence

In [8]:
vocab = 25000        # Choosing top 25k words as vocabulary
max_length = 200   # Max length of each review(input). Rest will be padded
embedding_dim = 100  # Word Embedding dimension

In [9]:
X = df['review'].values
y = df['sentiment'].values

In [10]:
tokenizer = text.Tokenizer(vocab)

tokenizer.fit_on_texts(list(X))
X_tokenized = tokenizer.texts_to_sequences(X)

In [11]:
# Padding is important as NN cannot handle input with different length
X_tokenized = sequence.pad_sequences(X_tokenized, maxlen=max_length)  

### CREATING MODEL

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tokenized, y, test_size=0.1, random_state=42)

In [14]:
import tensorflow as tf

tf.config.list_physical_devices('GPU')  # GPU is present

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [15]:
model = Sequential()

model.add(Embedding(vocab, embedding_dim))
model.add(Dropout(0.2))

model.add(Conv1D(250,3,padding='valid'))
model.add(MaxPooling1D())

model.add(Conv1D(250,5,padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(1, activation='sigmoid'))

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         2500000   
_________________________________________________________________
dropout (Dropout)            (None, None, 100)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 250)         75250     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 250)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 250)         312750    
_________________________________________________________________
global_max_pooling1d (Global (None, 250)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               3

In [17]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.fit(X_train, y_train, batch_size=32, validation_data=(X_test,y_test), epochs=2)

Train on 45000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x282897d1c88>

### PREDICTIONS + ACCURACY-SCORE

In [19]:
predictions = model.predict_classes(X_test, verbose=1, batch_size=32)



In [20]:
from sklearn.metrics import accuracy_score, classification_report

In [21]:
accuracy_score(y_test, predictions)

0.889

In [22]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.92      0.85      0.88      2481
           1       0.86      0.93      0.89      2519

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

