In [1]:
import numpy as np
import pandas as pd
import string
from gensim.models import Word2Vec
import nltk
nltk.download('punkt') 

In [2]:
df = pd.read_csv('IMDB Dataset.csv', sep = ",", skipinitialspace=True, engine="python")

# Remove <br /><br/> from 'review' column
df['review'] = df['review'].str.replace('<br /><br/>', '')

# Remove punctuations from 'review' column
punctuations = string.punctuation
df['review'] = df['review'].apply(lambda x: ''.join([c for c in x if c not in punctuations]))

In [3]:
# Tokenize the reviews into lists of words
df['tokenized_review'] = df['review'].apply(lambda x: nltk.word_tokenize(x.lower()))  # Assuming 'review' column

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['tokenized_review'], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

# Function to calculate average word embedding for a review
def average_word_embedding(review):
    words = nltk.word_tokenize(review.lower())
    word_embeddings = [word_vectors[word] for word in words if word in word_vectors]
    if len(word_embeddings) > 0:
        return np.mean(word_embeddings, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)  # Return zero vector if no embeddings found

# Apply function to 'review' column
df['review_embedding'] = df['review'].apply(average_word_embedding)

# Map 'sentiment' column to 0 or 1 for negative and positive respectively
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\camd1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
df.sample(5)

Unnamed: 0,review,sentiment,tokenized_review,review_embedding
45610,24 has got to be the best spyadventure series ...,1,"[24, has, got, to, be, the, best, spyadventure...","[-0.95238674, -0.31178418, -0.1588415, 0.09023..."
6830,New York has never looked so good And neither ...,1,"[new, york, has, never, looked, so, good, and,...","[-1.3106143, -0.086647525, -0.10170972, 0.4157..."
29118,Aside from a few good moments of fairly raw vi...,0,"[aside, from, a, few, good, moments, of, fairl...","[-0.8977181, 0.14631343, -0.08662392, 0.336782..."
32129,Cheezy Yep Poorly filmed You betcha Zero budge...,1,"[cheezy, yep, poorly, filmed, you, betcha, zer...","[-1.0324874, -0.12533535, -0.39670172, 0.18902..."
523,Today I wrote this review in anger at Uwe Boll...,0,"[today, i, wrote, this, review, in, anger, at,...","[-0.7291712, 0.097294055, -0.29913384, 0.51765..."
29261,50 years old this musical comedy fantasy might...,1,"[50, years, old, this, musical, comedy, fantas...","[-0.9775545, 0.0024162694, -0.3393744, 0.07167..."
28066,As someone who was in a PanHellenic sorority I...,1,"[as, someone, who, was, in, a, panhellenic, so...","[-1.0431981, 0.12176414, -0.29720336, 0.308971..."
7610,I was shocked to learn that Jimmy Caan has lef...,1,"[i, was, shocked, to, learn, that, jimmy, caan...","[-1.1258941, -0.18171099, -0.050577693, 0.4533..."
29892,Well Im probably about to be lambasted by ever...,0,"[well, im, probably, about, to, be, lambasted,...","[-0.99685186, -0.037667006, -0.2719241, 0.2864..."
18601,Its nice to see Julie Andrews trying a straigh...,0,"[its, nice, to, see, julie, andrews, trying, a...","[-0.97960305, 0.12888892, -0.43117398, 0.14444..."


In [5]:
selected_columns = ['review_embedding', 'sentiment']
data = df[selected_columns].values

In [6]:
data

array([[array([-0.9513426 , -0.02397121, -0.12255315,  0.3354263 , -1.4037868 ,
                1.0548606 , -0.4865392 ,  0.13351527, -0.08536989, -0.23059778,
               -0.09795094,  0.03521045,  1.0278268 ,  0.06668427, -0.02125504,
               -1.126511  ,  0.32466528, -0.49758458, -0.24695927, -0.0548237 ,
                0.41318488,  0.13398802, -0.1160256 , -0.33896014,  0.14557794,
               -0.34403348,  0.2824129 ,  0.89386445, -0.3502002 ,  0.2222087 ,
               -0.3336854 , -0.62548083, -0.19142842,  0.20304705,  0.69746584,
                0.3672135 ,  0.6781977 ,  0.25282654, -0.13364556, -0.22052471,
               -0.38821697,  0.25884607, -0.11557993, -0.2729749 , -0.36372152,
               -0.12730147, -0.32230228,  0.02983576, -0.23655455, -0.40755767,
               -0.91136086, -0.41270146, -0.22263095, -0.30263928,  0.17271195,
               -0.5551144 , -0.64421606,  0.12812427,  0.15286282, -0.3036409 ,
               -0.24587585,  0.01136061,

In [35]:
# Extract 100 columns from 'review_embedding' as individual columns
embedding_columns = pd.DataFrame(df['review_embedding'].to_list())
in_df= pd.concat([df['sentiment'], embedding_columns], axis=1)

# Rename the columns if needed
in_df.columns = ['sentiment'] + [f'embedding_{i+1}' for i in range(100)]

In [36]:
in_df.head()

Unnamed: 0,sentiment,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_91,embedding_92,embedding_93,embedding_94,embedding_95,embedding_96,embedding_97,embedding_98,embedding_99,embedding_100
0,1,-0.951343,-0.023971,-0.122553,0.335426,-1.403787,1.054861,-0.486539,0.133515,-0.08537,...,-0.982816,-0.032958,0.612424,-0.180149,0.521195,-0.452071,-0.565828,-0.122856,0.764298,0.998624
1,1,-1.029693,0.270748,-0.492355,0.057257,-1.347134,1.029047,-0.498152,0.147799,-0.40881,...,-0.594692,0.192804,0.292619,-0.040247,0.593361,-0.579156,-0.49077,-0.257338,0.62024,1.014322
2,1,-0.987237,-0.140457,-0.054182,0.18687,-1.52212,1.220173,-0.665484,0.201629,-0.433409,...,-0.946182,0.135247,0.864433,-0.267331,0.447788,-0.46244,-0.654169,0.027405,0.938125,0.969377
3,0,-0.940849,-0.083448,-0.337355,0.174758,-1.464907,1.069736,-0.803298,0.144738,-0.15474,...,-0.839485,-0.050215,0.538085,-0.202293,0.503333,-0.282043,-0.685682,-0.032535,0.561879,1.201313
4,1,-0.889705,0.043528,-0.176583,0.395761,-1.404079,1.060009,-0.784908,0.202374,-0.296726,...,-0.839166,0.160202,0.272361,-0.144758,0.363514,-0.80121,-0.502503,-0.270011,0.631851,1.014732


In [37]:
data = np.array(in_df)

In [38]:
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into dev and training sets

data_dev = data[0:10000].T
Y_dev = data_dev[0].astype(int)
X_dev = data_dev[1:n]

data_train = data[10000:m].T
Y_train = data_train[0].astype(int)
X_train = data_train[1:n]
_,m_train = X_train.shape

In [39]:
Y_train

array([1, 1, 1, ..., 0, 0, 0])

In [34]:
X_train[:,0].shape,X_train

((100,),
 array([[-0.87956768, -0.73631531, -0.8545354 , ..., -0.97417092,
         -1.0056839 , -0.86324763],
        [-0.02661741, -0.17403877,  0.02949989, ..., -0.01044715,
          0.06447854,  0.08232632],
        [-0.16744909, -0.04653493, -0.12394542, ..., -0.34143135,
         -0.4395932 , -0.31388256],
        ...,
        [-0.19976753, -0.23420981,  0.10240328, ..., -0.00171843,
         -0.08190659,  0.09097137],
        [ 0.55659956,  0.8287304 ,  0.68362796, ...,  0.82526076,
          0.79880774,  1.0200634 ],
        [ 0.87457335,  0.91303498,  0.75762075, ...,  0.97691941,
          0.90859276,  0.92360681]]))

In [44]:
# Initialize parameters from -0.5 to 0.5
def init_params():
    W1 = np.random.rand(2, 100) - 0.5
    b1 = np.random.rand(2, 1) - 0.5
    W2 = np.random.rand(2, 2) - 0.5
    b2 = np.random.rand(2, 1) - 0.5
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A
    
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def ReLU_deriv(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

In [42]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2

In [45]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.10, 500)

Iteration:  0
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.505425
Iteration:  10
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.4995
Iteration:  20
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.498175
Iteration:  30
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.4987
Iteration:  40
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.49915
Iteration:  50
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.50025
Iteration:  60
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.500975
Iteration:  70
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.50185
Iteration:  80
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.5023
Iteration:  90
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.50215
Iteration:  100
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.50265
Iteration:  110
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.5027
Iteration:  120
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.503475
Iteration:  130
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.50405
Iteration:  140
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.5045
Iteration:  150
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]
0.504875
Iteration:  160
[0 0 0 ... 0 0 1] [1 1 1 ... 0 0 0]

~80% accuracy on training set.

In [49]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

def test_prediction(index, W1, b1, W2, b2):
    current_review = X_train[:, index, None]
    prediction = make_predictions(X_train[:, index, None], W1, b1, W2, b2)
    label = Y_train[index]
    print("Prediction: ", prediction)
    print("Label: ", label)

Let's look at a couple of examples:

In [50]:
test_prediction(0, W1, b1, W2, b2)
test_prediction(1, W1, b1, W2, b2)
test_prediction(2, W1, b1, W2, b2)
test_prediction(3, W1, b1, W2, b2)

Prediction:  [0]
Label:  1
Prediction:  [1]
Label:  1
Prediction:  [1]
Label:  1
Prediction:  [1]
Label:  1


Finally, let's find the accuracy on the dev set:

In [51]:
dev_predictions = make_predictions(X_dev, W1, b1, W2, b2)
get_accuracy(dev_predictions, Y_dev)

[0 0 0 ... 1 1 0] [0 0 0 ... 1 1 0]


0.795

Still ~80% accuracy, so our model generalized from the training data pretty well.