In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
#!pip install --upgrade keras

In [3]:
import pandas as pd
import numpy as np
import nltk

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization



In [4]:
reviews = pd.read_csv('https://raw.githubusercontent.com/nursnaaz/GoogletoChatgpt/main/05.%20Building%20Text%20Classification/Sentiment%20Analysis/movie_reviews.csv')

In [5]:
reviews.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
reviews.sample(3000).sentiment.value_counts()

negative    1530
positive    1470
Name: sentiment, dtype: int64

In [7]:
reviews_sample = reviews.sample(3000)

In [8]:
reviews_sample.sentiment.value_counts()

positive    1523
negative    1477
Name: sentiment, dtype: int64

In [9]:
reviews_sample = reviews_sample.reset_index().drop(columns = 'index')

In [10]:
reviews_sample

Unnamed: 0,review,sentiment
0,I'm a Black man living in a predominantly Blac...,negative
1,"I saw this a good while ago, but i just cant g...",positive
2,This is one of the silliest movies I have ever...,negative
3,In the unlikely case that some aspiring direct...,negative
4,I gave this two stars for the awesome DV shot ...,negative
...,...,...
2995,I have wanted to see this for the longest time...,positive
2996,This movie down-shifts from 4th into 1st witho...,negative
2997,This is a Japanese film but there is quite a b...,negative
2998,This is a very good movie. Do you want to know...,positive


In [11]:
import nltk
import re
from bs4 import BeautifulSoup

stop_words = nltk.corpus.stopwords.words('english')


def strip_html(doc):
    soup = BeautifulSoup(doc,"html.parser")
    text = soup.get_text()
    return text


def normalize_document(doc):
    doc = strip_html(doc)
    # lower case and remove special characters\whitespaces
    #doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    #filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    #doc = ' '.join(filtered_tokens)
    doc = ' '.join(tokens)
    return doc

In [12]:
reviews_sample['review_sample'] = reviews_sample['review'].apply(normalize_document)

In [13]:
reviews_sample

Unnamed: 0,review,sentiment,review_sample
0,I'm a Black man living in a predominantly Blac...,negative,i 'm a black man living in a predominantly bla...
1,"I saw this a good while ago, but i just cant g...",positive,"i saw this a good while ago , but i just cant ..."
2,This is one of the silliest movies I have ever...,negative,this is one of the silliest movies i have ever...
3,In the unlikely case that some aspiring direct...,negative,in the unlikely case that some aspiring direct...
4,I gave this two stars for the awesome DV shot ...,negative,i gave this two stars for the awesome dv shot ...
...,...,...,...
2995,I have wanted to see this for the longest time...,positive,i have wanted to see this for the longest time...
2996,This movie down-shifts from 4th into 1st witho...,negative,this movie down-shifts from 4th into 1st witho...
2997,This is a Japanese film but there is quite a b...,negative,this is a japanese film but there is quite a b...
2998,This is a very good movie. Do you want to know...,positive,this is a very good movie . do you want to kno...


In [14]:
X = reviews_sample['review_sample']
y = reviews_sample['sentiment']

In [15]:
max_features = 2000
Encoder = TextVectorization( max_tokens = max_features)
Encoder.adapt(X.values)

vocab = np.array(Encoder.get_vocabulary())
print(vocab[:20])

example ="This is an example to test the encoder that we just created!"
print(Encoder(example).numpy())
print(" ".join(vocab[Encoder(example).numpy()]))

['' '[UNK]' 'the' 'and' 'a' 'of' 'to' 'is' 'it' 'in' 'i' 'this' 'that' 's'
 'was' 'as' 'with' 'for' 'movie' 'but']
[  11    7   35  446    6    1    2    1   12   73   42 1088]
this is an example to [UNK] the [UNK] that we just created


In [16]:
max_features = 2000
tokenizer = Tokenizer(num_words = max_features, )
tokenizer.fit_on_texts(X.values)
X = tokenizer.texts_to_sequences(X.values)
X = pad_sequences(X, padding = 'post' ,maxlen=300)
Y = pd.get_dummies(y).values

vocab_size = len(tokenizer.word_index)+1


In [17]:
train_X, test_X, train_y, test_y = train_test_split(X,Y, test_size=0.33, random_state=23)

In [18]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((2010, 300), (990, 300), (2010, 2), (990, 2))

In [27]:
train_X

array([[ 517,  613,  225, ...,    0,    0,    0],
       [   6,    1,   69, ...,    0,    0,    0],
       [1192,   47,   98, ...,    0,    0,    0],
       ...,
       [   1,  762,   17, ...,    6,    3,  162],
       [  59,   52, 1180, ...,    1,  195,  245],
       [   9,  981,   17, ...,    0,    0,    0]], dtype=int32)

In [35]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Embedding
from keras import optimizers

In [36]:
def deep_lstm():
    embid_dim = 300
    lstm_out = 128
    model = Sequential()
    model.add(Embedding(max_features, embid_dim, input_length = X.shape[1]))
    model.add(LSTM(20, dropout=0.2, return_sequences = True))
    model.add(LSTM(20, dropout=0.2, return_sequences = True))
    model.add(LSTM(20, dropout=0.2, return_sequences = True))
    model.add(LSTM(20, return_sequences = False))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])

    return model

In [37]:
deep_lstm().summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 300, 300)          600000    
                                                                 
 lstm_13 (LSTM)              (None, 300, 20)           25680     
                                                                 
 lstm_14 (LSTM)              (None, 300, 20)           3280      
                                                                 
 lstm_15 (LSTM)              (None, 300, 20)           3280      
                                                                 
 lstm_16 (LSTM)              (None, 20)                3280      
                                                                 
 dense_4 (Dense)             (None, 2)                 42        
                                                                 
 activation_4 (Activation)   (None, 2)                

In [38]:
model = deep_lstm()
model.fit(train_X, train_y, epochs = 10, batch_size = 100, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x793ccb49eb60>

In [41]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)



In [42]:

print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))

0.4909090909090909


In [43]:
from keras.layers import Bidirectional

In [44]:
def bidirectional_lstm():
    embid_dim = 300
    lstm_out = 128
    model = Sequential()
    model.add(Embedding(max_features, embid_dim, input_length = X.shape[1]))
    model.add(Bidirectional(LSTM(20, return_sequences = False)))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])

    return model

In [46]:
model = bidirectional_lstm()
model.fit(train_X, train_y, epochs = 10, batch_size = 1, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x793cd86fcac0>

In [47]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)



In [48]:
print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))

0.6777777777777778


In [None]:
def deep_bidirectional_lstm():
    embid_dim = 300
    lstm_out = 128
    model = Sequential()
    model.add(Embedding(max_features, embid_dim, input_length = X.shape[1]))
    model.add(Bidirectional(LSTM(10, dropout=0.2,return_sequences = True)))
    model.add(Bidirectional(LSTM(10, dropout=0.2,return_sequences = True)))
    model.add(Bidirectional(LSTM(10, dropout=0.2,return_sequences = True)))
    model.add(Bidirectional(LSTM(10, dropout=0.2,return_sequences = False)))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

    return model

In [None]:
model = deep_bidirectional_lstm()
model.fit(train_X, train_y, epochs = 10, batch_size = 100, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

In [None]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)

In [None]:
print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))