In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
!nvidia-smi

Sun Mar 31 06:50:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0              30W /  70W |    647MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
import tensorflow as tf
import tensorflow_hub as tf_hub
print(tf.__version__)
print(tf_hub.__version__)

2.15.0
0.16.1


In [6]:
# tf.test.is_gpu_available() is deprecated; query the physical device list instead.
print(bool(tf.config.list_physical_devices('GPU')))
print(tf.test.gpu_device_name())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True
/device:GPU:0


In [7]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
#!pip install --upgrade keras

In [9]:
import pandas as pd
import numpy as np
import nltk

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization



In [10]:

reviews = pd.read_csv('https://raw.githubusercontent.com/nursnaaz/GoogletoChatgpt/main/05.%20Building%20Text%20Classification/Sentiment%20Analysis/movie_reviews.csv')

In [11]:
reviews.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [12]:
reviews.sample(3000).sentiment.value_counts()

positive    1528
negative    1472
Name: sentiment, dtype: int64

In [13]:
# Seed the draw so the 3000-review subsample, and the vocabulary and models
# derived from it, are the same after every kernel restart.
reviews_sample = reviews.sample(3000, random_state=23)

In [14]:
reviews_sample.sentiment.value_counts()

negative    1500
positive    1500
Name: sentiment, dtype: int64

In [15]:
# Throw away the row labels inherited from the full dataset and renumber from 0.
reviews_sample = reviews_sample.reset_index(drop=True)

In [16]:
reviews_sample

Unnamed: 0,review,sentiment
0,This was a cute movie until the ending. The en...,negative
1,This movie was excellent. It details the strug...,positive
2,"This is not a horror film, but a boring sex mo...",negative
3,I don't know how anyone could hate this movie....,positive
4,"The Patriot is a well thought out, well produc...",positive
...,...,...
2995,I think this has the potential of being the be...,positive
2996,Riding Giants is a brilliant documentary that ...,positive
2997,This was on the Saturday before Halloween this...,positive
2998,"I, myself am a kid at heart, meaning I love wa...",positive


In [17]:

import nltk
import re
from bs4 import BeautifulSoup

stop_words = nltk.corpus.stopwords.words('english')


def strip_html(doc):
    """Return the text content of ``doc`` with HTML markup removed.

    The document is parsed with BeautifulSoup's ``html.parser`` backend and
    the plain text is extracted with ``get_text()``.
    """
    return BeautifulSoup(doc, "html.parser").get_text()


def normalize_document(doc):
    """Normalize one raw review into a single space-separated token string.

    Steps, in order: strip HTML via ``strip_html``, lower-case, trim
    surrounding whitespace, tokenize with ``nltk.word_tokenize`` and re-join
    the tokens with single spaces. Stop words and special characters are
    kept; no filtering is applied.
    """
    cleaned = strip_html(doc).lower().strip()
    return ' '.join(nltk.word_tokenize(cleaned))

In [18]:
reviews_sample['review_sample'] = reviews_sample['review'].apply(normalize_document)

  soup = BeautifulSoup(doc,"html.parser")


In [19]:
reviews_sample

Unnamed: 0,review,sentiment,review_sample
0,This was a cute movie until the ending. The en...,negative,this was a cute movie until the ending . the e...
1,This movie was excellent. It details the strug...,positive,this movie was excellent . it details the stru...
2,"This is not a horror film, but a boring sex mo...",negative,"this is not a horror film , but a boring sex m..."
3,I don't know how anyone could hate this movie....,positive,i do n't know how anyone could hate this movie...
4,"The Patriot is a well thought out, well produc...",positive,"the patriot is a well thought out , well produ..."
...,...,...,...
2995,I think this has the potential of being the be...,positive,i think this has the potential of being the be...
2996,Riding Giants is a brilliant documentary that ...,positive,riding giants is a brilliant documentary that ...
2997,This was on the Saturday before Halloween this...,positive,this was on the saturday before halloween this...
2998,"I, myself am a kid at heart, meaning I love wa...",positive,"i , myself am a kid at heart , meaning i love ..."


In [20]:
X = reviews_sample['review_sample']
y = reviews_sample['sentiment']

In [21]:
max_features = 2000
Encoder = TextVectorization(max_tokens=max_features)
Encoder.adapt(X.values)

vocab = np.array(Encoder.get_vocabulary())
print(vocab[:20])

# Encode one sentence a single time, then show both the integer ids and the
# vocabulary entries those ids map back to.
example = "This is an example to test the encoder that we just created!"
example_ids = Encoder(example).numpy()
print(example_ids)
print(" ".join(vocab[example_ids]))

['' '[UNK]' 'the' 'and' 'a' 'of' 'to' 'is' 'in' 'it' 'i' 'this' 'that' 's'
 'was' 'as' 'for' 'with' 'movie' 'but']
[  11    7   35  509    6    1    2    1   12   70   45 1258]
this is an example to [UNK] the [UNK] that we just created


In [22]:
max_features = 2000
# Read the texts from the DataFrame column rather than from `X`: this cell
# rebinds `X` to the padded id matrix, so reading `X.values` here would fail
# on a second run of the cell.
texts = reviews_sample['review_sample'].values
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
X = pad_sequences(tokenizer.texts_to_sequences(texts), padding='post', maxlen=300)
Y = pd.get_dummies(y).values

# Sized from the full fitted word index, plus one row for index 0.
vocab_size = len(tokenizer.word_index) + 1


In [23]:
train_X, test_X, train_y, test_y = train_test_split(X,Y, test_size=0.33, random_state=23)

In [24]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((2010, 300), (990, 300), (2010, 2), (990, 2))

In [25]:
train_X

array([[1581,   16,    3, ...,    0,    0,    0],
       [   5,  477,  709, ...,    8,   75,   23],
       [  10,   13,    3, ...,    0,    0,    0],
       ...,
       [  24,  236,  123, ...,    0,    0,    0],
       [ 156,    1,  119, ...,    0,    0,    0],
       [  10,   19,   52, ...,    0,    0,    0]], dtype=int32)

In [26]:
from tqdm import tqdm
embedding_vector = {}
# Each line holds a token followed by its space-separated coefficients.
# The context manager closes the file handle when loading finishes, and the
# encoding is stated explicitly instead of relying on the platform default.
with open('/content/drive/MyDrive/NLP/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        value = line.rstrip('\n').split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype='float32')
        embedding_vector[word] = coef

400000it [00:33, 12070.54it/s]


In [28]:
# One row per tokenizer index; words with no GloVe vector keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in tqdm(tokenizer.word_index.items()):
    if token in embedding_vector:
        embedding_matrix[row] = embedding_vector[token]

100%|██████████| 32268/32268 [00:00<00:00, 198041.85it/s]


In [29]:
embedding_matrix.shape

(32269, 300)

In [30]:
vocab_size

32269

In [31]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Embedding
from keras import optimizers

In [32]:
def deep_lstm():
    """Build and compile a four-layer stacked LSTM classifier.

    Reads the notebook globals ``vocab_size``, ``X`` and ``embedding_matrix``.
    The embedding layer is initialised from ``embedding_matrix`` and frozen.
    It feeds three sequence-returning LSTM(20) layers with input dropout 0.2,
    a final LSTM(20) that returns only its last output, and a 2-unit softmax.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    embid_dim = 300

    model = Sequential()
    # NOTE(review): the inputs are post-padded with 0 by pad_sequences and this
    # layer does not set mask_zero, so the LSTMs also step over the padding.
    # Consider mask_zero=True if padding should be ignored.
    model.add(Embedding(vocab_size, embid_dim, input_length=X.shape[1],
                        weights=[embedding_matrix], trainable=False))
    model.add(LSTM(20, dropout=0.2, return_sequences=True))
    model.add(LSTM(20, dropout=0.2, return_sequences=True))
    model.add(LSTM(20, dropout=0.2, return_sequences=True))
    model.add(LSTM(20, return_sequences=False))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

In [35]:
deep_lstm().summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 300, 300)          9680700   
                                                                 
 lstm_8 (LSTM)               (None, 300, 20)           25680     
                                                                 
 lstm_9 (LSTM)               (None, 300, 20)           3280      
                                                                 
 lstm_10 (LSTM)              (None, 300, 20)           3280      
                                                                 
 lstm_11 (LSTM)              (None, 20)                3280      
                                                                 
 dense_2 (Dense)             (None, 2)                 42        
                                                                 
 activation_2 (Activation)   (None, 2)                

In [36]:
model = deep_lstm()
model.fit(train_X, train_y, epochs = 1, batch_size = 1, verbose = 1)



<keras.src.callbacks.History at 0x7db2eaf3c7f0>

In [47]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)



In [48]:

print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))

0.5262626262626262


In [51]:
from keras.layers import Bidirectional

In [52]:
def bidirectional_lstm(trainable=False):
    """Build and compile a single-layer bidirectional LSTM classifier.

    Reads the notebook globals ``vocab_size``, ``X`` and ``embedding_matrix``.

    Args:
        trainable: Whether the embedding weights, initialised from
            ``embedding_matrix``, are updated during training. Defaults to
            ``False`` (frozen embeddings).

    Returns:
        A compiled ``Sequential`` model: Embedding -> Bidirectional(LSTM(20))
        -> Dense(2) -> softmax, with Adam, categorical cross-entropy loss and
        the accuracy metric.
    """
    embid_dim = 300

    model = Sequential()
    # NOTE(review): the inputs are post-padded with 0 by pad_sequences and this
    # layer does not set mask_zero, so the LSTM also steps over the padding.
    model.add(Embedding(vocab_size, embid_dim, input_length=X.shape[1],
                        weights=[embedding_matrix], trainable=trainable))
    model.add(Bidirectional(LSTM(20, return_sequences=False)))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

In [53]:
model = bidirectional_lstm()
model.fit(train_X, train_y, epochs = 1, batch_size = 3, verbose = 1)



<keras.src.callbacks.History at 0x7ee22f66b130>

In [37]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)



In [38]:
print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))

0.48787878787878786


In [56]:
def deep_bidirectional_lstm():
    """Build and compile a four-layer stacked bidirectional LSTM classifier.

    Reads the notebook globals ``vocab_size``, ``X`` and ``embedding_matrix``.
    The embedding layer is initialised from ``embedding_matrix`` and frozen.
    It feeds four Bidirectional(LSTM(10)) layers with input dropout 0.2; the
    first three return full sequences and the last returns only its final
    output, which goes to a 2-unit softmax.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    embid_dim = 300

    model = Sequential()
    model.add(Embedding(vocab_size, embid_dim, input_length=X.shape[1],
                        weights=[embedding_matrix], trainable=False))
    model.add(Bidirectional(LSTM(10, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(10, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(10, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(10, dropout=0.2, return_sequences=False)))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    # Categorical cross-entropy matches the one-hot labels and 2-unit softmax,
    # and keeps this builder consistent with the other models in the notebook.
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

In [57]:
model = deep_bidirectional_lstm()
model.fit(train_X, train_y, epochs = 1, batch_size = 3, verbose = 1)



In [58]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)



In [59]:
print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))

0.5666666666666667


In [60]:
def bidirectional_lstm(trainable=True):
    """Build and compile a single-layer bidirectional LSTM classifier.

    Reads the notebook globals ``vocab_size``, ``X`` and ``embedding_matrix``.

    Args:
        trainable: Whether the embedding weights, initialised from
            ``embedding_matrix``, are updated during training. Defaults to
            ``True`` (embeddings are fine-tuned).

    Returns:
        A compiled ``Sequential`` model: Embedding -> Bidirectional(LSTM(20))
        -> Dense(2) -> softmax, with Adam, categorical cross-entropy loss and
        the accuracy metric.
    """
    embid_dim = 300

    model = Sequential()
    model.add(Embedding(vocab_size, embid_dim, input_length=X.shape[1],
                        weights=[embedding_matrix], trainable=trainable))
    model.add(Bidirectional(LSTM(20, return_sequences=False)))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

In [64]:
model = bidirectional_lstm()
model.fit(train_X, train_y, epochs = 1, batch_size = 3, verbose = 1)



<keras.src.callbacks.History at 0x7ee21450f4c0>

In [65]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)



In [66]:
print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))

0.6828282828282828


In [67]:
from gensim.models import KeyedVectors
filename = '/content/drive/MyDrive/NLP/word2vec-google-news-300.bin'
w2v_pretrained_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [68]:
# Seeded generator so the random rows for words missing from word2vec are
# identical on every run.
oov_rng = np.random.default_rng(23)
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tqdm(tokenizer.word_index.items()):
    try:
        # Lookup raises KeyError for a word the pretrained model does not have.
        embedding_matrix[i] = w2v_pretrained_model[word]
    except KeyError:
        # Fall back to a random vector drawn from a normal distribution with
        # mean 0 and standard deviation sqrt(0.25).
        embedding_matrix[i] = oov_rng.normal(0, np.sqrt(0.25), 300)

100%|██████████| 31626/31626 [00:00<00:00, 102798.68it/s]


In [73]:
def bidirectional_lstm(trainable=True):
    """Build and compile a single-layer bidirectional LSTM classifier.

    Reads the notebook globals ``vocab_size``, ``X`` and ``embedding_matrix``.

    Args:
        trainable: Whether the embedding weights, initialised from
            ``embedding_matrix``, are updated during training. Defaults to
            ``True`` (embeddings are fine-tuned).

    Returns:
        A compiled ``Sequential`` model: Embedding -> Bidirectional(LSTM(20))
        -> Dense(2) -> softmax, with Adam, categorical cross-entropy loss and
        the accuracy metric.
    """
    embid_dim = 300

    model = Sequential()
    # Take the input length from the padded matrix rather than a literal, as
    # the other model builders in this notebook do.
    model.add(Embedding(vocab_size, embid_dim, input_length=X.shape[1],
                        weights=[embedding_matrix], trainable=trainable))
    model.add(Bidirectional(LSTM(20, return_sequences=False)))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    adam = optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

In [74]:
model = bidirectional_lstm()
model.fit(train_X, train_y, epochs = 3, batch_size = 3, verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7ee21bada530>

In [75]:
y_pred = model.predict(test_X)
y_test_ = np.argmax(y_pred, axis = 1)



In [76]:
print(accuracy_score(np.argmax(test_y, axis = 1), y_test_))

0.7707070707070707
