In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.metrics import classification_report, accuracy_score
import nltk

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset stored on Drive can be read by path in the next cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the IMDB review dataset from the mounted Drive folder into a DataFrame.
dataset_path = '/content/drive/MyDrive/Colab Notebooks/Sem7/IMDB Dataset.csv'
data = pd.read_csv(dataset_path)


In [None]:
# Preview only the first rows; rendering the whole frame as cell output adds
# nothing for a sanity check of the loaded columns.
data.head()

In [None]:
def remove_tags(text: str) -> str:
    """Strip markup and non-letter characters from a review and lowercase it.

    Args:
        text: Raw review text, possibly containing HTML tags and URLs.

    Returns:
        The text with HTML tags and URLs replaced by a space, every character
        that is neither an ASCII letter nor whitespace removed, and the result
        lowercased. Runs of whitespace are left as-is.
    """
    # Replace HTML tags with a space rather than nothing: in "good.<br /><br />The"
    # deleting the tags would fuse the neighbouring words into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    # Replace URLs with a space for the same reason.
    text = re.sub(r'https?://\S+', ' ', text)
    # Drop everything except ASCII letters and whitespace (digits and
    # punctuation are removed too), then convert to lowercase.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return text

In [None]:
# Clean every review element-wise with remove_tags.
data['review'] = data['review'].map(remove_tags)

In [None]:
# `stop_words` was never defined anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Build it from NLTK's English stop-word list.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests

# Drop stop words from every review. The reviews were already lowercased by
# remove_tags, which matches the lowercase entries of the NLTK list.
# NOTE(review): remove_tags also strips apostrophes, so contractions such as
# "dont" will not match NLTK's "don't" entry; confirm whether that matters.
data['review'] = data['review'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)


In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources needed below: WordNet for the lemmatizer and the
# punkt models for word_tokenize.
for resource in ('wordnet', 'punkt'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize `text`, lemmatize each token, and re-join them with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Lemmatize every review element-wise.
data['review'] = data['review'].map(lemmatize_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Pull the preprocessed review texts and their sentiment labels out of the frame.
reviews = data['review'].values
labels = data['sentiment'].values

# Encode the sentiment labels as integer class ids.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

# Hold out 20% of the reviews for testing. Stratifying on the labels keeps the
# class proportions equal in both splits; the fixed seed makes the split repeatable.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    reviews,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)


In [None]:
# Hyperparameters shared by the tokenizer, padding and model cells.
vocab_size = 3000  # Tokenizer `num_words` and Embedding `input_dim`; adjust based on your dataset
embedding_dim = 100  # Embedding `output_dim` (size of each word vector)
max_length = 200  # `maxlen` every token sequence is padded/truncated to
oov_tok = "<OOV>"  # Tokenizer `oov_token`: placeholder for out-of-vocabulary words

In [None]:
# Fit the vocabulary on the training split only; the test split is merely
# transformed with the fitted tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index


def _encode_and_pad(sentences):
    """Return (integer sequences, post-padded/post-truncated array) for `sentences`."""
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return sequences, padded


train_sequences, train_padded = _encode_and_pad(train_sentences)
test_sequences, test_padded = _encode_and_pad(test_sentences)


In [None]:
# Stacked bidirectional LSTM classifier, assembled layer by layer.
model = Sequential()
# Token ids -> dense word vectors.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(24, activation='relu'))
# Single sigmoid unit: one probability for the binary sentiment label.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          300000    
                                                                 
 bidirectional (Bidirection  (None, 200, 128)          84480     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 486417 (1.86 MB)
Trainable params: 486417 

In [None]:
num_epochs = 5

# Train on the padded training sequences, holding out 10% of them for validation.
history = model.fit(
    train_padded,
    train_labels,
    epochs=num_epochs,
    validation_split=0.1,
    verbose=1,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the model on the held-out test split.
test_loss, test_acc = model.evaluate(test_padded, test_labels, verbose=1)
print("Test accuracy:", test_acc)

# Predictions: threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
predictions = model.predict(test_padded)
pred_labels = (predictions >= 0.5).astype(int).ravel().tolist()

report = classification_report(test_labels, pred_labels)
print("Classification Report:\n", report)
print("Accuracy of prediction on test set:", accuracy_score(test_labels, pred_labels))



Test accuracy: 0.8629999756813049
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.80      0.85      5000
           1       0.82      0.92      0.87      5000

    accuracy                           0.86     10000
   macro avg       0.87      0.86      0.86     10000
weighted avg       0.87      0.86      0.86     10000

Accuracy of prediction on test set: 0.863


In [None]:
# Predict sentiment for custom sentences
custom_sentences = ["The concept of movie is good but I have watched better movies",
                    "Lovely movie, the actors have acted excellently",
                    "The movie plot is terrible but it also had bad acting"]


def preprocess_review(text):
    """Apply the same preprocessing the training reviews went through.

    The tokenizer was fitted on text that had been cleaned with remove_tags,
    stripped of stop words and lemmatized. Feeding raw sentences instead turns
    every stop word into an <OOV> token the model did not see in that position
    during training, so new text must go through the same steps.

    Args:
        text: A raw review sentence.

    Returns:
        The cleaned, stop-word-free, lemmatized sentence.
    """
    cleaned = remove_tags(text)
    # Uses the same `stop_words` collection as the stop-word removal cell.
    kept_words = [word for word in cleaned.split() if word not in stop_words]
    return lemmatize_text(' '.join(kept_words))


custom_cleaned = [preprocess_review(sentence) for sentence in custom_sentences]
custom_sequences = tokenizer.texts_to_sequences(custom_cleaned)
custom_padded = pad_sequences(custom_sequences, maxlen=max_length, padding='post', truncating='post')

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
custom_predictions = model.predict(custom_padded)
custom_pred_labels = (custom_predictions >= 0.5).astype(int).ravel().tolist()

# Report the original (unprocessed) sentence next to its predicted label.
for sentence, label in zip(custom_sentences, custom_pred_labels):
    print(sentence)
    if label == 1:
        print("Predicted sentiment: Positive")
    else:
        print("Predicted sentiment: Negative")

The concept of movie is good but I have watched better movies
Predicted sentiment: Negative
Lovely movie, the actors have acted excellently
Predicted sentiment: Positive
The movie plot is terrible but it also had bad acting
Predicted sentiment: Negative
