<a href="https://colab.research.google.com/github/panthi03/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Load and Preprocess Dataset
data = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')

Mounted at /content/drive


In [None]:
def preprocess_data(data):
    # Clean text: remove HTML, non-alphabet characters, lowercase, and remove stopwords
    stop_words = set(stopwords.words('english'))
    data['review'] = data['review'].apply(lambda x: re.sub('<.*?>', '', str(x)))
    data['review'] = data['review'].apply(lambda x: re.sub('[^A-Za-z]', ' ', x))
    data['review'] = data['review'].apply(lambda x: ' '.join(word.lower() for word in x.split() if word not in stop_words))
    return data

data = preprocess_data(data)

In [None]:
# Encode Labels: 1 for positive, 0 for negative
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)

In [None]:
# Tokenize and Pad Sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [None]:
def get_max_length():
  review_length = []
  for review in x_train:
    review_length.append(len(review))
  return int(np.ceil(np.mean(review_length)))

# Define padding length based on mean length of reviews
max_length = get_max_length()
x_train = pad_sequences(x_train, maxlen=max_length, padding='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post')
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Define Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, max_length))
print(model.summary())

None


In [None]:
# Callbacks for Checkpointing and Early Stopping
checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

In [None]:
# Train the Model
history = model.fit(
    x_train, y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_test, y_test),
    callbacks=[checkpoint, early_stopping]
)

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step - accuracy: 0.5551 - loss: 0.6805
Epoch 1: val_accuracy improved from -inf to 0.67120, saving model to best_model.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 251ms/step - accuracy: 0.5552 - loss: 0.6804 - val_accuracy: 0.6712 - val_loss: 0.6251
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step - accuracy: 0.6811 - loss: 0.6111
Epoch 2: val_accuracy improved from 0.67120 to 0.69220, saving model to best_model.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 245ms/step - accuracy: 0.6811 - loss: 0.6111 - val_accuracy: 0.6922 - val_loss: 0.6045
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step - accuracy: 0.7715 - loss: 0.5126
Epoch 3: val_accuracy improved from 0.69220 to 0.82620, saving model to best_model.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s

In [None]:
# Evaluate the Model
y_pred = (model.predict(x_test, batch_size=128) >= 0.5).astype(int).reshape(-1)
correct_predictions = np.sum(y_pred == y_test)
total_predictions = len(y_test)
accuracy = correct_predictions / total_predictions * 100

print(f"Correct Predictions: {correct_predictions}")
print(f"Wrong Predictions: {total_predictions - correct_predictions}")
print(f"Accuracy: {accuracy:.2f}%")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 126ms/step
Correct Predictions: 8610
Wrong Predictions: 1390
Accuracy: 86.10%


In [None]:
from tensorflow.keras.models import load_model

# Load Saved Model
loaded_model = load_model('best_model.keras')

In [None]:
review = str(input('Movie Review: '))

Movie Review: Nothing was typical about this. Everything was beautifully done in this movie, the story, the flow, the scenario, everything. I highly recom mend it for mystery lovers, for anyone who wants to watch a good movie!


In [None]:
english_stops = set(stopwords.words('english'))

# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)
words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]
print('Filtered: ', filtered)

Cleaned:  Nothing was typical about this Everything was beautifully done in this movie the story the flow the scenario everything I highly recom mend it for mystery lovers for anyone who wants to watch a good movie
Filtered:  ['nothing typical everything beautifully done movie story flow scenario everything i highly recom mend mystery lovers anyone wants watch good movie']


In [None]:
tokenize_words = tokenizer.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length,
padding='post', truncating='post')
print(tokenize_words)

[[   77   691   172  1167   129     3    13  2734  2563   172     1   445
  23067   709  1790   153   397    33     9     3     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]


In [None]:
result = loaded_model.predict(tokenize_words)
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 571ms/step
[[0.85426843]]


In [None]:
if result >= 0.7:
  print('positive')
else:
  print('negative')

positive
