<a href="https://www.kaggle.com/code/manikverma2/sentiment-analysis-using-imdb?scriptVersionId=242409809" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the VADER lexicon, then build the analyzer used for scoring below.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Load IMDB dataset (or any CSV of reviews); expected columns: 'review', 'sentiment'
IMDB_CSV_PATH = '/kaggle/input/imdb-dataset-sentiment-analysis/IMDB_dataset.csv'
df = pd.read_csv(IMDB_CSV_PATH)

# Optional: if using raw data without labels, use VADER
def get_sentiment_score(text):
    """Return the VADER 'compound' score for ``text``.

    Uses the module-level analyzer ``sid``; the value returned is whatever
    ``sid.polarity_scores`` stores under the 'compound' key.
    """
    scores = sid.polarity_scores(text)
    return scores['compound']

# Score every raw review, then threshold at zero: non-negative compound -> 1, negative -> 0.
df['vader_score'] = df['review'].map(get_sentiment_score)
df['label'] = df['vader_score'].map(lambda score: int(score >= 0))  # Binary label


In [None]:
def clean_text(text):
    """Normalize a raw review into lowercase, space-separated letter tokens.

    HTML tags and punctuation/digits are replaced by a space rather than
    deleted, so words on either side of a tag such as ``<br />`` or of
    unspaced punctuation ("good,bad") are not fused into one token.
    Apostrophes are dropped outright so contractions stay a single token
    ("don't" -> "dont"). Runs of whitespace are collapsed and the result
    is stripped.

    Args:
        text: review text as a ``str``.

    Returns:
        The cleaned, lowercased string (empty if nothing alphabetic remains).
    """
    text = re.sub(r'<.*?>', ' ', text)  # HTML tag -> space, not '' (avoids "great.<br />The" -> "greatthe")
    text = re.sub(r"['’]", '', text)  # keep contractions together
    text = re.sub(r'[^a-zA-Z ]', ' ', text)  # any other non-letter becomes a word boundary
    return ' '.join(text.split()).lower()

# Store the normalized text in a new column; the raw 'review' column is not modified here.
df['clean_review'] = df['review'].apply(clean_text)


In [None]:
# Vocabulary cap for the tokenizer and the fixed sequence length fed to the network.
MAX_WORDS, MAX_LEN = 10000, 200

# Build the word index from the cleaned reviews.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['clean_review'])

# Encode each review as integer ids, then pad/truncate every sequence to MAX_LEN.
sequences = tokenizer.texts_to_sequences(df['clean_review'])
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = df['label'].to_numpy()


In [None]:
# Hold out 20% of the padded sequences and labels for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Binary classifier: embedding -> single LSTM -> dropout -> one sigmoid output unit.
model = Sequential()
model.add(Embedding(MAX_WORDS, 128, input_length=MAX_LEN))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Keep the returned History object so the learning curves can be plotted afterwards.
train_history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)


In [None]:
def show_train_history(train_history, train, validation):
    """Plot a training metric and its validation counterpart against epoch.

    Args:
        train_history: object exposing a ``history`` mapping from metric name
            to a sequence of per-epoch values.
        train: key of the training metric; also used as the y-axis label.
        validation: key of the validation metric.
    """
    curves = train_history.history
    for metric_key in (train, validation):
        plt.plot(curves[metric_key])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Accuracy curves for the binary model: training vs. validation, one point per epoch.
show_train_history(train_history,'accuracy','val_accuracy')

In [None]:
# Loss curves for the binary model: training vs. validation, one point per epoch.
show_train_history(train_history,'loss','val_loss')

In [None]:
# Score the model on the held-out split; the two returned values are unpacked as loss and accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2%}")


In [None]:
# Write the VADER-derived label text to its own column so the dataset's
# original 'sentiment' column (if present) is not overwritten.
df['vader_sentiment'] = df['label'].map({1: 'Positive', 0: 'Negative'})

# na=False: a missing review counts as "not superhero" instead of producing a NaN group key.
df['is_superhero'] = df['review'].str.contains(
    'superhero|marvel|avengers|batman', case=False, na=False
)

# Per-group share of each sentiment; fill_value=0 keeps a category that is
# absent from one group as an explicit zero rather than NaN.
sentiment_trend = (
    df.groupby('is_superhero')['vader_sentiment']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .rename_axis(columns='sentiment')
)

ax = sentiment_trend.plot(kind='bar', stacked=True, colormap='coolwarm')
ax.set_title('Sentiment Distribution for Superhero vs. Non-Superhero Movies')
ax.set_ylabel('Proportion')
# Derive tick labels from the groups actually present so a single-group
# result is still labelled correctly.
ax.set_xticklabels(
    ['Superhero' if flag else 'Non-Superhero' for flag in sentiment_trend.index],
    rotation=0,
)
plt.show()


Multiclass


In [None]:
def classify_sentiment(score):
    """Bucket a compound score into three classes.

    Returns:
        1 (positive) when ``score >= 0.05``, -1 (negative) when
        ``score <= -0.05``, and 0 (neutral) for anything in between.
    """
    if score >= 0.05:
        return 1  # Positive
    return -1 if score <= -0.05 else 0  # Negative / Neutral

# Scoring every review with VADER is the slow step; reuse the 'vader_score'
# column when it already exists and only compute it if it is missing.
if 'vader_score' not in df.columns:
    df['vader_score'] = df['review'].apply(get_sentiment_score)

# Replace 'label' with the three-class encoding (-1 / 0 / 1).
df['label'] = df['vader_score'].apply(classify_sentiment)


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Convert -1, 0, 1 to 0, 1, 2.
# Fit on the explicit class list so the mapping (-1 → 0, 0 → 1, 1 → 2) holds
# even if one class happens to be absent from the data.
label_encoder = LabelEncoder()
label_encoder.fit([-1, 0, 1])
y_encoded = label_encoder.transform(df['label'])
y_cat = to_categorical(y_encoded, num_classes=3)


In [None]:
# Re-split using the one-hot targets; this rebinds X_train/X_test/y_train/y_test from the binary section.
# Same test_size and random_state as the earlier split — presumably the same rows land in each partition.
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)


In [None]:
# Three-class classifier: same backbone as the binary model, softmax over 3 outputs.
model = Sequential()
model.add(Embedding(MAX_WORDS, 128, input_length=MAX_LEN))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # 3 classes

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)


In [None]:
# Score the three-class model on the held-out split; rebinds `loss` and `accuracy`.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Multiclass Accuracy: {accuracy:.2%}")


In [None]:
# Example prediction: class probabilities for the first five held-out reviews
pred_probs = model.predict(X_test[:5])
pred_classes = pred_probs.argmax(axis=1)  # index of the most probable class per row

# Map back to [-1, 0, 1]
decoded_preds = label_encoder.inverse_transform(pred_classes)
print("Predicted Sentiment Classes:", decoded_preds)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions: collapse probabilities and one-hot targets to class ids.
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Display names, in encoded-id order (0, 1, 2).
labels = ['Negative', 'Neutral', 'Positive']
class_ids = list(range(len(labels)))

# Confusion Matrix
# Passing labels= pins the matrix to 3x3 even if a class never appears in
# y_true or y_pred, so the axis names below always line up.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Sentiment Classification')
plt.show()

# Classification Report
# labels= keeps target_names valid when a class is missing; zero_division=0
# reports 0 without a warning for a class that is never predicted.
print(classification_report(y_true, y_pred, labels=class_ids, target_names=labels, zero_division=0))


In [None]:
# Use original labels (-1, 0, 1) and attach a readable category name to each row
sentiment_labels = {-1: "Negative", 0: "Neutral", 1: "Positive"}
df['sentiment_category'] = df['label'].map(sentiment_labels)

# Bars are drawn in the dictionary's order: Negative, Neutral, Positive.
category_order = list(sentiment_labels.values())

plt.figure(figsize=(6, 4))
sns.countplot(x='sentiment_category', data=df, order=category_order, palette='coolwarm')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.title('Sentiment Distribution in IMDB Reviews')
plt.show()


HYPERPARAMETER TUNING

In [None]:
# Parameters to experiment with.
# Each constant holds the current setting; the trailing comment lists alternatives to try.
EMBEDDING_DIM = 128  # Embedding vector size. Try: 100, 200
LSTM_UNITS = 64      # LSTM layer width. Try: 32, 128
DROPOUT_RATE = 0.5   # Dropout fraction. Try: 0.3, 0.6
BATCH_SIZE = 128     # Samples per gradient step. Try: 64, 256
EPOCHS = 5           # Training epochs. Try: 10 or use EarlyStopping


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop when val_loss has not improved for 3 epochs, restoring the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Halve the learning rate when val_loss has not improved for 2 epochs.
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)

# NOTE(review): fit() is called on whichever `model` is already in scope, so this presumably
# continues training the previously fitted network rather than starting fresh — confirm intent.
model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    callbacks=[early_stop, lr_schedule]
)


In [None]:
from tensorflow.keras.layers import Bidirectional

# Deeper three-class model: embedding -> bidirectional LSTM -> LSTM -> dense head.
# Sizes come from the tuning constants (EMBEDDING_DIM, LSTM_UNITS, DROPOUT_RATE)
# so that editing them actually changes the architecture; with the current
# values the layer sizes are the same as before (128 / 64 / 0.5 / 32).
model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN),
    # return_sequences=True so the second LSTM receives the full sequence.
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=True)),
    Dropout(DROPOUT_RATE),
    LSTM(LSTM_UNITS // 2),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(3, activation='softmax')
])


In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Read the training schedule from the tuning constants instead of repeating
# the literals; EPOCHS and BATCH_SIZE currently equal the previous values (5, 128).
model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2)

In [None]:
# Score the model currently bound to `model` on the held-out split; rebinds `loss` and `accuracy`.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Multiclass Accuracy: {accuracy:.2%}")


In [None]:
# Example prediction: class probabilities for the first five held-out reviews
pred_probs = model.predict(X_test[:5])
pred_classes = pred_probs.argmax(axis=1)  # index of the most probable class per row

# Map back to [-1, 0, 1]
decoded_preds = label_encoder.inverse_transform(pred_classes)
print("Predicted Sentiment Classes:", decoded_preds)

In [None]:
import pickle

# Persist the trained network.
model.save("sentiment_model.h5")

# Save tokenizer and label encoder
for pickle_path, artifact in (("tokenizer.pkl", tokenizer), ("label_encoder.pkl", label_encoder)):
    with open(pickle_path, "wb") as f:
        pickle.dump(artifact, f)
