In [4]:
# ------------------------------------------------------
# IMPORTS
# ------------------------------------------------------
import numpy as np
import pandas as pd
import re

# ML imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# ------------------------------------------------------
# LOAD INBUILT IMDB DATASET
# ------------------------------------------------------
# We use Keras built-in IMDB dataset (no CSV needed)
from keras.datasets import imdb

# Fetch both IMDB splits with the vocabulary capped via num_words=10000;
# each review is delivered as a sequence of integer word indices.
train_split, test_split = imdb.load_data(num_words=10000)
X_train_raw, y_train_raw = train_split
X_test_raw, y_test_raw = test_split

print("Inbuilt IMDB dataset loaded!")
for split_label, split_reviews in (("Train size:", X_train_raw), ("Test size:", X_test_raw)):
    print(split_label, len(split_reviews))


# ------------------------------------------------------
# DECODE INTEGER SEQUENCES TO TEXT
# IMDB gives word indices, so we convert them back to words.
# ------------------------------------------------------
# Keras provides a word -> index vocabulary; flip it into index -> word so
# that encoded reviews can be turned back into text.
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_review(encoded_review, index_from=3, unknown="?", index_to_word=None):
    """Turn a sequence of integer word ids back into a space-joined string.

    Parameters
    ----------
    encoded_review : iterable of int
        Word ids of a single review.
    index_from : int, default 3
        Offset subtracted from every id before the vocabulary lookup. The
        default reproduces the original hard-coded ``i - 3`` and should equal
        the ``index_from`` used when the data was loaded.
    unknown : str, default "?"
        Placeholder emitted for any id that has no vocabulary entry once the
        offset has been removed.
    index_to_word : mapping of int to str, optional
        Vocabulary used for the lookup. When omitted, the module-level
        ``reverse_word_index`` is used, exactly as before.

    Returns
    -------
    str
        The decoded words joined by single spaces; an empty string for an
        empty review.
    """
    lookup = reverse_word_index if index_to_word is None else index_to_word
    return " ".join(
        lookup.get(token_id - index_from, unknown) for token_id in encoded_review
    )

# Decode every integer-encoded review of both splits into plain text.
X_train_text = list(map(decode_review, X_train_raw))
X_test_text = list(map(decode_review, X_test_raw))

# Stack train and test end to end so the data resembles one CSV-style table.
X_total = [*X_train_text, *X_test_text]
y_total = np.concatenate([y_train_raw, y_test_raw])

# Tabulate, then keep only the first 5000 rows so training stays fast.
df = pd.DataFrame({"review": X_total, "sentiment": y_total}).iloc[:5000]

print("\nDataFrame created from inbuilt dataset.")
print(df.head())


# ------------------------------------------------------
# REMOVE HTML TAGS
# ------------------------------------------------------
# Non-greedy pattern: a "<", then as few characters as possible, then ">".
_HTML_TAG_PATTERN = re.compile('<.*?>')


def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted."""
    return _HTML_TAG_PATTERN.sub('', text)

def _drop_br_tokens(text):
    """Drop standalone ``br`` tokens and re-join the rest with single spaces."""
    return " ".join(token for token in text.split() if token != "br")


# remove_html only deletes "<...>" spans, but the decoded reviews carry the
# line-break tag as bare "br" tokens (e.g. "... of my life br br i picked ..."),
# so the regex alone leaves that markup residue in the text and it would end
# up as a vocabulary feature. Strip those tokens as part of the HTML clean-up.
df['review'] = df['review'].apply(remove_html).apply(_drop_br_tokens)


# ------------------------------------------------------
# REMOVE STOPWORDS (using sklearn)
# ------------------------------------------------------
# `stop_words` stays a list, the type this name has always had in the notebook.
stop_words = list(ENGLISH_STOP_WORDS)

# Membership tests against a list scan it linearly for every word of every
# review; a frozenset gives the same answers with a constant-time lookup.
_stop_word_lookup = frozenset(stop_words)


def _strip_stop_words(text):
    """Return ``text`` without stop words, matched case-insensitively.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces; the words themselves keep their original casing.
    """
    return " ".join(
        word for word in text.split() if word.lower() not in _stop_word_lookup
    )


df['review'] = df['review'].apply(_strip_stop_words)

print("\nCleaned Review Example:\n", df['review'].iloc[0])


# ------------------------------------------------------
# SPLIT DATA
# ------------------------------------------------------
# Features are the cleaned review texts, targets are the sentiment labels.
X, y = df['review'], df['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nTraining samples:", len(X_train))
print("Testing samples:", len(X_test))


# ------------------------------------------------------
# BAG OF WORDS
# ------------------------------------------------------
# Word-count features capped at 10,000 vocabulary terms. The vocabulary is
# fitted on the training reviews and then reused, unchanged, for the test set.
cv = CountVectorizer(max_features=10000)
train_counts = cv.fit_transform(X_train)
test_counts = cv.transform(X_test)

# Densify both matrices; the models below are fitted on these dense arrays.
X_train_cv, X_test_cv = train_counts.toarray(), test_counts.toarray()

print("\nBOW Shape:", X_train_cv.shape)


# ------------------------------------------------------
# MODEL 1: NAIVE BAYES
# ------------------------------------------------------
# Fit Gaussian Naive Bayes on the dense counts, then score the held-out set.
gnb = GaussianNB().fit(X_train_cv, y_train)
y_pred_gnb = gnb.predict(X_test_cv)
gnb_accuracy = accuracy_score(y_test, y_pred_gnb)

print("\nNaive Bayes Accuracy:", gnb_accuracy)


# ------------------------------------------------------
# MODEL 2: SVM (RBF)
# ------------------------------------------------------
# Support vector classifier with its default settings, scored on the test set.
svc = SVC().fit(X_train_cv, y_train)
y_pred_svc = svc.predict(X_test_cv)
svc_accuracy = accuracy_score(y_test, y_pred_svc)

print("SVM Accuracy:", svc_accuracy)


# ------------------------------------------------------
# MODEL 3: RANDOM FOREST
# ------------------------------------------------------
# Random forests are stochastic; without a seed the accuracy printed below
# changes on every run. Reuse the seed already used for the train/test split
# so that re-running the notebook reproduces the same number.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_cv, y_train)
y_pred_rf = rf.predict(X_test_cv)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


# ------------------------------------------------------
# N-GRAM FEATURES (1-gram + 2-gram)
# ------------------------------------------------------
# Unigram + bigram counts, still capped at 10,000 features. The vocabulary is
# fitted on the training reviews only and reused for the test reviews.
cv_ngram = CountVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_ng = cv_ngram.fit_transform(X_train).toarray()
X_test_ng = cv_ngram.transform(X_test).toarray()

# Seed the forest so the reported accuracy is reproducible across runs; an
# unseeded forest gives a different score each time the cell is executed.
rf_ng = RandomForestClassifier(random_state=42)
rf_ng.fit(X_train_ng, y_train)
y_pred_ng = rf_ng.predict(X_test_ng)

print("\nRandom Forest (N-Grams) Accuracy:", accuracy_score(y_test, y_pred_ng))


Inbuilt IMDB dataset loaded!
Train size: 25000
Test size: 25000
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step

DataFrame created from inbuilt dataset.
                                              review  sentiment
0  ? this film was just brilliant casting locatio...          1
1  ? big hair big boobs bad music and a giant saf...          0
2  ? this has to be one of the worst films of the...          0
3  ? the ? ? at storytelling the traditional sort...          1
4  ? worst mistake of my life br br i picked this...          0

Cleaned Review Example:
 ? film just brilliant casting location scenery story direction everyone's really suited played just imagine robert ? amazing actor director ? father came scottish island loved fact real connection film witty remarks film great just brilliant bought film soon released ? recommend watch fly fishing a