# 1. Imports & Setup

In [6]:
import re, unicodedata, hashlib, html, string, pandas as pd, numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix 
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras import layers 
from bs4 import BeautifulSoup 

RANDOM_SEED = 42 
np.random.seed(RANDOM_SEED) 
tf.random.set_seed(RANDOM_SEED)

In [4]:
IN_PATH = "dataset/IMDB Dataset.csv" # Kaggle file 
OUT_PATH = "dataset/imdb_clean_split.csv" # Shared artifact 

# 2. Clean Text


In [7]:
def clean_text(x):
    if not isinstance(x, str):
        return ""
    # Unescape & strip HTML
    x = html.unescape(x)
    x = BeautifulSoup(x, "lxml").get_text(separator=" ")

    # Unicode normalize + unify curly quotes to straight ones
    x = unicodedata.normalize("NFKC", x)
    x = x.replace("“", "'").replace("”", "'").replace("‘", "'").replace("’", "'").replace('"', "'")

    # Neutralize obvious artifacts
    x = re.sub(r"(https?://\S+)|(\w+\.\w+/\S+)", " ", x)
    x = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", " ", x)

    # runs of 2+ asterisks → single *
    x = re.sub(r"\{2,}", "", x)

    # collapse any run of -, – or — to a single em dash, with spacing
    x = re.sub(r"\s*[-–—]{2,}\s*", " — ", x)

    # "" → "   and   '' → '
    x = re.sub(r'([\'\"])\1+', r'\1', x)  # collapse immediate repeats
    # also clean cases with whitespace between repeated quotes: "  " → "
    x = re.sub(r'([\'"])\s+\1', r'\1', x)

    # cap !!!!! or ????? at two; dots at an ellipsis
    x = re.sub(r"([!?])\1{2,}", r"\1\1", x)   # keep at most two
    x = re.sub(r"\.{3,}", "…", x)

    # 5) Remove control chars & collapse whitespace
    x = re.sub(r"[\u0000-\u001F\u007F]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x

# 3. Read Database

In [6]:
df = pd.read_csv(IN_PATH)  # columns: review, sentiment
df["review_clean"] = df["review"].apply(clean_text)
df["label"] = (df["sentiment"].str.lower() == "positive").astype(int)

# drop null/empty data first
df = df.dropna(subset=['review_clean']).copy()  # drop rows where review_clean is null
# deduplicate (after cleaning) - keep first occurrence
def row_key(s): return hashlib.sha1(s.encode("utf-8")).hexdigest()
df["dup_key"] = df["review_clean"].apply(row_key)
df = df.drop_duplicates(subset=["dup_key"], keep='first').drop(columns=["dup_key"]).reset_index(drop=True)

# 4. Train/Validate/Test Spilt

In [7]:
X = df["review_clean"].values
y = df["label"].values

# First split: 80% train+val, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Second split: from the 80%, split into 70% train (87.5% of temp) and 10% val (12.5% of temp)
# 0.125 = 10% / 80% (to get 10% of original data)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, stratify=y_temp, random_state=42
)

# Create split column
df["split"] = "train"  # default
df.loc[df["review_clean"].isin(X_test), "split"] = "test"
df.loc[df["review_clean"].isin(X_val), "split"] = "val"

# basic EDA
print(df["split"].value_counts())
print(df["label"].value_counts())

df["n_chars"] = df["review_clean"].str.len()
df["n_words"] = df["review_clean"].str.split().apply(len)
print(df[["n_chars","n_words"]].describe(percentiles=[.5,.75,.9,.95,.99]))

df = df.rename(columns={"review_clean":"text"})
df[["text","label","split"]].to_csv(OUT_PATH, index=False)
print("Saved:",OUT_PATH)

split
train    34704
test      9916
val       4958
Name: count, dtype: int64
label
1    24882
0    24696
Name: count, dtype: int64
            n_chars       n_words
count  49578.000000  49578.000000
mean    1286.624249    229.274013
std      972.995822    169.952820
min       32.000000      4.000000
50%      954.000000    172.000000
75%     1560.000000    278.000000
90%     2532.000000    448.000000
95%     3334.000000    585.150000
99%     5100.460000    898.230000
max    13593.000000   2459.000000
Saved: dataset/imdb_clean_split.csv


# 5. TF-IDF


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer 
import pickle, math, os 

# ----- TF-IDF (fit on train only) ----- 
tfidf = TfidfVectorizer( 
    max_features=30000, # keep consistent across models for fair compare 
    ngram_range=(1, 2), 
    min_df=2, 
    max_df=0.95, 
    sublinear_tf=True, 
    lowercase=True, 
    strip_accents="unicode"
) 
tfidf.fit(X_train) 

X_train_tf = tfidf.transform(X_train).astype(np.float32) # sparse CSR 
X_val_tf = tfidf.transform(X_val).astype(np.float32) 
X_test_tf = tfidf.transform(X_test).astype(np.float32) 
N_FEATS = X_train_tf.shape[1] 
print("TF-IDF features:", N_FEATS) 

# ----- Sparse→dense per-batch generator (saves RAM) ----- 
class CSRBatchGenerator(keras.utils.Sequence):
    def __init__(self, X_csr, y=None, batch_size=256, shuffle=True, **kwargs):
        # Keras 3 wants this so fit(workers=..., use_multiprocessing=...) can flow in.
        super().__init__(**kwargs)
        self.X = X_csr.astype(np.float32)
        self.y = y
        self.bs = batch_size
        self.shuffle = shuffle
        self.idx = np.arange(self.X.shape[0])
        self.on_epoch_end()

    def __len__(self):
        import math
        return math.ceil(self.X.shape[0] / self.bs)

    def __getitem__(self, i):
        sl = slice(i*self.bs, (i+1)*self.bs)
        ii = self.idx[sl]
        Xb = self.X[ii].toarray()  # dense only for this batch
        if self.y is None:
            return Xb
        return Xb, self.y[ii]

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.idx)


TF-IDF features: 30000


# 6. Build the ANN

In [9]:
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model

inputs = layers.Input(shape=(N_FEATS,), dtype="float32")
x = layers.Dense(128, activation="relu",
                 kernel_regularizer=regularizers.l2(1e-4))(inputs)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = Model(inputs, outputs)

model.compile(optimizer=keras.optimizers.Adam(3e-4),
              loss="binary_crossentropy", metrics=["accuracy"])
model.summary()


# 7. Train

In [10]:
BATCH_SIZE = 256
EPOCHS     = 12

train_gen = CSRBatchGenerator(X_train_tf, y_train, batch_size=BATCH_SIZE, shuffle=True)
val_gen   = CSRBatchGenerator(X_val_tf,   y_val,   batch_size=BATCH_SIZE, shuffle=False)

callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3, mode="max", restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1),
]

history = model.fit(train_gen, validation_data=val_gen, epochs=EPOCHS, verbose=1)

Epoch 1/12
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 75ms/step - accuracy: 0.8282 - loss: 0.6233 - val_accuracy: 0.8737 - val_loss: 0.5256
Epoch 2/12
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 71ms/step - accuracy: 0.8913 - loss: 0.4583 - val_accuracy: 0.8923 - val_loss: 0.4150
Epoch 3/12
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 72ms/step - accuracy: 0.9085 - loss: 0.3782 - val_accuracy: 0.9006 - val_loss: 0.3694
Epoch 4/12
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 73ms/step - accuracy: 0.9206 - loss: 0.3381 - val_accuracy: 0.9056 - val_loss: 0.3473
Epoch 5/12
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 73ms/step - accuracy: 0.9283 - loss: 0.3126 - val_accuracy: 0.9066 - val_loss: 0.3346
Epoch 6/12
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 71ms/step - accuracy: 0.9354 - loss: 0.2962 - val_accuracy: 0.9086 - val_loss: 0.3261
Epoch 7/12
[1m1

# 8. Evaluate on Test

In [14]:
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    classification_report, confusion_matrix
)

test_gen = CSRBatchGenerator(X_test_tf, batch_size=512, shuffle=False)
proba = model.predict(test_gen).ravel()
y_pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
prec_w, rec_w, f1_w, _ = precision_recall_fscore_support(
    y_test, y_pred, average="weighted", zero_division=0
)

print("\n=== Final Test Metrics (Weighted) ===")
print(f"Accuracy        : {acc:.4f}")
print(f"Precision (Wgt) : {prec_w:.4f}")
print(f"Recall (Wgt)    : {rec_w:.4f}")
print(f"F1 (Weighted)   : {f1_w:.4f}")

print("\nClassification report (includes weighted avg row):")
print(classification_report(y_test, y_pred, target_names=["negative","positive"], digits=4, zero_division=0))

print("\nConfusion Matrix (rows=true, cols=pred):")
print(confusion_matrix(y_test, y_pred))

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step

=== Final Test Metrics (Weighted) ===
Accuracy        : 0.9077
Precision (Wgt) : 0.9078
Recall (Wgt)    : 0.9077
F1 (Weighted)   : 0.9077

Classification report (includes weighted avg row):
              precision    recall  f1-score   support

    negative     0.9115    0.9024    0.9069      4939
    positive     0.9041    0.9130    0.9085      4977

    accuracy                         0.9077      9916
   macro avg     0.9078    0.9077    0.9077      9916
weighted avg     0.9078    0.9077    0.9077      9916


Confusion Matrix (rows=true, cols=pred):
[[4457  482]
 [ 433 4544]]


#  9. Save model & vectorizer

In [12]:
model.save("models/ann/imdb_mlp_tfidf.keras")
with open("models/ann/tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# 10. Predict the Sentiment

In [13]:
def predict_sentiment(text: str):
    text = clean_text(text)
    Xv = tfidf.transform([text]).astype(np.float32).toarray()
    p  = model.predict(Xv, verbose=0)[0, 0]
    label = "positive" if p >= 0.5 else "negative"
    return label, float(p)

print(predict_sentiment('"Fifty Shades of Grey" was painful to sit through. The chemistry between the leads was practically nonexistent, and the dialogue often felt laughable. What was supposed to be a story about passion and complexity ended up being dull and awkward. The pacing dragged, and scenes that were meant to be intense came across as cringe-worthy. It wasn’t romantic, it wasn’t sexy, and it wasn’t dramatic—it was just boring.'))
print(predict_sentiment('"Transformers: The Last Knight" was exhausting. It felt less like a movie and more like a two-and-a-half-hour commercial for explosions and CGI robots.'))

('negative', 0.018184518441557884)
('negative', 0.2542766034603119)


In [None]:
from huggingface_hub import HfApi
import os

api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path=r"D:\ai-app\models\ann",
    repo_id="noobrs/ann-movie-sentiment",
    repo_type="model",
)


tfidf.pkl:   0%|          | 0.00/1.16M [00:00<?, ?B/s]
tfidf.pkl:  38%|███▊      | 442k/1.16M [00:00<00:00, 4.16MB/s]
tfidf.pkl:  98%|█████████▊| 1.13M/1.16M [00:00<00:00, 4.70MB/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
tfidf.pkl: 100%|██████████| 1.16M/1.16M [00:02<00:00, 508kB/s] 

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
imdb_mlp_tfidf.keras: 100%|██████████| 46.1M/46.1M [00:09<00:00, 5.12MB/s]
Upload 2 LFS files: 100%|██████████| 2/2 [00:09<00:00,  4.75s/it]


CommitInfo(commit_url='https://huggingface.co/noobrs/ann-movie-sentiment/commit/4136ecd4e05b8b8f166437273065159dc40f6799', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4136ecd4e05b8b8f166437273065159dc40f6799', pr_url=None, repo_url=RepoUrl('https://huggingface.co/noobrs/ann-movie-sentiment', endpoint='https://huggingface.co', repo_type='model', repo_id='noobrs/ann-movie-sentiment'), pr_revision=None, pr_num=None)

In [8]:
from huggingface_hub import hf_hub_download

SIDE_REPO = "noobrs/ann-movie-sentiment"  # your repo

def load_ann():
    try:
        tfidf_p = hf_hub_download(SIDE_REPO, filename="tfidf.pkl")
        ann_p   = hf_hub_download(SIDE_REPO, filename="imdb_mlp_tfidf.keras")
        import tensorflow as tf, joblib
        return {"model": tf.keras.models.load_model(ann_p),
                "tfidf": joblib.load(tfidf_p)}
    except Exception as e:
        return None
    
def predict_sentiment(text: str):
    model_data = load_ann()
    if model_data is None:
        return "error", 0.0
    model = model_data["model"]
    tfidf = model_data["tfidf"]

    text = clean_text(text)
    Xv = tfidf.transform([text]).astype(np.float32).toarray()
    p  = model.predict(Xv, verbose=0)[0, 0]
    label = "positive" if p >= 0.5 else "negative"
    return label, float(p)

print(predict_sentiment('"Fifty Shades of Grey" was painful to sit through. The chemistry between the leads was practically nonexistent, and the dialogue often felt laughable. What was supposed to be a story about passion and complexity ended up being dull and awkward. The pacing dragged, and scenes that were meant to be intense came across as cringe-worthy. It wasn’t romantic, it wasn’t sexy, and it wasn’t dramatic—it was just boring.'))
print(predict_sentiment('"Transformers: The Last Knight" was exhausting. It felt less like a movie and more like a two-and-a-half-hour commercial for explosions and CGI robots.'))

('negative', 0.018184518441557884)
('negative', 0.2542766034603119)
