In [2]:
import re
import joblib
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [4]:
dataset = load_dataset("imdb")
texts = dataset['train']['text'] + dataset['test']['text']
labels = dataset['train']['label'] + dataset['test']['label']

In [5]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"<br\s*/?>", " ", text)  # Remove HTML breaks
    text = re.sub(r"[^a-z\s]", "", text)    # Remove punctuation/numbers
    return text

In [6]:
texts = [preprocess(t) for t in texts]

In [7]:
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X = vectorizer.fit_transform(texts)
y = np.array(labels)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
clf = LogisticRegression(max_iter=300)
clf.fit(X_train, y_train)

In [10]:
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [11]:
print(f"Test Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

Test Accuracy: 0.9062
F1 Score: 0.9070


In [13]:
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(clf, "lr_model.joblib")

['lr_model.joblib']