In [1]:
import re
import joblib
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Download (or load from the local cache) the IMDB dataset and pool the
# "train" and "test" splits into one corpus; the notebook later draws its own
# stratified split from this pool.
dataset = load_dataset("imdb")
# list(...) materialises each column before concatenating, so `+` works
# whether column access returns a plain list or a lazy column object
# (NOTE(review): believed to differ across `datasets` releases - confirm).
texts = list(dataset['train']['text']) + list(dataset['test']['text'])
labels = list(dataset['train']['label']) + list(dataset['test']['label'])

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
def preprocess(text):
    """Normalise one raw review string for vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with
         a single space;
      3. delete every character that is not ``a``-``z`` or whitespace
         (punctuation, digits, non-ASCII letters).

    Args:
        text: the raw review as a ``str``.

    Returns:
        The cleaned string. Whitespace is kept as-is (not collapsed or
        stripped).
    """
    lowered = text.lower()
    # Must run before the character filter below, which would otherwise strip
    # the angle brackets and leave a stray "br" token in the text.
    without_breaks = re.sub(r"<br\s*/?>", " ", lowered)
    return re.sub(r"[^a-z\s]", "", without_breaks)

In [4]:
# Clean every review with preprocess(); the result is again a list of strings
# in the same order as before.
texts = list(map(preprocess, texts))

In [5]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = vectorizer.fit_transform(texts)
y = np.array(labels)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# Random-forest hyperparameters gathered in one place: 100 trees, depth capped
# at 50, fixed seed for repeatable runs, n_jobs=-1 to use all CPU cores.
rf_params = dict(n_estimators=100, max_depth=50, random_state=42, n_jobs=-1)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

In [8]:
# Predict labels for the held-out split, then compute accuracy and F1
# (f1_score with its default settings) from the same predictions.
y_pred = clf.predict(X_test)
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

In [9]:
# Report both held-out metrics, one per line, rounded to four decimals.
print(f"Test Accuracy: {acc:.4f}", f"F1 Score: {f1:.4f}", sep="\n")

Test Accuracy: 0.8478
F1 Score: 0.8499


In [10]:
# Persist the fitted TF-IDF vectorizer alongside the classifier: the forest
# only accepts vectors produced by this exact vocabulary and IDF weighting, so
# the saved model cannot score raw text without it.
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
# Kept as the last expression so the cell still displays the model file name.
joblib.dump(clf, "RF_model.joblib")

['RF_model.joblib']