In [9]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [10]:

def load_imdb_data(base_path, subset='train'):
    data = []
    labels = []
    for label_type in ['pos', 'neg']:
        dir_path = os.path.join(base_path, subset, label_type)
        for file_name in os.listdir(dir_path):
            if file_name.endswith(".txt"):
                with open(os.path.join(dir_path, file_name), 'r', encoding='utf-8') as f:
                    data.append(f.read())
                    labels.append(label_type)
    return pd.DataFrame({'review': data, 'sentiment': labels})

# Load train and test data
train_df = load_imdb_data("aclImdb", subset='train')
test_df = load_imdb_data("aclImdb", subset='test')

# Combine both if needed
df = pd.concat([train_df, test_df], ignore_index=True)

In [15]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review,label
0,Bromwell High is a cartoon comedy. It ran at t...,pos,bromwell high is a cartoon comedy it ran at th...,1
1,Homelessness (or Houselessness as George Carli...,pos,homelessness or houselessness as george carlin...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,pos,brilliant overacting by lesley ann warren best...,1
3,This is easily the most underrated film inn th...,pos,this is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,pos,this is not the typical mel brooks film it was...,1


In [11]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove symbols
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

df['cleaned_review'] = df['review'].apply(clean_text)


In [12]:
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'pos' else 0)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_review'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

In [27]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000, stop_words="english")
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [28]:
print("Training model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vect, y_train)

Training model...


In [29]:
y_pred = model.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

Accuracy: 0.8838

Classification Report:
               precision    recall  f1-score   support

    Negative       0.89      0.87      0.88      5000
    Positive       0.88      0.90      0.89      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [30]:
joblib.dump(model, "sentiment_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("Model and vectorizer saved!")

Model and vectorizer saved!


In [31]:
def predict_sentiment(text):
    text = clean_text(text)
    vector = vectorizer.transform([text])
    prediction = model.predict(vector)[0]
    return "Positive" if prediction == 1 else "Negative"

In [32]:

print("\nTest Prediction:")
print(predict_sentiment("This movie was absolutely fantastic and thrilling!"))


Test Prediction:
Positive
