In [3]:
%%time
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load and preprocess dataset
df = pd.read_csv("IMDBDataset_50000.csv").rename(columns={"review": "text", "sentiment": "label"})
df['label'] = df['label'].map({"positive": 1, "negative": 0})
df['text'] = df['text'].str.lower().apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Define and fit the vectorizer globally
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
X = vectorizer.fit_transform(df['text'])
y = df['label']

# Train model using Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.89
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.89      6157
           1       0.88      0.91      0.89      6343

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500

CPU times: total: 1min 6s
Wall time: 1min 3s


In [14]:
# Function to predict sentiment and confidence for multiple reviews
def predict_bulk_sentiments(reviews):
    cleaned_reviews = [re.sub(r'[^\w\s]', '', review.lower()) for review in reviews]  # Inline preprocessing
    transformed_reviews = vectorizer.transform(cleaned_reviews)  # Uses the global vectorizer
    
    # Get prediction probabilities
    probabilities = model.predict_proba(transformed_reviews)
    
    # Get predicted class (1 = Positive, 0 = Negative)
    predictions = model.predict(transformed_reviews)
    
    # Convert predictions to readable labels
    results = ["Positive" if pred == 1 else "Negative" for pred in predictions]
    
    # Print results with confidence
    for review, sentiment, prob in zip(reviews, results, probabilities):
        confidence = max(prob) * 100  # Convert to percentage
        print(f"Review: {review}")
        print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.2f}%)\n")

# List of new reviews to test
new_reviews = [
    "The movie was an absolute masterpiece!",
    "I hated this film. It was boring and predictable.",
    "A fantastic experience! I loved every second of it.",
    "Not worth the time. Completely disappointing.",
    "This was one of the best performances I've seen in a while.",
    "Terrible script and bad acting ruined the whole experience.",
    "A visually stunning film with a great story.",
    "The worst movie I have ever seen, total waste of time!",
    "It was okay, not the best, but not the worst either.",
    "An unforgettable journey through emotions. Loved it!",
    "Not the best movie to be honest",
    "Nothing good with this movie",
    "I like this movie, I think it is worth to watch",
    "Totally recommend this"
]

# Run bulk predictions
predict_bulk_sentiments(new_reviews)


Review: The movie was an absolute masterpiece!
Predicted Sentiment: Positive (Confidence: 64.68%)

Review: I hated this film. It was boring and predictable.
Predicted Sentiment: Negative (Confidence: 96.71%)

Review: A fantastic experience! I loved every second of it.
Predicted Sentiment: Positive (Confidence: 96.15%)

Review: Not worth the time. Completely disappointing.
Predicted Sentiment: Negative (Confidence: 74.35%)

Review: This was one of the best performances I've seen in a while.
Predicted Sentiment: Positive (Confidence: 96.50%)

Review: Terrible script and bad acting ruined the whole experience.
Predicted Sentiment: Negative (Confidence: 98.54%)

Review: A visually stunning film with a great story.
Predicted Sentiment: Positive (Confidence: 96.57%)

Review: The worst movie I have ever seen, total waste of time!
Predicted Sentiment: Negative (Confidence: 99.84%)

Review: It was okay, not the best, but not the worst either.
Predicted Sentiment: Negative (Confidence: 86.80%)



In [15]:
import pickle

# Save the trained model
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved!")


Model and vectorizer saved!
