# IMDb Movie Review Sentiment Analysis - Part 5: Predict New Reviews

## Overview
This notebook demonstrates how to use the trained model to predict sentiment for new movie reviews.

## Steps:
1. Load the trained model and its preprocessing artifacts (TF-IDF vectorizer, feature scaler, label encoder)
2. Preprocess new review text
3. Make predictions
4. Get prediction probabilities


## Step 1: Import Libraries and Define Preprocessing Functions


In [None]:
# Standard library
import pickle
import re

# Data manipulation
import numpy as np
import pandas as pd
from scipy.sparse import hstack

# Text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK data if needed.
# Maps the nltk.download() package id to the path nltk.data.find() expects.
# 'punkt_tab' is included alongside 'punkt' because recent NLTK releases load
# the Punkt tokenizer used by word_tokenize from 'punkt_tab'; checking only
# 'punkt' leaves word_tokenize raising LookupError on a fresh environment.
_NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
}

for _package_id, _resource_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # Resource is not installed locally yet; fetch it once.
        nltk.download(_package_id)

# Initialize text preprocessing components
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Normalize raw review text into lowercase, letters-only, single-spaced form.

    Missing values (anything `pd.isna` reports as missing) yield an empty
    string; any other input is converted with `str()` first.

    NOTE(review): matches are deleted rather than replaced by a space, so
    words separated only by a tag or punctuation are joined
    ("good.<br />Next" -> "goodnext"). This presumably mirrors the
    preprocessing used when the vectorizer was fitted - confirm before
    changing it.
    """
    if pd.isna(text):
        return ""

    # (pattern, flags) pairs whose matches are deleted, applied in this order.
    removal_steps = (
        (r'<[^>]+>', 0),                                # HTML tags
        (r'http\S+|www\S+|https\S+', re.MULTILINE),     # URLs
        (r'[^a-zA-Z\s]', 0),                            # anything but letters/whitespace
    )

    cleaned = str(text)
    for pattern, flags in removal_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    # Lowercase only after the letter filter, then collapse whitespace runs.
    cleaned = cleaned.lower()
    return re.sub(r'\s+', ' ', cleaned).strip()

def preprocess_text(text):
    """
    Run the full preprocessing pipeline on one review.

    The text is cleaned with `clean_text`, split with NLTK's `word_tokenize`,
    stripped of tokens found in `stop_words`, and each remaining token is
    passed through `lemmatizer.lemmatize`. The surviving tokens are returned
    as a single space-separated string.
    """
    kept_lemmas = (
        lemmatizer.lemmatize(word)
        for word in word_tokenize(clean_text(text))
        if word not in stop_words  # stop words are dropped before lemmatizing
    )
    return ' '.join(kept_lemmas)

print("Libraries and preprocessing functions loaded!")


## Step 2: Load Trained Model and Vectorizers


In [None]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # pickle.load can execute arbitrary code embedded in the file;
    # only point this at artifacts you produced or otherwise trust.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Artifacts are read from the relative 'models/' directory, so the notebook
# must be run from the directory that contains it.
tfidf_vectorizer = _load_pickle('models/tfidf_vectorizer.pkl')  # TF-IDF vectorizer
feature_scaler = _load_pickle('models/feature_scaler.pkl')      # scaler for the hand-crafted features
label_encoder = _load_pickle('models/label_encoder.pkl')        # maps encoded ids <-> sentiment labels
model = _load_pickle('models/best_model.pkl')                   # best model

print("Model and vectorizers loaded successfully!")
print(f"Model type: {type(model).__name__}")
print(f"Label classes: {label_encoder.classes_}")


## Step 3: Prediction Function


In [None]:
def predict_sentiment(review_text):
    """
    Predict sentiment for a given review text

    The review is preprocessed with `preprocess_text`, turned into TF-IDF
    features plus five scaled hand-crafted features, and passed to the
    loaded `model`. The encoded prediction is mapped back to its label with
    `label_encoder`.

    Parameters:
    -----------
    review_text : str
        The movie review text to analyze

    Returns:
    --------
    dict : Dictionary with the keys
        'sentiment'      - decoded label for the predicted class
        'prediction'     - raw (encoded) value returned by `model.predict`
        'probabilities'  - {label: probability} for every class, or None when
                           the model has no `predict_proba`
        'confidence'     - highest class probability, or None when the model
                           has no `predict_proba`
        'original_text'  - the input exactly as given
        'cleaned_text'   - the preprocessed text the features were built from
    """
    # Preprocess the text
    cleaned_text = preprocess_text(review_text)

    # Extract textual features
    # NOTE(review): these are computed on the *cleaned* text. clean_text has
    # already removed every non-letter character, so the '!' and '?' counts
    # appear to always be 0 here. Likewise avg_word_length divides by
    # word_count + 1. Both presumably mirror the feature code used at training
    # time - confirm against the training notebook before changing either,
    # because the scaler and model were fitted on that definition.
    char_count = len(cleaned_text)
    word_count = len(cleaned_text.split())
    avg_word_length = char_count / (word_count + 1) if word_count > 0 else 0
    exclamation_count = cleaned_text.count('!')
    question_count = cleaned_text.count('?')

    # Transform text using TF-IDF
    text_tfidf = tfidf_vectorizer.transform([cleaned_text])

    # Scale textual features (column order must match what the scaler was fitted on)
    textual_features = np.array([[char_count, word_count, avg_word_length,
                                  exclamation_count, question_count]])
    textual_features_scaled = feature_scaler.transform(textual_features)

    # Combine features: TF-IDF columns first, then the scaled hand-crafted ones
    features = hstack([text_tfidf, textual_features_scaled])

    # Make prediction
    prediction = model.predict(features)[0]
    sentiment = label_encoder.inverse_transform([prediction])[0]

    # Get prediction probability if available
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(features)[0]

        # Label each probability through the model's own class order instead
        # of assuming it lines up positionally with label_encoder.classes_.
        # (Assumes a scikit-learn style estimator, where predict_proba columns
        # follow model.classes_.) Fall back to the encoder order when the
        # model does not expose classes_.
        class_ids = getattr(model, 'classes_', None)
        if class_ids is None:
            class_labels = label_encoder.classes_
        else:
            class_labels = label_encoder.inverse_transform(class_ids)

        prob_dict = dict(zip(class_labels, probabilities))
        confidence = max(probabilities)
    else:
        prob_dict = None
        confidence = None

    return {
        'sentiment': sentiment,
        'prediction': prediction,
        'probabilities': prob_dict,
        'confidence': confidence,
        'original_text': review_text,
        'cleaned_text': cleaned_text
    }

print("Prediction function created!")


## Step 4: Predict Sentiment for Sample Reviews


In [None]:
# Sample reviews for testing
sample_reviews = [
    "This movie is absolutely fantastic! The acting was superb and the storyline kept me engaged throughout. Highly recommended!",
    "I was really disappointed with this film. The plot was confusing and the characters were poorly developed. Not worth watching.",
    "The movie was okay. Nothing special, but not terrible either. It's a decent watch if you have nothing else to do.",
    "Amazing cinematography and brilliant performances by all actors. This is one of the best movies I've seen this year!",
    "Terrible movie. Boring plot, bad acting, and a complete waste of time. I would not recommend this to anyone."
]

# Maximum number of review characters echoed back for each sample.
PREVIEW_CHARS = 100

print("="*80)
print("PREDICTING SENTIMENT FOR SAMPLE REVIEWS")
print("="*80)

for i, review in enumerate(sample_reviews, 1):
    result = predict_sentiment(review)

    # Append an ellipsis only when the text was actually cut, so a short
    # review is not shown as if it had been truncated.
    if len(review) > PREVIEW_CHARS:
        preview = f"{review[:PREVIEW_CHARS]}..."
    else:
        preview = review

    print(f"\nReview {i}:")
    print(f"Text: {preview}")
    print(f"Predicted Sentiment: {result['sentiment'].upper()}")
    # Probabilities are None when the model does not expose predict_proba.
    if result['probabilities']:
        print(f"Probabilities: {result['probabilities']}")
        print(f"Confidence: {result['confidence']:.2%}")
    print("-" * 80)


## Step 5: Interactive Prediction


In [None]:
# Enter your own review here.
# input() blocks until text is submitted, so this cell needs an interactive
# session; skip it when executing the notebook non-interactively.
new_review = input("Enter a movie review to analyze: ")

if not new_review.strip():
    # Nothing but whitespace was typed.
    print("No review entered.")
else:
    result = predict_sentiment(new_review)
    banner = "="*80

    print("\n" + banner)
    print("PREDICTION RESULT")
    print(banner)
    print(f"\nReview: {new_review}")
    print(f"\nPredicted Sentiment: {result['sentiment'].upper()}")

    # Probabilities are None when the model does not expose predict_proba.
    class_probabilities = result['probabilities']
    if class_probabilities:
        print(f"\nConfidence: {result['confidence']:.2%}")
        print(f"\nDetailed Probabilities:")
        for sentiment, prob in class_probabilities.items():
            print(f"  {sentiment.capitalize()}: {prob:.2%}")

    print(banner)


## Step 6: Batch Prediction from File


In [None]:
# Example: Predict sentiment for multiple reviews from a CSV file
# The helper below is only defined here; call it with your own file path
# (see the commented example at the bottom of this cell).

def predict_reviews_from_csv(input_path, output_path='predictions_results.csv',
                             text_column='review'):
    """
    Predict sentiment for every review in a CSV file and save the result.

    Parameters:
    -----------
    input_path : str
        Path of the CSV file holding the reviews
    output_path : str
        Where the CSV with the added 'predicted_sentiment' column is written
    text_column : str
        Name of the column that contains the review text

    Returns:
    --------
    pandas.DataFrame : the loaded reviews with a 'predicted_sentiment' column

    Raises:
    -------
    KeyError : if `text_column` is not a column of the input file
    """
    # Load reviews from CSV file
    reviews_df = pd.read_csv(input_path)
    if text_column not in reviews_df.columns:
        raise KeyError(
            f"Column '{text_column}' not found in {input_path}; "
            f"available columns: {list(reviews_df.columns)}"
        )

    # Make predictions and add them to the dataframe
    reviews_df['predicted_sentiment'] = [
        predict_sentiment(review)['sentiment']
        for review in reviews_df[text_column]
    ]

    # Save results
    reviews_df.to_csv(output_path, index=False)
    print(f"Predictions saved for {len(reviews_df)} reviews!")
    return reviews_df

# Uncomment and point at your own file to run a batch prediction:
# reviews_with_predictions = predict_reviews_from_csv('path/to/your/reviews.csv')


## Summary

### Key Features:
1. ✅ Load trained model and preprocessing components
2. ✅ Preprocess new review text
3. ✅ Make sentiment predictions
4. ✅ Get prediction probabilities and confidence scores
5. ✅ Support for single and batch predictions

### Usage Tips:
- The model works best with reviews similar to the training data
- Longer reviews generally provide better predictions
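- The confidence score is the highest predicted class probability, so it indicates how certain the model is about its prediction; it is only reported for models that provide `predict_proba`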
- Lower confidence scores may indicate ambiguous reviews
