In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Quick look at the raw data: read the CSV into a DataFrame and display
# its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='data/movie.csv')
df.head(n=5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
# Function to load IMDb dataset
def load_data(path='data/movie.csv'):
    """Load review texts and their labels from a CSV file.

    Parameters
    ----------
    path : str or path-like, default 'data/movie.csv'
        Location of a CSV file that has a ``text`` column and a ``label``
        column. The default preserves the original zero-argument behaviour.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The ``text`` column followed by the ``label`` column.

    Raises
    ------
    KeyError
        If the file lacks a ``text`` or ``label`` column.
    """
    df = pd.read_csv(path)
    return df['text'], df['label']

# Function to preprocess data (split into training and testing sets)
def preprocess_data(text, label, test_size=0.2, random_state=42):
    """Split texts and labels into training and testing subsets.

    Parameters
    ----------
    text : array-like
        The input documents.
    label : array-like
        The target labels, aligned with ``text``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default matches the value
        that was previously hard-coded.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the shuffle is reproducible;
        the default matches the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Unpack explicitly so callers keep receiving a 4-tuple rather than the
    # list that train_test_split itself returns.
    X_train, X_test, y_train, y_test = train_test_split(
        text, label, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# Function to vectorize text data using CountVectorizer
def vectorize_text(X_train, X_test):
    """Encode training and test texts as token-count matrices.

    A fresh ``CountVectorizer`` is fitted on ``X_train`` only; ``X_test`` is
    then encoded with that same fitted vectorizer.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, vectorizer)``; the fitted vectorizer
        is included so callers can encode additional text later.
    """
    bow = CountVectorizer()
    train_counts = bow.fit_transform(X_train)
    test_counts = bow.transform(X_test)
    return train_counts, test_counts, bow

# Function to train a Naive Bayes classifier
def train_model(X_train_vec, y_train):
    """Fit a ``MultinomialNB`` classifier on the vectorized training data.

    Returns the fitted classifier.
    """
    # fit() hands back the estimator itself, so it can be returned directly.
    return MultinomialNB().fit(X_train_vec, y_train)

# Function to evaluate the trained model
def evaluate_model(classifier, X_test_vec, y_test):
    """Score a fitted classifier against held-out data.

    Predicts labels for ``X_test_vec`` and compares them with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, report)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``report`` from ``classification_report``.
    """
    predicted = classifier.predict(X_test_vec)
    return (
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
    )

# Main function
def main():
    """Run the full pipeline: load, split, vectorize, train, evaluate, predict.

    Prints the test-set accuracy and classification report, then the
    predicted labels for two hard-coded sample sentences.
    """
    # Steps 1-2: load the texts/labels and split them into train/test parts.
    X_train, X_test, y_train, y_test = preprocess_data(*load_data())

    # Step 3: keep the fitted vectorizer, it is reused for the samples below.
    train_matrix, test_matrix, vectorizer = vectorize_text(X_train, X_test)

    # Step 4: fit the classifier on the training matrix.
    model = train_model(train_matrix, y_train)

    # Step 5: score on the held-out matrix and report.
    accuracy, report = evaluate_model(model, test_matrix, y_test)
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)

    # Ad-hoc check: encode two sample sentences with the same vectorizer
    # and show what the trained model predicts for them.
    samples = ["This movie was fantastic!", "I didn't like the plot."]
    predictions = model.predict(vectorizer.transform(samples))
    print("\nTest Text Predictions:", predictions)

# Entry point: __name__ is "__main__" both for a direct script run and inside
# a notebook cell, so executing this cell runs the full pipeline via main().
if __name__ == "__main__":
    main()

Accuracy: 0.85
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      3966
           1       0.88      0.82      0.85      4034

    accuracy                           0.85      8000
   macro avg       0.85      0.85      0.85      8000
weighted avg       0.85      0.85      0.85      8000


Test Text Predictions: [1 0]
