In [77]:
import os
import json
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import re
from sklearn.metrics import accuracy_score
import joblib
import seaborn as sns
import matplotlib.pyplot as plt


# Download the necessary NLTK data files
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

def process_data(data: pd.DataFrame) -> pd.DataFrame: 
    data["review"] = data["review"].str.lower()
    data["review"] = data["review"].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s$!,?']", "", x))
    data["review"] = data["review"].apply(lambda x: re.sub(r"\b(not|no|never|none|nothing|nobody|nowhere|hardly|scarcely|barely) ([a-z]+)", r"\1_\2", x))
    data["review"] = data["review"].apply(word_tokenize)

    # Define a set of English stop words and remove them from the tokens
    stop_words = set(stopwords.words("english"))
    data["review"] = data["review"].apply(lambda x: [word for word in x if word not in stop_words])

    lemmatizer = WordNetLemmatizer()
    data["review"] = data["review"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    data["review"] = data["review"].apply(lambda x: " ".join(x))
    data["sentiment"] = data["sentiment"].apply(lambda x: 0 if x == "positive" else 1)
    # Remove duplicates if any
    data = data.drop_duplicates("review")

    return data 

df = process_data(pd.read_csv("IMDB Dataset.csv"))


# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# svc = LinearSVC(C = 1, loss="squared_hinge")
svc = LinearSVC(C=1, loss='hinge')
# logreg = LogisticRegression()


# Transform the review data to feature vectors
X = vectorizer.fit_transform(df['review'])
Y = df["sentiment"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

svc.fit(X_train, Y_train)
# # C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# # # Fix the param_grid (use 'classifier__C' instead of just 'C')
# # param_grid = {
# #     "C": C_values  # Prefix with 'classifier__'
# # }
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'loss': ['hinge', 'squared_hinge']
# }

# grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
# grid = GridSearchCV(svc, param_grid, refit=True, verbose=3)
# grid.fit(X, y)



pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", svc)
])

# print("Best parameters:", grid.best_params_)
# print("Best Accuracy:", grid.best_score_)


model_filename = 'movie_review_model.joblib'
joblib.dump(pipeline, model_filename)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['movie_review_model.joblib']

In [79]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
model = joblib.load('movie_review_model.joblib')

print(f"Model saved to {model_filename}")


# Predict with the trained classifier
predictions = model.named_steps["classifier"].predict(X_test)

accuracy = accuracy_score(predictions, Y_test)
print("Test Accuracy:", accuracy)


Model saved to movie_review_model.joblib
Test Accuracy: 0.899623504101116
