In [None]:
import os
import json
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import re
import joblib


# Fetch the NLTK data files needed by the tokenizer and stop-word list used below
for nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_resource)

def process_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the review text and remap the labels for model training/evaluation.

    Steps applied to every row of the ``text`` column, in order:
    lower-casing, removal of every character that is not ``a-z``, whitespace,
    ``$`` or ``!``, tokenization, English stop-word removal, Porter stemming,
    and re-joining the tokens with single spaces.

    The ``label`` column is remapped so that an original value of 1 becomes 0
    and every other value becomes 1. The prediction cells in this notebook
    report class 1 as "Negative".
    # NOTE(review): confirm the label convention against the dataset description.

    Args:
        data: Frame with a string ``text`` column and a ``label`` column.

    Returns:
        A new frame with exact duplicate rows removed and the ``text`` and
        ``label`` columns transformed as described. The caller's frame is not
        modified, and the original index labels of the kept rows are preserved.
    """
    # Remove duplicates if any. The explicit copy matters: when rows are
    # actually dropped, pandas flags the result as a slice of the input and
    # the column assignments below would emit SettingWithCopyWarning.
    data = data.drop_duplicates().copy()

    # Built once per call and shared by every row
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    def _clean(text: str) -> str:
        """Normalize a single review into a space-joined string of stems."""
        text = re.sub(r"[^a-z\s$!]", "", text.lower())
        tokens = word_tokenize(text)
        return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

    # Single pass over the column instead of five chained .apply calls
    data["text"] = data["text"].apply(_clean)
    data["label"] = data["label"].apply(lambda x: 0 if x == 1 else 1)
    return data

# Load the training set and run it through the cleaning routine
df = process_data(pd.read_json("skills_assessment_data/train.json"))

# Bag-of-words features over unigrams and bigrams; max_df=0.9 discards terms
# that occur in more than 90% of the documents
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.9)

# Document-term matrix of the cleaned text column.
# NOTE: the grid search below is fitted on df["text"] directly, not on X.
X = vectorizer.fit_transform(df["text"])

y = df["label"]

pipeline = Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", MultinomialNB()),
])

print(df.head())

# Candidate smoothing values (alpha) for the Naive Bayes classifier
param_grid = {"classifier__alpha": [0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0]}

# 5-fold cross-validated grid search, using the F1-score as the selection metric
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring="f1", cv=5)

# Run the search on the full training frame
grid_search.fit(df["text"], y)

# Keep the best pipeline found by the search
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)

# Persist the fitted pipeline so it can be reloaded later
model_filename = 'movie_review_model.joblib'
joblib.dump(best_model, model_filename)

print(f"Model saved to {model_filename}")

# Hand-written reviews used as a quick sanity check of the trained model
new_reviews = [
    "The movie did not entertain me. I barely felt any joy watching this.",
    "Star Wars always get a kick out of me. I really loved this movie.",
    "Super boring movie.",
    "Classy, elegant, joyful. Home Alone always get me in the christmas spirit.",
    "The CGI in The Hobit was disgusting! How could they go from Lord of The Rings to this?",
]

# Preprocess function that mirrors the training-time preprocessing
def preprocess_reviews(review):
    """Normalize one raw review string the same way the training text was cleaned.

    The review is lower-cased, stripped of every character that is not
    ``a-z``, whitespace, ``$`` or ``!``, tokenized, filtered against the
    English stop-word list, Porter-stemmed and re-joined with single spaces.

    Args:
        review: Raw review text (str).

    Returns:
        The cleaned review as a single space-separated string.
    """
    # The stop-word set and stemmer are created here (once, then cached on the
    # function object) rather than read from module globals: they only exist
    # as locals inside process_data, so relying on globals raises NameError
    # on a freshly started kernel.
    cached = getattr(preprocess_reviews, "_nlp_resources", None)
    if cached is None:
        cached = (set(stopwords.words("english")), PorterStemmer())
        preprocess_reviews._nlp_resources = cached
    english_stop_words, porter = cached

    cleaned = re.sub(r"[^a-z\s$!]", "", review.lower())
    kept = [word for word in word_tokenize(cleaned) if word not in english_stop_words]
    return " ".join(porter.stem(word) for word in kept)

processed_reviews = [preprocess_reviews(msg) for msg in new_reviews]

# Vectorize with the vocabulary learned during training, then score with the
# tuned classifier
X_new = best_model.named_steps["vectorizer"].transform(processed_reviews)
classifier = best_model.named_steps["classifier"]
predictions = classifier.predict(X_new)
prediction_probabilities = classifier.predict_proba(X_new)

# predict_proba columns are ordered like classifier.classes_, so look the
# columns up by label instead of assuming their position.
class_order = list(classifier.classes_)
positive_column = class_order.index(0)  # label 0 is reported as "Positive"
negative_column = class_order.index(1)  # label 1 is reported as "Negative"

# Display the prediction and class probabilities for each evaluated review
for i, msg in enumerate(new_reviews):
    prediction = "Negative" if predictions[i] == 1 else "Positive"
    positive_probability = prediction_probabilities[i][positive_column]
    negative_probability = prediction_probabilities[i][negative_column]

    print(f"Message: {msg}")
    print(f"Prediction: {prediction}")
    print(f"Positive Probability: {positive_probability:.2f}")
    print(f"Negative Probability: {negative_probability:.2f}")
    print("-" * 50)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  label
0  bromwel high cartoon comedi ran time program s...      1
1  homeless houseless georg carlin state issu yea...      1
2  brilliant overact lesley ann warren best drama...      1
3  easili underr film inn brook cannon sure flaw ...      1
4  typic mel brook film much less slapstick movi ...      1
Best model parameters: {'classifier__alpha': 1.0}
Model saved to movie_review_model.joblib
Message: The movie did not entertain me. I barely felt any joy watching this.
Prediction: Positive
Positive Probability: 0.66
Negative Probability: 0.34
--------------------------------------------------
Message: Star Wars always get a kick out of me. I really loved this movie.
Prediction: Positive
Positive Probability: 1.00
Negative Probability: 0.00
--------------------------------------------------
Message: Super boring movie.
Prediction: Negative
Positive Probability: 0.01
Negative Probability: 0.99
----------------------------------------

In [33]:
from sklearn.metrics import accuracy_score

# Reload the persisted pipeline and evaluate *that* object, so the cell checks
# the saved artifact and does not depend on `best_model` still being in memory.
# NOTE: joblib.load unpickles the file; only load model files you trust.
model = joblib.load('movie_review_model.joblib')
df_test = process_data(pd.read_json("skills_assessment_data/test.json"))
X_test = model.named_steps["vectorizer"].transform(df_test["text"])
y_test = df_test["label"]
# Predict with the reloaded classifier
predictions = model.named_steps["classifier"].predict(X_test)
prediction_probabilities = model.named_steps["classifier"].predict_proba(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.8483932099512116
