In [49]:
import os
import json
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import re
from sklearn.metrics import accuracy_score
import joblib


# Fetch the NLTK tokenizer models and stop-word list required by the
# preprocessing below. Each call is a no-op when the package is already
# up to date locally.
for nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_resource)

def process_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean and normalize a reviews frame so it can be vectorized.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Normalize ``text``: lowercase, strip every character that is not a
         lowercase letter, whitespace, ``$`` or ``!``, tokenize, remove
         English stop words, Porter-stem the remaining tokens and re-join
         them with single spaces.
      3. Remap ``label``: 1 becomes 0 and every other value becomes 1.
      4. Drop rows that became duplicates after normalization.

    Args:
        data: Frame with a string ``text`` column and a ``label`` column.

    Returns:
        A new DataFrame with the same columns. The input frame is not
        modified, and surviving rows keep their original index labels.
    """
    # Inside a character class "$" and "!" are literals, so both survive
    # the cleanup alongside letters and whitespace.
    disallowed_chars = re.compile(r"[^a-z\s$!]")
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    def normalize(text: str) -> str:
        """Lowercase, strip, tokenize, filter stop words and stem one document."""
        tokens = word_tokenize(disallowed_chars.sub("", text.lower()))
        # Stop words are matched against the unstemmed token, then the
        # survivors are stemmed.
        return " ".join(
            stemmer.stem(token) for token in tokens if token not in stop_words
        )

    deduped = data.drop_duplicates()

    # `assign` builds a new frame instead of writing columns onto the result
    # of `drop_duplicates`, keeping the caller's data untouched.
    processed = deduped.assign(
        text=deduped["text"].apply(normalize),
        label=deduped["label"].apply(lambda x: 0 if x == 1 else 1),
    )

    # The original called `data.drop_duplicates()` here and discarded the
    # result, so rows that collapsed to the same text/label after
    # normalization were never removed. Return the de-duplicated frame.
    return processed.drop_duplicates()

# Load and preprocess the training reviews.
df = process_data(pd.read_json("skills_assessment_data/train.json"))
y = df["label"]

# TF-IDF features feeding a logistic-regression classifier. The step names
# are unchanged so existing lookups via `pipeline.named_steps[...]` keep working.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
logreg = LogisticRegression()
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", logreg),
])

# Fit through the pipeline rather than fitting each step by hand and wrapping
# them afterwards: the vectorizer is fitted on the text and the classifier on
# its output in a single call, so the two steps cannot get out of sync.
# `vectorizer` and `logreg` are the same objects held by the pipeline, so they
# are fitted as well.
pipeline.fit(df["text"], y)

# Persist the fitted pipeline (vectorizer + classifier) as one artifact.
# Kept as the last expression so the notebook displays the written path.
model_filename = 'movie_review_model.joblib'
joblib.dump(pipeline, model_filename)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oscarotterstad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['movie_review_model.joblib']

In [50]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
# Reload the persisted pipeline from the same path variable the training cell
# wrote to, instead of repeating the filename as a separate literal.
# NOTE: joblib.load unpickles arbitrary Python objects - only load files
# that were produced by a trusted source.
model = joblib.load(model_filename)

print(f"Model saved to {model_filename}")

# The held-out reviews go through the same preprocessing as the training data.
df_test = process_data(pd.read_json("skills_assessment_data/test.json"))
y_test = df_test["label"]

# Let the pipeline vectorize and classify in one call rather than reaching
# into its individual steps; the predictions are the same, and the
# evaluation no longer depends on the pipeline's internal step names.
predictions = model.predict(df_test["text"])

accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy:", accuracy)


Model saved to movie_review_model.joblib
Test Accuracy: 0.8743195838877464
