In [33]:
import pandas as pd

# Load the dataset
df = pd.read_json(
    "skills_assessment_data/train.json",
)
df = df.drop_duplicates()

In [34]:
df

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0


In [35]:
import nltk

In [36]:
df["text"] = df["text"].str.lower()

In [42]:
import re
df["text"] = df["text"].apply(lambda x: re.sub(r"[^a-z\s]", "", x))

In [37]:
from nltk.tokenize import word_tokenize
df["text"] = df["text"].apply(word_tokenize)

In [38]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
df["text"] = df["text"].apply(lambda x: [word for word in x if word not in stop_words])

In [39]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df["text"] = df["text"].apply(lambda x: [stemmer.stem(word) for word in x])


In [40]:
df["text"] = df["text"].apply(lambda x: " ".join(x))

In [43]:
print(df["text"].head(5))

0    bromwel high cartoon comedi  ran time program ...
1    homeless  houseless georg carlin state  issu y...
2    brilliant overact lesley ann warren  best dram...
3    easili underr film inn brook cannon  sure  fla...
4    typic mel brook film  much less slapstick movi...
Name: text, dtype: object


In [44]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer with bigrams, min_df, and max_df to focus on relevant terms
vectorizer = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))

# Fit and transform the message column
X = vectorizer.fit_transform(df["text"])

# Labels (target variable)
y = df["label"]  # Converting labels to 1 and 0


In [47]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Build the pipeline by combining vectorization and classification
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", MultinomialNB())
])


In [48]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    "classifier__alpha": [0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0]
}

# Perform the grid search with 5-fold cross-validation and the F1-score as metric
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="f1"
)

# Fit the grid search on the full dataset
grid_search.fit(df["text"], y)

# Extract the best model identified by the grid search
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)


Best model parameters: {'classifier__alpha': 1.0}


In [50]:
# Load the dataset
test_df = pd.read_json(
    "skills_assessment_data/test.json",
)
test_df = test_df.drop_duplicates()

In [51]:
import numpy as np
import re

# Preprocess function that mirrors the training-time preprocessing
def preprocess_message(message):
    message = message.lower()
    message = re.sub(r"[^a-z\s]", "", message)
    tokens = word_tokenize(message)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

In [52]:
processed_messages = [preprocess_message(msg) for msg in test_df["text"]]


In [53]:
processed_messages

['went saw movi last night coax friend mine ill admit reluct see knew ashton kutcher abl comedi wrong kutcher play charact jake fischer well kevin costner play ben randal profession sign good movi toy emot one exactli entir theater sold overcom laughter first half movi move tear second half exit theater saw mani women tear mani full grown men well tri desper let anyon see cri movi great suggest go see judg',
 'actor turn director bill paxton follow promis debut gothichorror frailti famili friendli sport drama us open young american caddi rise humbl background play bristish idol dub greatest game ever play im fan golf scrappi underdog sport flick dime dozen recent done grand effect miracl cinderella man film enthral samebr br film start creativ open credit imagin disneyfi version anim open credit hbo carnival rome lumber along slowli first bythenumb hour action move us open thing pick well paxton nice job show knack effect directori flourish love rainsoak montag action day two open prop

In [56]:
# Transform preprocessed messages into feature vectors
X_new = best_model.named_steps["vectorizer"].transform(processed_messages)

In [57]:
# Predict with the trained classifier
predictions = best_model.named_steps["classifier"].predict(X_new)
prediction_probabilities = best_model.named_steps["classifier"].predict_proba(X_new)

In [59]:

spam_probability = prediction_probabilities[0][1]  # Probability of being spam
ham_probability = prediction_probabilities[0][0]   # Probability of being not spam
    
print(f"Message: {msg}")
print(f"Spam Probability: {spam_probability:.2f}")
print(f"Not-Spam Probability: {ham_probability:.2f}")
print("-" * 50)


Message: David Bryce's comments nearby are exceptionally well written and informative as almost say everything I feel about DARLING LILI. This massive musical is so peculiar and over blown, over produced and must have caused ruptures at Paramount in 1970. It cost 22 million dollars! That is simply irresponsible. DARLING LILI must have been greenlit from a board meeting that said "hey we got that Pink Panther guy and that Sound Of Music gal... lets get this too" and handed over a blank cheque. The result is a hybrid of GIGI, ZEPPELIN, HALF A SIXPENCE, some MGM 40s song and dance numbers of a style (daisies and boaters!) so hopelessly old fashioned as to be like musical porridge, and MATA HARI dramatics. The production is colossal, lush, breathtaking to view, but the rest: the ridiculous romance, Julie looking befuddled, Hudson already dead, the mistimed comedy, and the astoundingly boring songs deaden this spectacular film into being irritating. LILI is like a twee 1940s mega musical wi

In [None]:
import joblib

# Save the trained model to a file for future use
model_filename = 'skills_assessment.joblib'
joblib.dump(best_model, model_filename)

print(f"Model saved to {model_filename}")
