In [4]:
%reload_ext autoreload
%autoreload 2

import os
import sys
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import find_dotenv, load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay

# Load environment variables from the .env file located by find_dotenv(), then
# add the project folder to Python's module search path.
# PROJECT_FOLDER may be absent from the environment (os.getenv returns None),
# and this cell may be re-run, so only append a real path that is not already
# present on sys.path.
load_dotenv(find_dotenv())
project_folder = os.getenv("PROJECT_FOLDER")
if not project_folder:
    print("PROJECT_FOLDER is not set; project modules will not be importable.", file=sys.stderr)
elif project_folder not in sys.path:
    sys.path.append(project_folder)

# CountVectorizer + MultinomialNB

In [9]:
""" MODEL TRAINING """
# Train a CountVectorizer + MultinomialNB pipeline on the processed review
# data, score it on the train/validation/test splits, and record the
# parameters, metrics and fitted model in an MLflow run.
mlflow.set_tracking_uri("./mlruns")  # Set tracking server (aka file)
mlflow.set_experiment("Sentiment Classification")  # Set experiment
run_name = "countvec-multinomialnb-run"  # set run name
artifact_path = "countvec-multinomialnb"  # set artifact path

# load Data
df = pd.read_csv("../data/processed/sample-clean-data.csv")

# Encode the target. Series.map yields NaN for any label missing from the
# mapping, so reject unexpected labels here with a clear message instead of
# letting NaN targets flow into the split and the classifier.
label_map = {"positive": 1, "negative": 0}
raw_labels = df['sentiment']
encoded_labels = raw_labels.map(label_map)
if encoded_labels.isna().any():
    unexpected = sorted(raw_labels[encoded_labels.isna()].astype(str).unique())
    raise ValueError(
        f"Unexpected sentiment labels (expected 'positive' or 'negative'): {unexpected}"
    )
df['sentiment'] = encoded_labels

# Split Data: 70% train, then the remaining 30% halved into validation and test
X, y = df.loc[:, ~df.columns.isin(['sentiment'])], df[['sentiment']]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, stratify=y_test, test_size=0.5, shuffle=True, random_state=42)

# Modelling
text_col = 'preprocessed_review_text'
params = {}  # means using default params
model = Pipeline([
    ('CountVectorizer', CountVectorizer()),
    ('MultinomialNB', MultinomialNB(**params))
])
model.fit(X_train[text_col], y_train['sentiment'])
# NOTE(review): the pipeline is fitted on a Series, while the signature is
# inferred from a one-column DataFrame and the label frame — confirm this
# matches how the logged model will be called.
signature = mlflow.models.infer_signature(X_train[[text_col]], y_train)

# Evaluate Model: predict once per split and reuse the predictions for every
# score (keys keep the "<split>_<metric>_score" naming and original order).
eval_splits = {
    "training": (X_train, y_train),
    "validation": (X_val, y_val),
    "test": (X_test, y_test),
}
scorers = {"precision": precision_score, "recall": recall_score, "f1": f1_score}
metrics = {}
for split_name, (X_split, y_split) in eval_splits.items():
    y_pred = model.predict(X_split[text_col])
    for score_name, scorer in scorers.items():
        metrics[f"{split_name}_{score_name}_score"] = scorer(y_split['sentiment'], y_pred)

with mlflow.start_run(run_name=run_name):
    # log hyperparameters
    mlflow.log_params(params)

    # log metrics
    mlflow.log_metrics(metrics)

    # Log model
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        signature=signature
    )

2024/02/29 17:20:34 INFO mlflow.tracking.fluent: Experiment with name 'Sentiment Classification' does not exist. Creating a new experiment.


# TfidfVectorizer + MultinomialNB

In [10]:
""" MODEL TRAINING """
# Train a TfidfVectorizer + MultinomialNB pipeline on the processed review
# data, score it on the train/validation/test splits, and record the
# parameters, metrics and fitted model in an MLflow run.
mlflow.set_tracking_uri("./mlruns")  # Set tracking server (aka file)
mlflow.set_experiment("Sentiment Classification")  # Set experiment
run_name = "tfidf-multinomialnb-run"  # set run name
artifact_path = "tfidf-multinomialnb"  # set artifact path

# load Data
df = pd.read_csv("../data/processed/sample-clean-data.csv")

# Encode the target. Series.map yields NaN for any label missing from the
# mapping, so reject unexpected labels here with a clear message instead of
# letting NaN targets flow into the split and the classifier.
label_map = {"positive": 1, "negative": 0}
raw_labels = df['sentiment']
encoded_labels = raw_labels.map(label_map)
if encoded_labels.isna().any():
    unexpected = sorted(raw_labels[encoded_labels.isna()].astype(str).unique())
    raise ValueError(
        f"Unexpected sentiment labels (expected 'positive' or 'negative'): {unexpected}"
    )
df['sentiment'] = encoded_labels

# Split Data: 70% train, then the remaining 30% halved into validation and test
X, y = df.loc[:, ~df.columns.isin(['sentiment'])], df[['sentiment']]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, stratify=y_test, test_size=0.5, shuffle=True, random_state=42)

# Modelling
text_col = 'preprocessed_review_text'
params = {}  # means using default params
model = Pipeline([
    ('TfidfVectorizer', TfidfVectorizer()),
    ('MultinomialNB', MultinomialNB(**params))
])
model.fit(X_train[text_col], y_train['sentiment'])
# NOTE(review): the pipeline is fitted on a Series, while the signature is
# inferred from a one-column DataFrame and the label frame — confirm this
# matches how the logged model will be called.
signature = mlflow.models.infer_signature(X_train[[text_col]], y_train)

# Evaluate Model: predict once per split and reuse the predictions for every
# score (keys keep the "<split>_<metric>_score" naming and original order).
eval_splits = {
    "training": (X_train, y_train),
    "validation": (X_val, y_val),
    "test": (X_test, y_test),
}
scorers = {"precision": precision_score, "recall": recall_score, "f1": f1_score}
metrics = {}
for split_name, (X_split, y_split) in eval_splits.items():
    y_pred = model.predict(X_split[text_col])
    for score_name, scorer in scorers.items():
        metrics[f"{split_name}_{score_name}_score"] = scorer(y_split['sentiment'], y_pred)

with mlflow.start_run(run_name=run_name):
    # log hyperparameters
    mlflow.log_params(params)

    # log metrics
    mlflow.log_metrics(metrics)

    # Log model
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        signature=signature
    )

