In [2]:
%reload_ext autoreload
%autoreload 2

import os
import sys
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import find_dotenv, load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay

# Make the project package importable: load variables from the .env file
# located by find_dotenv(), then add PROJECT_FOLDER to the module search path.
load_dotenv(find_dotenv())
project_folder = os.getenv("PROJECT_FOLDER")
# os.getenv returns None when the variable is unset; skip that case, and avoid
# stacking duplicate entries each time this cell is re-run.
if project_folder and project_folder not in sys.path:
    sys.path.append(project_folder)

# CountVectorizer + MultinomialNB

In [7]:
# Scratch check: half of 0.3. Presumably the share that each of validation and
# test receives when a 0.3 hold-out is split 50/50 (TODO confirm intent).
0.3 / 2

0.15

In [18]:
import logging
import pandas as pd
from typing import Union, Tuple

class Logger:
    """Small helper that configures root logging and hands out a named logger."""

    def __init__(self, logger_name: str) -> None:
        """Configure the root logger at INFO level and remember the logger name.

        Args:
            logger_name (str): name of the logger returned by `get_logger`
        """
        logging.basicConfig(
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            level=logging.INFO,
        )
        self.logger_name = logger_name

    def get_logger(self) -> logging.Logger:
        """Look up the logger registered under `logger_name`.

        The logger is also stored on the instance as `self.logger`.

        Returns:
            logging.Logger: logger named `self.logger_name`
        """
        named_logger = logging.getLogger(self.logger_name)
        self.logger = named_logger
        return named_logger


import logging
import sys
import os
import pandas as pd
from typing import Union, Tuple
from dotenv import find_dotenv, load_dotenv
from sklearn.model_selection import train_test_split

# Load variables from the .env file located by find_dotenv() and append
# PROJECT_FOLDER to sys.path so that the `src` package below can be imported.
load_dotenv(find_dotenv())
sys.path.append(os.getenv("PROJECT_FOLDER"))
# NOTE(review): this import rebinds `Logger`, shadowing the `Logger` class
# defined earlier in this cell; confirm which implementation is intended.
from src.logger import Logger


class DataSplitting:
    """Stratified train / validation / test splitter for a labelled DataFrame.

    A hold-out fraction (`test_size`) is first separated from the training data
    and then divided equally into validation and test sets, e.g. `test_size=0.3`
    yields a 70% / 15% / 15% split.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        test_size: float = 0.3,
        target_col: str = "sentiment",
        random_state: int = 42,
    ):
        """Initialize `DataSplitting` class

        Args:
            df (pd.DataFrame): dataset holding the feature columns and the target column
            test_size (float): proportion held out from training; it is split
                evenly between validation and test
            target_col (str): name of the label column, used as `y` and for stratification
            random_state (int): seed passed to both `train_test_split` calls
        """
        self.df = df
        self.test_size = test_size
        self.target_col = target_col
        self.random_state = random_state
        self.logger = Logger(__name__).get_logger()

    def train_val_test_split(self) -> Tuple[pd.DataFrame, ...]:
        """Split data into train, validation and test

        Returns:
            Tuple[pd.DataFrame, ...]: six DataFrames in the order
                `X_train, X_val, X_test, y_train, y_val, y_test`; each `y_*`
                is a single-column DataFrame holding the target column

        Raises:
            KeyError: if the target column is not present in the dataset
        """
        self.logger.info("Splitting Data ...")
        # Fail early with an explicit message instead of deep inside pandas indexing.
        if self.target_col not in self.df.columns:
            raise KeyError(
                f"Target column '{self.target_col}' not found in columns: "
                f"{list(self.df.columns)}"
            )
        X = self.df.drop(columns=[self.target_col])
        y = self.df[[self.target_col]]  # double brackets keep y as a DataFrame
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X,
            y,
            stratify=y,
            test_size=self.test_size,
            shuffle=True,
            random_state=self.random_state,
        )  # split main data into training and hold-out
        X_val, X_test, y_val, y_test = train_test_split(
            X_holdout,
            y_holdout,
            stratify=y_holdout,
            test_size=0.5,
            shuffle=True,
            random_state=self.random_state,
        )  # split hold-out data evenly into validation and test
        return X_train, X_val, X_test, y_train, y_val, y_test

In [19]:
""" MODEL TRAINING """
# MLflow tracking is disabled for this CountVectorizer baseline; the
# configuration is kept commented out so it can be re-enabled.
# mlflow.set_tracking_uri("./mlruns")  # Set tracking server (aka file)
# mlflow.set_experiment("Sentiment Classification")  # Set experiment
# run_name = "countvec-multinomialnb-run"  # set run name
# artifact_path = "countvec-multinomialnb"  # set artifact path

# Load data
df = pd.read_csv("../data/processed/sample-clean-data.csv")
# NOTE(review): unlike the TF-IDF cell, `sentiment` is not mapped to 0/1 here;
# confirm the CSV already stores numeric labels.

# Split data: the 0.3 hold-out is halved by DataSplitting into validation and
# test, i.e. 70% train / 15% validation / 15% test, stratified on the target.
X_train, X_val, X_test, y_train, y_val, y_test = DataSplitting(df, 0.3).train_val_test_split()

# Modelling
params = {}  # means using default params
model = Pipeline([
    ('CountVectorizer', CountVectorizer()),
    ('MultinomialNB', MultinomialNB(**params))
])
model.fit(X_train['preprocessed_review_text'], y_train['sentiment'])
# signature = mlflow.models.infer_signature(X_train[['preprocessed_review_text']], y_train)

# Evaluate model: predict once per split and reuse the predictions for every metric.
scorers = {
    "precision_score": precision_score,
    "recall_score": recall_score,
    "f1_score": f1_score,
}
splits = {
    "training": (X_train, y_train),
    "validation": (X_val, y_val),
    "test": (X_test, y_test),
}
metrics = {}
for split_name, (X_split, y_split) in splits.items():
    y_pred = model.predict(X_split['preprocessed_review_text'])
    for scorer_name, scorer in scorers.items():
        # Keys look like "training_precision_score", "validation_f1_score", ...
        metrics[f"{split_name}_{scorer_name}"] = scorer(y_split['sentiment'], y_pred)
print(metrics)

# with mlflow.start_run(run_name=run_name):
#     # log hyperparameters
#     mlflow.log_params(params)

#     # log metrics
#     mlflow.log_metrics(metrics)

#     # Log model
#     mlflow.sklearn.log_model(
#         sk_model=model,
#         artifact_path=artifact_path,
#         signature=signature
#     )

2024-03-01 16:56:36,981 - __main__ - INFO - Splitting Data ...


{'training_precision_score': 0.8267810551452704, 'training_recall_score': 0.9096482265362721, 'training_f1_score': 0.866237316406431, 'validation_precision_score': 0.7695234281137365, 'validation_recall_score': 0.8724177071509648, 'validation_f1_score': 0.8177465687839132, 'test_precision_score': 0.7806267806267806, 'test_recall_score': 0.8710263396911898, 'test_f1_score': 0.8233526507834299}


In [16]:
# Display the training features produced by the split for a quick visual check.
X_train

Unnamed: 0,preprocessed_review_text
19627,red blue stamp fine green like lime true green...
15287,realize buy 200 dollar programming tool make w...
11857,maddening ford deal reese pre-install reese ba...
29952,light small work fine although speaker small w...
47042,good quality long lead happy purchase
...,...
13217,1st one received wrong car putting aside quali...
17016,original receiver came order stopped working m...
44025,pretty sturdy stencil nice image look forward ...
12723,light weight compact picture show round disc f...


# TfidfVectorizer + MultinomialNB

In [10]:
""" MODEL TRAINING """
mlflow.set_tracking_uri("./mlruns")  # Set tracking server (aka file)
mlflow.set_experiment("Sentiment Classification")  # Set experiment
run_name = "tfidf-multinomialnb-run"  # set run name
artifact_path = "tfidf-multinomialnb"  # set artifact path

# Load data
df = pd.read_csv("../data/processed/sample-clean-data.csv")

# Encode labels. Series.map turns any value missing from the mapping into NaN,
# so fail here with a clear message rather than later inside the split or fit.
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0})
if df['sentiment'].isna().any():
    raise ValueError(
        "Column 'sentiment' contains values other than 'positive'/'negative'; "
        "cannot encode labels to 1/0."
    )

# Split data with the shared helper (same stratified 70/15/15 split, seed 42,
# as the CountVectorizer cell) instead of repeating the two train_test_split calls.
X_train, X_val, X_test, y_train, y_val, y_test = DataSplitting(df, 0.3).train_val_test_split()

# Modelling
params = {}  # means using default params
model = Pipeline([
    ('TfidfVectorizer', TfidfVectorizer()),
    ('MultinomialNB', MultinomialNB(**params))
])
model.fit(X_train['preprocessed_review_text'], y_train['sentiment'])
# NOTE(review): the signature is inferred from a one-column DataFrame and the
# true labels, while the pipeline is fit on (and predicts from) the text
# Series; confirm this matches how the logged model will be called.
signature = mlflow.models.infer_signature(X_train[['preprocessed_review_text']], y_train)

# Evaluate model: predict once per split and reuse the predictions for every metric.
scorers = {
    "precision_score": precision_score,
    "recall_score": recall_score,
    "f1_score": f1_score,
}
splits = {
    "training": (X_train, y_train),
    "validation": (X_val, y_val),
    "test": (X_test, y_test),
}
metrics = {}
for split_name, (X_split, y_split) in splits.items():
    y_pred = model.predict(X_split['preprocessed_review_text'])
    for scorer_name, scorer in scorers.items():
        # Keys look like "training_precision_score", "validation_f1_score", ...
        metrics[f"{split_name}_{scorer_name}"] = scorer(y_split['sentiment'], y_pred)

with mlflow.start_run(run_name=run_name):
    # log hyperparameters
    mlflow.log_params(params)

    # log metrics
    mlflow.log_metrics(metrics)

    # Log model
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        signature=signature
    )

