In [1]:
import setuptools
import mlflow
import logging
import time
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

import dagshub
import mlflow.sklearn as ms
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import scipy.sparse as ss
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# ========================== CONFIGURATION ==========================
CONFIG = {
    "data_path": r"F:\MLOps Projects\Drug-Review-Project-MLFlow-\notebooks\data.csv",
    "test_size": 0.25,
    "mlflow_tracking_uri": "https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow",
    "dagshub_repo_owner": "nafiul-araf",
    "dagshub_repo_name": "Drug-Review-Project-MLFlow-",
    "experiment_name": "BoW vs TF-IDF"
}


# ========================== FEATURE ENGINEERING ==========================
VECTORIZERS = {
    'BoW': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
}


# ========================== ALGORITHMS ==========================
ALGORITHMS = {
    'LogisticRegression': LogisticRegression(),
    'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
    'MultinomialNB': MultinomialNB(),
    'XGBoost': XGBClassifier(),
    'RandomForest': RandomForestClassifier()
}

In [3]:
mlflow.set_tracking_uri(CONFIG["mlflow_tracking_uri"])

dagshub.init(repo_owner=CONFIG["dagshub_repo_owner"], repo_name=CONFIG["dagshub_repo_name"], mlflow=True)

mlflow.set_experiment(CONFIG["experiment_name"])

<Experiment: artifact_location='mlflow-artifacts:/58e077b7763e4fe2a6d5a8f0258c1224', creation_time=1746869944075, experiment_id='1', last_update_time=1746869944075, lifecycle_stage='active', name='BoW vs TF-IDF', tags={}>

In [4]:
def transform_text(text):
    "Transform the text by text lowering, tokenizing, removing special characters, stopwords, punctuations and stemming."
    
    # Lower case the text
    text = text.lower()

    # Tokenize the text
    text = nltk.word_tokenize(text)

    # Remove special characters
    text = [word for word in text if word.isalnum()]

    # Remove stopwords and punctuations
    text = [word for word in text if word not in stopwords.words('english') and word not in string.punctuation]

    # Stemming the texts
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text]

    # Join the tokens back into a single string
    return ' '.join(text)

In [5]:
def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        df = df[['review', 'rating']]
        df.loc[:, 'review'] = df['review'].apply(transform_text)
        df['rating'] = df['rating'].map({'positive': 1, 'negative': 0}).infer_objects(copy=False)
        df = df.drop_duplicates()
        df = df.reset_index(drop=True)
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

In [10]:
def train_and_evaluate(df):
    with mlflow.start_run(run_name="All Experiments") as parent_run:
        for algo_name, algorithm in ALGORITHMS.items():
            for vec_name, vectorizer in VECTORIZERS.items():
                with mlflow.start_run(run_name=f"{algo_name} with {vec_name}", nested=True) as child_run:
                    try:
                        X = vectorizer.fit_transform(df['review'])
                        y = df['rating']
                        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=CONFIG["test_size"], random_state=42)
                        mlflow.log_params({
                            "vectorizer": vec_name,
                            "algorithm": algo_name,
                            "test_size": CONFIG["test_size"]
                        })

                        # Train model
                        model = algorithm
                        model.fit(X_train, y_train)

                        # Log model parameters
                        log_model_params(algo_name, model)

                        mlflow.log_metric('training accuarcy', model.score(X_train, y_train))

                        # Evaluate model
                        y_pred = model.predict(X_test)
                        metrics = {
                            "accuracy": accuracy_score(y_test, y_pred),
                            "precision": precision_score(y_test, y_pred),
                            "recall": recall_score(y_test, y_pred),
                            "f1_score": f1_score(y_test, y_pred)
                        }
                        mlflow.log_metrics(metrics)

                        # Log model
                        # mlflow.sklearn.log_model(model, "model")
                        input_example = X_test[:5] if not ss.issparse(X_test) else X_test[:5].toarray()
                        ms.log_model(model, "model", input_example=input_example)

                        # Print results for verification
                        print(f"\nAlgorithm: {algo_name}, Vectorizer: {vec_name}")
                        print(f"Metrics: {metrics}")

                    except Exception as e:
                        print(f"Error in training {algo_name} with {vec_name}: {e}")
                        mlflow.log_param("error", str(e))

def log_model_params(algo_name, model):
    """Logs hyperparameters of the trained model to MLflow."""
    params_to_log = {}
    if algo_name == 'LogisticRegression':
        params_to_log["C"] = model.C
    elif algo_name == 'PassiveAggressiveClassifier':
        params_to_log["C"] = model.C
        params_to_log["loss"] = model.loss
    elif algo_name == 'MultinomialNB':
        params_to_log["alpha"] = model.alpha
    elif algo_name == 'XGBoost':
        params_to_log["n_estimators"] = model.n_estimators
        params_to_log["learning_rate"] = model.learning_rate
    elif algo_name == 'RandomForest':
        params_to_log["n_estimators"] = model.n_estimators
        params_to_log["max_depth"] = model.max_depth

    mlflow.log_params(params_to_log)


# ========================== EXECUTION ==========================
if __name__ == "__main__":
    df = load_data(CONFIG["data_path"])
    train_and_evaluate(df)

Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  4.07it/s]



Algorithm: LogisticRegression, Vectorizer: BoW
Metrics: {'accuracy': 0.7799352750809061, 'precision': 0.8377777777777777, 'recall': 0.8568181818181818, 'f1_score': 0.8471910112359551}


2025/05/10 16:01:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression with BoW at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/54d4e3c34e83434da910109d10a4af0c.
2025/05/10 16:01:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:03<00:00,  1.77it/s]



Algorithm: LogisticRegression, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.7961165048543689, 'precision': 0.7934579439252336, 'recall': 0.9647727272727272, 'f1_score': 0.8707692307692307}


2025/05/10 16:01:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression with TF-IDF at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/6f4df571ae9b42cf96f97d77d9ef14fc.
2025/05/10 16:01:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:03<00:00,  1.78it/s]



Algorithm: PassiveAggressiveClassifier, Vectorizer: BoW
Metrics: {'accuracy': 0.7378640776699029, 'precision': 0.8278301886792453, 'recall': 0.7977272727272727, 'f1_score': 0.8125}


2025/05/10 16:02:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run PassiveAggressiveClassifier with BoW at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/ddc1cd0444df458cb929ce7ba427a9eb.
2025/05/10 16:02:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  4.51it/s]



Algorithm: PassiveAggressiveClassifier, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.7491909385113269, 'precision': 0.8238636363636364, 'recall': 0.8238636363636364, 'f1_score': 0.8238636363636364}


2025/05/10 16:02:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run PassiveAggressiveClassifier with TF-IDF at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/fbbdee33aeed47b7873e54f76ae149e8.
2025/05/10 16:02:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  4.10it/s]



Algorithm: MultinomialNB, Vectorizer: BoW
Metrics: {'accuracy': 0.7775080906148867, 'precision': 0.8207847295864263, 'recall': 0.8795454545454545, 'f1_score': 0.8491497531541415}


2025/05/10 16:03:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run MultinomialNB with BoW at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/42a6279e895d43ee9e768f23d292fc4f.
2025/05/10 16:03:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  4.63it/s]



Algorithm: MultinomialNB, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.7144012944983819, 'precision': 0.7137064071370641, 'recall': 1.0, 'f1_score': 0.8329389493610979}


2025/05/10 16:03:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run MultinomialNB with TF-IDF at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/031500187c054b6994bc9d53f695dbc6.
2025/05/10 16:03:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  4.91it/s]



Algorithm: XGBoost, Vectorizer: BoW
Metrics: {'accuracy': 0.7872168284789643, 'precision': 0.8157625383828045, 'recall': 0.9056818181818181, 'f1_score': 0.8583737210554658}


2025/05/10 16:04:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost with BoW at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/a065aae416ef4a40bebdec69e4d0c226.
2025/05/10 16:04:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:01<00:00,  4.52it/s]



Algorithm: XGBoost, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.7637540453074434, 'precision': 0.8, 'recall': 0.8909090909090909, 'f1_score': 0.843010752688172}


2025/05/10 16:04:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost with TF-IDF at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/f9bf5dbcb8f84fec9de06217b2a261df.
2025/05/10 16:04:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:03<00:00,  1.84it/s]



Algorithm: RandomForest, Vectorizer: BoW
Metrics: {'accuracy': 0.7653721682847896, 'precision': 0.7592267135325131, 'recall': 0.9818181818181818, 'f1_score': 0.8562933597621407}


2025/05/10 16:05:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest with BoW at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/4d39ee73c8244e7bb5355ad1387d19ff.
2025/05/10 16:05:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:03<00:00,  1.96it/s]



Algorithm: RandomForest, Vectorizer: TF-IDF
Metrics: {'accuracy': 0.7580906148867314, 'precision': 0.754601226993865, 'recall': 0.9784090909090909, 'f1_score': 0.8520534388916378}


2025/05/10 16:06:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest with TF-IDF at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/73a829cab82e47a5bc2f3133519d85b1.
2025/05/10 16:06:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.
2025/05/10 16:06:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run All Experiments at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1/runs/e10433aad4a64e509307da2e0bd7364a.
2025/05/10 16:06:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nafiul-araf/Drug-Review-Project-MLFlow-.mlflow/#/experiments/1.


##### MultinomialNB and BoW