# Using MLflow for Experiment Tracking and Model Management

# Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix


nltk.download("stopwords",quiet = True)
nltk.download("wordnet",quiet = True)

# MLflow
import mlflow
import mlflow.sklearn


# Load the Dataset

In [2]:
df = pd.read_csv("reviews_badminton/data.csv")
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [3]:
df.shape

(8518, 8)

# EDA (Exploratary Data Analysis)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [5]:
df.isnull().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [6]:
# Remove neutral reviews
df = df[df["Ratings"] != 3]

# Create sentiment label
df["sentiment"] = df["Ratings"].apply(lambda x: 1 if x >= 4 else 0)

# Combine title and review text
df["text"] = df["Review Title"].fillna("") + " " + df["Review text"].fillna("")
df = df[["text", "sentiment"]]
df.head()

Unnamed: 0,text,sentiment
0,"Nice product Nice product, good quality, but p...",1
1,Don't waste your money They didn't supplied Yo...,0
2,Did not meet expectations Worst product. Damag...,0
4,Over priced Over pricedJust â?¹620 ..from reta...,0
5,Mind-blowing purchase Good quality product. De...,1


# Text Preprocessing

In [7]:
stop_words = set(stopwords.words("english")) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [8]:
df["processed_text"] = df["text"].apply(preprocess)
df.head()

Unnamed: 0,text,sentiment,processed_text
0,"Nice product Nice product, good quality, but p...",1,nice product nice product good quality price r...
1,Don't waste your money They didn't supplied Yo...,0,dont waste money didnt supplied yonex mavis ou...
2,Did not meet expectations Worst product. Damag...,0,not meet expectation worst product damaged shu...
4,Over priced Over pricedJust â?¹620 ..from reta...,0,priced pricedjust retaileri didnt understand w...
5,Mind-blowing purchase Good quality product. De...,1,mindblowing purchase good quality product deli...


# Train-Test Split

In [9]:
X = df["processed_text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25,random_state=42,stratify=y)

# TF-IDF Feature Extraction

In [10]:
tfidf = TfidfVectorizer(max_features=7000,ngram_range=(1, 2))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Set MLflow Experiment

In [11]:
# Silence MLflow + Alembic logs
import logging
import os

os.environ["ML_FLOW_DISABLE_GIT"]="true"
logging.getLogger("mlflow").setLevel(logging.ERROR)
logging.getLogger("alembic").setLevel(logging.ERROR)


mlflow.set_experiment("Sentiment_Analysis_Experiment_Tracking")

<Experiment: artifact_location='file:C:/Users/shaik syed basha/OneDrive/Desktop/New folder/mlruns/2', creation_time=1770395315078, experiment_id='2', last_update_time=1770395315078, lifecycle_stage='active', name='Sentiment_Analysis_Experiment_Tracking', tags={}>

# Train Multiple Models + Log to MLflow

In [12]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

results = {}


In [13]:
for name, model in models.items():
    with mlflow.start_run(run_name=name):

        model.fit(X_train_tfidf, y_train)
        preds = model.predict(X_test_tfidf)

        f1 = f1_score(y_test, preds, average="weighted")
        results[name] = f1

        # Log parameters
        mlflow.log_param("model_name", name)

        if name == "Logistic Regression":
            mlflow.log_param("max_iter", 1000)
        if name == "Random Forest":
            mlflow.log_param("n_estimators", 200)
        if name == "Linear SVM":
            mlflow.log_param("kernel", "linear")

        # Log metric
        mlflow.log_metric("f1_score", f1)

        # Confusion Matrix (Metric Plot)
        cm = confusion_matrix(y_test, preds)
        plt.figure(figsize=(4,4))
        plt.imshow(cm)
        plt.title(f"{name} Confusion Matrix")
        plt.colorbar()
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.savefig("confusion_matrix.png")
        plt.close()

        mlflow.log_artifact("confusion_matrix.png")

        # Log model
        mlflow.sklearn.log_model(sk_model=model, name="model")

        print(f"{name} F1-score: {f1:.4f}")

Logistic Regression F1-score: 0.9288
Naive Bayes F1-score: 0.8713
Linear SVM F1-score: 0.9474
Random Forest F1-score: 0.9391


# Hyperparameter Tuning (Required for Hyperparameter Plots)

In [14]:
for c in [0.1, 1, 10]:
    with mlflow.start_run(run_name=f"LogReg_C={c}"):

        model = LogisticRegression(C=c, max_iter=1000)
        model.fit(X_train_tfidf, y_train)
        preds = model.predict(X_test_tfidf)

        f1 = f1_score(y_test, preds, average="weighted")

        mlflow.log_param("model", "LogisticRegression")
        mlflow.log_param("C", c)
        mlflow.log_metric("f1_score", f1)

        mlflow.sklearn.log_model(sk_model=model, name="model")

# Final Model Selection & Registration

In [15]:
best_model_name = max(results, key=results.get)
best_model_score = results[best_model_name]

print("\nFinal Model Selected:", best_model_name)
print("Best F1-score:", best_model_score)


Final Model Selected: Linear SVM
Best F1-score: 0.947363331546774


In [16]:
final_model = models[best_model_name]
final_model.fit(X_train_tfidf, y_train)

with mlflow.start_run(run_name="Final_Model_Registration"):
    mlflow.sklearn.log_model(sk_model=final_model, name="final_model",registered_model_name="SentimentAnalysisModel")

    mlflow.log_param("final_model", best_model_name)
    mlflow.log_metric("best_f1_score", results[best_model_name])

Successfully registered model 'SentimentAnalysisModel'.
Created version '1' of model 'SentimentAnalysisModel'.


## MLflow Experiment Tracking with Auto Scheduling

In [17]:
pip install prefect




In [18]:
#from prefect import flow, task

#@task
#def run_mlflow_pipeline():
#   print("Running MLflow sentiment analysis pipeline")

#@flow(name="Sentiment-MLflow-Pipeline")
#def sentiment_flow():
#    run_mlflow_pipeline()

#sentiment_flow()