In [68]:
import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [70]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
from mlflow.models.signature import infer_signature
import mlflow.sklearn
import warnings
from sklearn.model_selection import GridSearchCV

In [71]:
def eval_metrics(actual,pred):
    r2 = r2_score(actual,pred)
    mse = mean_squared_error(actual,pred)
    mae = mean_squared_error(actual,pred)
    return r2,mse,mae

In [72]:
warnings.filterwarnings("ignore")
np.random.seed(40)
# Read the wine-quality csv file from the URL
csv_url = (
    "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/winequality-red.csv"
)
try:
    data = pd.read_csv(csv_url, sep=";")
except Exception as e:
    logger.exception(
        "Unable to download training & test CSV, check your internet connection. Error: %s", e
    )

In [73]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [74]:
def summarize_dataframe(df):
    """
    Summarizes a pandas DataFrame with both general information and detailed column-level stats.

    Parameters:
        df (pd.DataFrame): The DataFrame to summarize.

    Returns:
        pd.DataFrame: A summary table with per-column metadata including type, nulls, stats, and frequency.
    """

    # 1. Print high-level structure of the DataFrame
    print("BASIC DATAFRAME INFO")
    print(f"- Number of rows: {df.shape[0]}")
    print(f"- Number of columns: {df.shape[1]}")
    print(f"- Column names: {list(df.columns)}")
    print("\n- Column data types:\n", df.dtypes)
    print(f"\n- Total missing values: {df.isnull().sum().sum()}")
    print(f"- Duplicate rows: {df.duplicated().sum()}")
    print("-" * 50)
    
    # 2. Initialize a list to store column-wise summaries
    summary = []

    # 3. Iterate over each column to collect type-specific stats
    for col in df.columns:
        col_data = df[col]  # current column data
        col_type = col_data.dtype  # column dtype

        # If numeric column
        if pd.api.types.is_numeric_dtype(col_data):
            summary.append({
                'Column': col,
                'Type': 'Numeric',
                'Non-Null Count': col_data.notnull().sum(),
                'Missing Values': col_data.isnull().sum(),
                'Mean': col_data.mean(),
                'Std Dev': col_data.std(),
                'Min': col_data.min(),
                'Max': col_data.max(),
                'Unique Values': col_data.nunique()
            })

        # If categorical or string column
        elif pd.api.types.is_categorical_dtype(col_data) or pd.api.types.is_object_dtype(col_data):
            summary.append({
                'Column': col,
                'Type': 'Categorical',
                'Non-Null Count': col_data.notnull().sum(),
                'Missing Values': col_data.isnull().sum(),
                'Most Frequent': col_data.mode().iloc[0] if not col_data.mode().empty else None,
                'Frequency': col_data.value_counts().iloc[0] if not col_data.value_counts().empty else None,
                'Unique Values': col_data.nunique()
            })
            print(f"column : {col}",df[col].unique(),"\n")

        # If datetime column
        elif pd.api.types.is_datetime64_any_dtype(col_data):
            summary.append({
                'Column': col,
                'Type': 'Datetime',
                'Non-Null Count': col_data.notnull().sum(),
                'Missing Values': col_data.isnull().sum(),
                'Min': col_data.min(),
                'Max': col_data.max(),
                'Unique Values': col_data.nunique()
            })

        # Fallback for any other column type
        else:
            summary.append({
                'Column': col,
                'Type': 'Other',
                'Non-Null Count': col_data.notnull().sum(),
                'Missing Values': col_data.isnull().sum(),
                'Unique Values': col_data.nunique()
            })

    # 4. Convert collected summaries into a new DataFrame
    summary_df = pd.DataFrame(summary)

    # 5. Return the summary table
    print("COLUMN-WISE SUMMARY")
    return summary_df

In [75]:
summarize_dataframe(data)

BASIC DATAFRAME INFO
- Number of rows: 1599
- Number of columns: 12
- Column names: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

- Column data types:
 fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

- Total missing values: 0
- Duplicate rows: 240
--------------------------------------------------
COLUMN-WISE SUMMARY


Unnamed: 0,Column,Type,Non-Null Count,Missing Values,Mean,Std Dev,Min,Max,Unique Values
0,fixed acidity,Numeric,1599,0,8.319637,1.741096,4.6,15.9,96
1,volatile acidity,Numeric,1599,0,0.527821,0.17906,0.12,1.58,143
2,citric acid,Numeric,1599,0,0.270976,0.194801,0.0,1.0,80
3,residual sugar,Numeric,1599,0,2.538806,1.409928,0.9,15.5,91
4,chlorides,Numeric,1599,0,0.087467,0.047065,0.012,0.611,153
5,free sulfur dioxide,Numeric,1599,0,15.874922,10.460157,1.0,72.0,60
6,total sulfur dioxide,Numeric,1599,0,46.467792,32.895324,6.0,289.0,144
7,density,Numeric,1599,0,0.996747,0.001887,0.99007,1.00369,436
8,pH,Numeric,1599,0,3.311113,0.154386,2.74,4.01,89
9,sulphates,Numeric,1599,0,0.658149,0.169507,0.33,2.0,96


In [76]:
data.quality.value_counts()

quality
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

In [77]:
# Split the data into training and test sets. (0.66, 0.33) split.
train, test = train_test_split(data,test_size = 0.33)

# The predicted column is "quality" which is a scalar from [3, 9]
train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
train_y = train[["quality"]]
test_y = test[["quality"]]

In [78]:
train_x.shape,test_x.shape,train_y.shape,test_y.shape

((1071, 11), (528, 11), (1071, 1), (528, 1))

In [79]:
# Define parameter grid for alpha and l1_ratio
param_grid = {
    "alpha": [0.1, 0.3, 0.5, 0.8, 1.0],
    "l1_ratio": [0.2, 0.5, 0.8]
}

# Define the base model
model = ElasticNet(random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring="r2")
grid_search.fit(train_x, train_y)

# Get the best model from GridSearchCV
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict on test data
predicted_qualities = best_model.predict(test_x)

# Evaluate the best model
rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)


In [80]:

with mlflow.start_run():
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)

    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("Elasticnet model (alpha={:f}, l1_ratio={:f}):".format(alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    params = {"alpha":alpha,
              "l1_ratio":l1_ratio}
    mlflow.log_params(best_params)
    #log the metrics
    mlflow.log_metric("rmse",rmse)
    mlflow.log_metric("mae",mae)
    mlflow.log_metric('r2_score',r2)
    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic Elasticnet model for wine quality data")
    mlflow.sklearn.log_model(
        lr, "model", registered_model_name="ElasticnetWineModel")

Elasticnet model (alpha=0.800000, l1_ratio=0.500000):
  RMSE: 0.00868835685359548
  MAE: 0.7126439213196836
  R2: 0.7126439213196836


Registered model 'ElasticnetWineModel' already exists. Creating a new version of this model...
2025/06/10 22:27:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ElasticnetWineModel, version 2


🏃 View run indecisive-ram-136 at: http://127.0.0.1:8080/#/experiments/0/runs/6a06fc49a2f24494ad6f53641d03a672
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


Created version '2' of model 'ElasticnetWineModel'.
