In [1]:
!pip install mlflow



In [2]:
import pandas as pd
import pickle
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder


In [3]:
# Read CleanedDataSet.csv (path is relative to the notebook's working directory).
df_ml = pd.read_csv(filepath_or_buffer="CleanedDataSet.csv")

In [4]:
# Convert 'Date' to datetimes, then derive a monthly period label (as a string)
# from the converted column. `assign` evaluates its keywords in order, so the
# 'Month' lambda already sees the parsed 'Date'.
df_ml = df_ml.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Month=lambda frame: frame['Date'].dt.to_period('M').astype(str),
)

In [5]:
# Derive the second target and the calendar features in one chained expression
# that returns a new frame instead of mutating `df_ml` in place, so re-running
# this cell always yields the same result.
#
# NOTE(review): 'Profit' is computed as discounted price x units sold; no cost
# column enters the formula, so it is effectively revenue. The column name is
# kept because downstream cells reference it.
df_ml = (
    df_ml
    .assign(
        Profit=lambda frame: (frame['Price'] * (1 - frame['Discount'] / 100)) * frame['Units Sold'],
        Year=lambda frame: frame['Date'].dt.year,
        Month_Num=lambda frame: frame['Date'].dt.month,
        Day=lambda frame: frame['Date'].dt.day,
        DayOfWeek=lambda frame: frame['Date'].dt.dayofweek,
        DayOfYear=lambda frame: frame['Date'].dt.dayofyear,
        # isocalendar().week is cast to a plain integer dtype.
        WeekOfYear=lambda frame: frame['Date'].dt.isocalendar().week.astype(int),
        Quarter=lambda frame: frame['Date'].dt.quarter,
    )
    # Handle missing values: every remaining NaN, in any column, becomes 0.
    # NOTE(review): this also writes 0 into non-numeric columns if they contain
    # missing values - confirm the cleaned dataset has none there.
    .fillna(0)
)

In [6]:
# Columns the models predict, and the candidate feature columns (everything
# except the targets and the raw/label date columns).
targets = ['Units Sold', 'Profit']
features = [col for col in df_ml.columns if col not in targets + ['Date', 'Month']]
categorical_features = ['Category', 'Region', 'Weather Condition', 'Holiday/Promotion', 'Year', 'Month_Num', 'DayOfWeek', 'Quarter']

# Numeric (non-boolean) features that are not one-hot encoded. The dtype test
# accepts every integer/float width: comparing against the literal strings
# 'int64'/'float64' silently drops int32 columns, which is what the `.dt`
# accessors (Day, DayOfYear, ...) return on pandas 2.x.
numerical_features = [
    col for col in features
    if col not in categorical_features
    and pd.api.types.is_numeric_dtype(df_ml[col])
    and not pd.api.types.is_bool_dtype(df_ml[col])
]

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df_ml[categorical_features])
encoded_cat_df = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_features),
    # Reuse df_ml's index so the column-wise concat below lines rows up even
    # when df_ml does not carry a default 0..n-1 index.
    index=df_ml.index,
)

# Create processed DataFrame: numeric features, encoded categoricals, targets.
df_processed = pd.concat([df_ml[numerical_features], encoded_cat_df, df_ml[targets]], axis=1)
df_processed['Date'] = df_ml['Date']

# Sort by date
df_processed = df_processed.sort_values('Date')

In [7]:
# Positional hold-out split: the first 80% of df_processed's rows are used for
# training and the remaining rows for testing.
split_index = int(len(df_processed) * 0.8)
train_df, test_df = df_processed.iloc[:split_index], df_processed.iloc[split_index:]

# Feature matrices: every column except the two targets and 'Date'.
X_train, X_test = (part.drop(columns=targets + ['Date']) for part in (train_df, test_df))

# One target series per model, for each side of the split.
y_train_units, y_train_profit = train_df['Units Sold'], train_df['Profit']
y_test_units, y_test_profit = test_df['Units Sold'], test_df['Profit']

# Dates of the test rows, kept separately because 'Date' is dropped from X_test.
test_dates = test_df['Date']

In [9]:
import pandas as pd
import pickle
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# === Load models ===
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load model files that come from a trusted source.
with open("xgb_units_model.pkl", "rb") as f:
    units_model = pickle.load(f)

with open("xgb_profit_model.pkl", "rb") as f:
    profit_model = pickle.load(f)

# An MLflow tracking server must already be listening at this address;
# otherwise set_experiment below fails with a connection error.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# 3.2 Set the experiment name (will create one if it doesn't exist)
mlflow.set_experiment("Retail store")

# === Get feature names from models
# NOTE(review): only units_model's feature list is used to align X_test, which
# is then passed to both models - presumably both were trained on the same
# columns; confirm.
if hasattr(units_model, "get_booster"):
    trained_features = units_model.get_booster().feature_names
else:
    trained_features = units_model.feature_names_in_

if trained_features is None:
    # The booster carries no feature names; keep X_test's own columns rather
    # than failing on X_test[None].
    trained_features = list(X_test.columns)
else:
    trained_features = list(trained_features)

# === Align test data columns to match training features
missing_features = [col for col in trained_features if col not in X_test.columns]
if missing_features:
    raise KeyError(f"X_test is missing features the model was trained on: {missing_features}")
X_test = X_test[trained_features]

# === Logging function ===
def log_model_with_metrics(model, model_name, X, y_true):
    """Score `model` on (X, y_true) and record the result in a new MLflow run.

    The run is named "<model_name>_run" and receives the parameter
    "model_name", the metrics "R2_score" and "MSE", and the model itself
    under the artifact path "model".

    Args:
        model: An `xgb.Booster`, or an object whose type string contains
            "sklearn" (this includes the XGBoost scikit-learn wrappers).
        model_name: Label used for the run name and the logged parameter.
        X: Features to predict on. For a Booster, anything that is not already
            an `xgb.DMatrix` is wrapped in one before predicting.
        y_true: Ground-truth target values matching the rows of X.

    Raises:
        TypeError: If the model type is not supported. The check happens
            before any prediction or MLflow call, so no partial run is created.
    """
    is_booster = isinstance(model, xgb.Booster)
    if not is_booster and "sklearn" not in str(type(model)).lower():
        raise TypeError(f"Unsupported model type: {type(model)}")

    if is_booster and not isinstance(X, xgb.DMatrix):
        # Booster.predict does not accept raw tabular input; it needs a DMatrix.
        y_pred = model.predict(xgb.DMatrix(X))
    else:
        y_pred = model.predict(X)

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    with mlflow.start_run(run_name=f"{model_name}_run"):
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("R2_score", r2)
        mlflow.log_metric("MSE", mse)

        # Log the model with the flavor matching its type.
        if is_booster:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

        print(f"✅ {model_name} logged with R²={r2:.3f}, MSE={mse:.3f}")

# === Evaluate each model on the shared test features and log it to MLflow
log_model_with_metrics(
    model=units_model,
    model_name="units_model",
    X=X_test,
    y_true=y_test_units,
)
log_model_with_metrics(
    model=profit_model,
    model_name="profit_model",
    X=X_test,
    y_true=y_test_profit,
)

MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Retail+store (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001CA75797800>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))