In [0]:
import os

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import mlflow.statsmodels
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [0]:
from azure.storage.filedatalake import DataLakeServiceClient

# Storage account to read from.
# The access key is deliberately NOT kept in the notebook: it is read from the
# AZURE_STORAGE_KEY environment variable (the same variable that
# read_csv_from_blob falls back to when no key is passed).
# NOTE(review): a literal account key used to be committed on this line; treat
# that key as leaked and rotate it in the Azure portal.
storage_account_name = "retaildatasetsblob"
storage_account_key = os.environ.get("AZURE_STORAGE_KEY")
if not storage_account_key:
    raise RuntimeError(
        "Environment variable 'AZURE_STORAGE_KEY' is not set; "
        "export the storage account key before running this notebook."
    )

# Connect to ADLS
service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net",
    credential=storage_account_key,
    api_version="2023-11-03"  # Use the correct supported API version
)

# List the file systems (containers) as a quick connectivity check
containers = service_client.list_file_systems()
for container in containers:
    print(container.name)


In [0]:
import os
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io

def read_csv_from_blob(storage_account_name, container_name, file_name, storage_account_key=None):
    """
    Download a CSV blob from Azure Blob Storage and parse it into a Pandas DataFrame.

    :param storage_account_name: Azure storage account name.
    :param container_name: Blob container that holds the file.
    :param file_name: Name (path) of the blob inside the container.
    :param storage_account_key: Storage account access key. When omitted or empty,
        the 'AZURE_STORAGE_KEY' environment variable is used instead.
    :return: Pandas DataFrame with the parsed CSV, or None when anything goes wrong
        while connecting, downloading or parsing (the error is printed).
    :raises ValueError: If no key is passed and 'AZURE_STORAGE_KEY' is not set.
    """
    # An explicitly passed key wins; otherwise fall back to the environment
    account_key = storage_account_key or os.environ.get('AZURE_STORAGE_KEY')
    if not account_key:
        raise ValueError("Storage account key must be provided either as a parameter or as an environment variable 'AZURE_STORAGE_KEY'")

    try:
        connection_string = (
            "DefaultEndpointsProtocol=https;"
            f"AccountName={storage_account_name};"
            f"AccountKey={account_key};"
            "EndpointSuffix=core.windows.net"
        )

        # service client -> container client -> blob client
        service = BlobServiceClient.from_connection_string(connection_string)
        blob = service.get_container_client(container_name).get_blob_client(file_name)

        # Pull the whole blob into memory and let pandas parse the bytes
        raw_bytes = blob.download_blob().readall()
        return pd.read_csv(io.BytesIO(raw_bytes))
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None
        print(f"Error reading CSV file: {e}")
        return None

In [0]:
# Load the pre-processed marketing dataset from Blob Storage
training_data = read_csv_from_blob(
    storage_account_name=storage_account_name,
    container_name="globalmartmarketingdata",
    file_name="PreProcessing_final_data.csv",
    storage_account_key=storage_account_key,
)

# read_csv_from_blob reports failures by printing the error and returning None;
# stop here with a clear message instead of failing later on `None.head()`.
if training_data is None:
    raise RuntimeError(
        "Could not load 'PreProcessing_final_data.csv' from container "
        "'globalmartmarketingdata'; see the error printed above."
    )

training_data.head()

## Final data preparation for training

In [0]:
# Columns that get standardized, grouped by what they measure.
# The concatenation order below is the column order used downstream.
_sales_and_price_columns = ['sales_amount', 'base_price', 'final_price']
_channel_spend_columns = [
    'facebook_spend',
    'google ads_spend',
    'influencer marketing_spend',
    'instagram_spend',
    'ooh_spend',
    'print_spend',
    'radio_spend',
    'tv_spend',
    'youtube_spend',
]
_channel_ctr_columns = [
    'facebook_ctr',
    'google ads_ctr',
    'influencer marketing_ctr',
    'instagram_ctr',
    'youtube_ctr',
]
numerical_features = _sales_and_price_columns + _channel_spend_columns + _channel_ctr_columns

# Columns that get one-hot encoded
categorical_features = ['promotion_type']

In [0]:
# Step 1: Standardize the numerical features.
# NOTE(review): 'sales_amount' (used later as the regression target) is part of
# numerical_features and is therefore standardized too — confirm that is intended.
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(training_data[numerical_features])

# Wrap the scaled array in a DataFrame that keeps training_data's row index.
# pd.concat(axis=1) aligns on index labels, so without this the two frames would
# only line up when training_data happens to carry a default 0..n-1 index.
scaled_numerical_df = pd.DataFrame(
    scaled_numerical_data,
    columns=numerical_features,
    index=training_data.index,
)

# Step 2: One-hot encode the categorical features (dense output)
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_data = encoder.fit_transform(training_data[categorical_features])

# Column names generated by the encoder for each category level
encoded_categorical_columns = encoder.get_feature_names_out(categorical_features)

# Same index as above so the rows stay aligned in the concat
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=encoded_categorical_columns,
    index=training_data.index,
)

# Step 3: Combine scaled numerical data and encoded categorical data side by side
processed_df = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)

# Display the processed DataFrame
processed_df.head()

In [0]:
# Per-column count of values that are zero or negative in the numerical features
non_positive_counts = processed_df[numerical_features].le(0).sum()
print(non_positive_counts)

In [0]:
# Floor every numerical value at a small positive number (1e-6) so the log
# transform below never receives a zero or negative input.
# DataFrame.clip is vectorized and avoids the element-wise DataFrame.applymap,
# which is deprecated in recent pandas releases; the resulting values are the same.
# NOTE(review): these columns were standardized earlier, so every below-mean value
# is negative and gets flattened to 1e-6 here — confirm this is the intended
# treatment rather than taking logs of the raw (unscaled) values.
processed_df[numerical_features] = processed_df[numerical_features].clip(lower=1e-6)

# log1p(x) = log(1 + x); all inputs are now >= 1e-6, so the result is well defined
X_log = np.log1p(processed_df[numerical_features])


In [None]:
# Make this the active MLflow experiment for the runs started below
EXPERIMENT_NAME = "optimizing-ad-spend-experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

## Linear Regression

In [0]:
# Features and target for the additive (non-log) model.
# They are defined in this cell so it runs top-to-bottom on a fresh kernel
# instead of depending on a cell further down the notebook.
target_column = 'sales_amount'
X = processed_df.drop(columns=[target_column])
y = processed_df[target_column]

# Fit an OLS model with statsmodels and track metrics, artifacts and the model in MLflow
with mlflow.start_run(run_name="Linear Regression"):
    # statsmodels does not add an intercept on its own
    # NOTE(review): X holds every one-hot level of promotion_type in addition to
    # this constant, so those coefficients are not separately identifiable —
    # consider dropping one level in the encoder.
    X_const = sm.add_constant(X)

    # Build the additive linear regression model with statsmodels
    model = sm.OLS(y, X_const).fit()

    # Print the summary of the model
    print(model.summary())

    # One row per fitted term (including the constant), largest coefficient first
    coefficients = pd.DataFrame({
        'Feature': X_const.columns,
        'Coefficient': model.params
    }).sort_values(by='Coefficient', ascending=False)

    # Goodness-of-fit metrics
    mlflow.log_param("model_type", "ols")
    mlflow.log_metric("r_squared", model.rsquared)
    mlflow.log_metric("adjusted_r_squared", model.rsquared_adj)
    mlflow.log_metric("f_statistic", model.fvalue)
    mlflow.log_metric("f_pvalue", model.f_pvalue)

    # Residual metrics
    residuals = model.resid
    mlflow.log_metric("mean_residuals", residuals.mean())
    mlflow.log_metric("std_residuals", residuals.std())

    # Each coefficient as its own metric
    for feature, coef in zip(coefficients['Feature'], coefficients['Coefficient']):
        mlflow.log_metric(f"coef_{feature}", coef)

    # Coefficients table as a CSV artifact (written and logged once)
    coefficients_file_path = "/tmp/coefficients_ols.csv"
    coefficients.to_csv(coefficients_file_path, index=False)
    mlflow.log_artifact(coefficients_file_path)

    # Residuals as a CSV artifact
    residuals_file_path = "/tmp/residuals_ols.csv"
    residuals_df = pd.DataFrame({'Residuals': residuals})
    residuals_df.to_csv(residuals_file_path, index=False)
    mlflow.log_artifact(residuals_file_path)

    # Environment recorded alongside the model
    # NOTE(review): these pins are hardcoded; confirm they match the runtime used for training.
    conda_env = {
        'channels': ['defaults'],
        'dependencies': [
            'python=3.8',
            'statsmodels=0.13.2',
            'mlflow'
        ],
        'name': 'mlflow-env'
    }

    # Infer the model signature from the design matrix and the in-sample predictions
    predicted_values = model.fittedvalues
    signature = infer_signature(X_const, predicted_values)

    # Log and register the model. The fitted results object is passed
    # positionally: the statsmodels flavor has no `sm_model` keyword.
    # Passing registered_model_name already registers the model, so no separate
    # mlflow.register_model() call is needed.
    mlflow.statsmodels.log_model(
        model,
        "model",
        registered_model_name="OLS_Regression_v1",
        conda_env=conda_env,
        signature=signature
    )

    # Coefficient bar chart, saved to disk and logged as an artifact
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x="Coefficient", y="Feature", data=coefficients, palette="viridis", ax=ax)
    ax.set_title("Feature Coefficients")
    ax.set_xlabel("Coefficient Value")
    ax.set_ylabel("Feature")
    fig.tight_layout()  # keep long feature names inside the saved image
    feature_importance_path = "/tmp/feature_coefficients.png"
    fig.savefig(feature_importance_path)
    mlflow.log_artifact(feature_importance_path)
    plt.close(fig)


## Log-Log Regression Training

In [0]:
# Separate the regression target (sales_amount) from the predictors
target_column = 'sales_amount'
y = processed_df[target_column]
X = processed_df.drop(columns=[target_column])

# Log-log specification: pass both sides through log1p, i.e. log(1 + value).
# log1p maps 0 to 0 but is only defined for inputs greater than -1, so this
# relies on the values in processed_df already being in that range.
y_log = np.log1p(y)
X_log = np.log1p(X)



In [0]:
# Fit a plain linear regression on the log-transformed data inside its own MLflow run
with mlflow.start_run(run_name="log-log-regression"):
    model = LinearRegression()
    model.fit(X_log, y_log)

    # Coefficient table, largest coefficient first
    coefficients = (
        pd.DataFrame({'Feature': X_log.columns, 'Coefficient': model.coef_})
        .sort_values(by='Coefficient', ascending=False)
    )

    # In-sample R^2 plus a tag identifying the model family
    mlflow.log_metric("r_squared", model.score(X_log, y_log))
    mlflow.log_param("model_type", "linear")

    # One metric per coefficient
    for feature_name, coefficient in coefficients.itertuples(index=False, name=None):
        mlflow.log_metric(f"coef_{feature_name}", coefficient)

    # Persist the table to disk, then attach it to the run as an artifact
    coefficients_file_path = "/tmp/coefficients.csv"
    coefficients.to_csv(coefficients_file_path, index=False)
    mlflow.log_artifact(coefficients_file_path)

    # Log the fitted estimator and register it in the model registry
    mlflow.sklearn.log_model(model, "model", registered_model_name="LinearRegression_v1")

## Lasso Regression

In [0]:
# Fit an L1-regularized (Lasso) regression on the log-transformed data inside its own MLflow run
with mlflow.start_run(run_name="lasso-regression"):
    # alpha is the regularization strength
    lasso_model = Lasso(alpha=0.1)
    lasso_model.fit(X_log, y_log)

    # Coefficient table, largest coefficient first
    lasso_coefficients = (
        pd.DataFrame({'Feature': X_log.columns, 'Coefficient': lasso_model.coef_})
        .sort_values(by='Coefficient', ascending=False)
    )

    # In-sample R^2
    mlflow.log_metric("r_squared", lasso_model.score(X_log, y_log))

    # One metric per coefficient
    for feature_name, coefficient in lasso_coefficients.itertuples(index=False, name=None):
        mlflow.log_metric(f"coef_{feature_name}", coefficient)

    # Persist the table to disk, then attach it to the run as an artifact
    coefficients_file_path = "/tmp/lasso_coefficients.csv"
    lasso_coefficients.to_csv(coefficients_file_path, index=False)
    mlflow.log_artifact(coefficients_file_path)

    # Tag the model family, then log and register the fitted estimator
    mlflow.log_param("model_type", "lasso")
    mlflow.sklearn.log_model(lasso_model, "model", registered_model_name="Lasso_v1")

## Ridge Regression

In [0]:
# Fit an L2-regularized (Ridge) regression on the log-transformed data inside its own MLflow run
with mlflow.start_run(run_name="ridge-regression"):
    # alpha is the regularization strength
    ridge_model = Ridge(alpha=0.1)
    ridge_model.fit(X_log, y_log)

    # Coefficient table, largest coefficient first
    ridge_coefficients = (
        pd.DataFrame({'Feature': X_log.columns, 'Coefficient': ridge_model.coef_})
        .sort_values(by='Coefficient', ascending=False)
    )

    # In-sample R^2
    mlflow.log_metric("r_squared", ridge_model.score(X_log, y_log))

    # One metric per coefficient
    for feature_name, coefficient in ridge_coefficients.itertuples(index=False, name=None):
        mlflow.log_metric(f"coef_{feature_name}", coefficient)

    # Persist the table to disk, then attach it to the run as an artifact
    coefficients_file_path = "/tmp/ridge_coefficients.csv"
    ridge_coefficients.to_csv(coefficients_file_path, index=False)
    mlflow.log_artifact(coefficients_file_path)

    # Tag the model family, then log and register the fitted estimator
    mlflow.log_param("model_type", "ridge")
    mlflow.sklearn.log_model(ridge_model, "model", registered_model_name="Ridge_v1")


In [0]:
# Train two Random Forest regressors that differ only in the number of trees
# (100 vs 30) on the untransformed X / y, and track both in a single MLflow run.
# RandomForestRegressor comes from sklearn.ensemble (imported in the notebook's import cell).
with mlflow.start_run(run_name="random-forest"):
    # First model: 100 trees
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X, y)

    # Feature importances of the first model, most important first
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    # One metric per feature importance (first model)
    for feature, importance in zip(feature_importances['Feature'], feature_importances['Importance']):
        mlflow.log_metric(f"importance_{feature}", importance)

    # Second model: 30 trees, same seed
    rf_model_1 = RandomForestRegressor(n_estimators=30, random_state=42)
    rf_model_1.fit(X, y)

    # Feature importances of the second model, most important first
    feature_importances_1 = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model_1.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    # Same metrics for the second model; the "_model_2" suffix keeps them distinct
    for feature, importance in zip(feature_importances_1['Feature'], feature_importances_1['Importance']):
        mlflow.log_metric(f"importance_{feature}_model_2", importance)

    # Importances of the first model as a CSV artifact
    coefficients_file_path = "/tmp/rf_model_1_importances.csv"
    feature_importances.to_csv(coefficients_file_path, index=False)
    mlflow.log_artifact(coefficients_file_path)

    # Importances of the second model as a CSV artifact
    coefficients_file_path_1 = "/tmp/rf_model_2_importances.csv"
    feature_importances_1.to_csv(coefficients_file_path_1, index=False)
    mlflow.log_artifact(coefficients_file_path_1)

    # Tag the model family, then log and register both fitted models
    mlflow.log_param("model_type", "random_forest")
    mlflow.sklearn.log_model(rf_model, "model_1", registered_model_name="RandomForest_v1")
    mlflow.sklearn.log_model(rf_model_1, "model_2", registered_model_name="RandomForest_v2")


In [0]:
import hyperopt
from hyperopt import fmin, tpe, hp, Trials
from sklearn.metrics import mean_squared_error


# Hold out part of the data so every trial is scored on rows it was not trained on.
# NOTE(review): the notebook never defined X_train / X_test / y_train / y_test;
# the 80/20 ratio and the seed used here are assumptions — confirm the intended split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def objective(params):
    """
    Hyperopt objective: train one Random Forest and return its held-out MSE.

    Each call is recorded as its own nested MLflow run. MLflow does not allow a
    param to be re-logged with a different value inside one run, so logging the
    hyperparameters of every trial into a single shared run fails from the
    second trial onwards.

    :param params: Dict sampled from the search space with the keys
        'n_estimators', 'max_depth', 'min_samples_split' and 'min_samples_leaf'.
    :return: Mean squared error on (X_test, y_test); Hyperopt minimizes this value.
    """
    with mlflow.start_run(nested=True):
        # Initialize the Random Forest Regressor with hyperparameters from Hyperopt
        rf_model = RandomForestRegressor(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            random_state=42
        )
        rf_model.fit(X_train, y_train)

        # Score on the held-out split
        y_pred = rf_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Record what was tried and how it scored
        mlflow.log_param("random_forest", "yes")
        for key, value in params.items():
            mlflow.log_param(f"hyperparam_{key}", value)
        mlflow.log_metric("mse", mse)

    # No model is logged or registered per trial: only the best configuration
    # is refit and registered once the search has finished.
    return mse


# Define the search space for hyperparameters
search_space = {
    'n_estimators': hp.choice('n_estimators', [50, 100, 150, 200]),  # Number of trees
    'max_depth': hp.choice('max_depth', [None, 10, 20, 30]),  # Max depth of trees
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),  # Min samples required to split
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4]),  # Min samples required in a leaf
}

# Initialize a Trials object to track the optimization process
trials = Trials()

# Parent run that groups the per-trial nested runs
with mlflow.start_run(run_name="hyperopt-randomforest"):
    # Run the optimization
    best = fmin(fn=objective,  # The objective function to minimize
                space=search_space,  # The search space
                algo=tpe.suggest,  # The optimization algorithm (TPE)
                max_evals=50,  # Number of evaluations to perform
                trials=trials)  # Store the results in Trials object

    # For hp.choice entries fmin returns the *index* of the selected option, not
    # the option itself; space_eval maps those indices back to real values.
    best_params = hyperopt.space_eval(search_space, best)
    print("Best Hyperparameters:", best_params)

    # Summary of the search on the parent run
    mlflow.log_params({f"best_{key}": value for key, value in best_params.items()})
    mlflow.log_metric("best_mse", trials.best_trial["result"]["loss"])

    # Full trial history as one text artifact (one trial per line)
    mlflow.log_text("\n".join(str(trial) for trial in trials), "hyperopt_trials.txt")

    # Refit the winning configuration and register that single model
    best_model = RandomForestRegressor(random_state=42, **best_params)
    best_model.fit(X_train, y_train)
    mlflow.sklearn.log_model(best_model, "model", registered_model_name="RandomForest_v1")