In [1]:
import os
import io

from azure.storage.filedatalake import DataLakeServiceClient
from azure.storage.blob import BlobServiceClient

import numpy as np 
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

In [2]:
# Replace with your details
storage_account_name = " "
storage_account_key = " "

# Connect to ADLS
service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net",
    credential=storage_account_key,
    api_version="2023-11-03"  # Use the correct supported API version
)

# List Containers
containers = service_client.list_file_systems()
for container in containers:
    print(container.name)

amzecomdata
globalmartmarketingdata
inventorydata
optimalchannel


In [3]:

def read_csv_from_blob(storage_account_name, container_name, file_name, storage_account_key=None):
    """
    Read a CSV file from Azure Blob Storage using Python and return a Pandas DataFrame.

    :param storage_account_name: Azure storage account name.
    :param container_name: Blob container name.
    :param file_name: Name of the file in the container.
    :param storage_account_key: Storage account access key.
    :return: Pandas DataFrame.
    """
    
    if not storage_account_key:
        # Try to get the key from environment variables if not provided
        storage_account_key = os.environ.get('AZURE_STORAGE_KEY')
        
    if not storage_account_key:
        raise ValueError("Storage account key must be provided either as a parameter or as an environment variable 'AZURE_STORAGE_KEY'")
    
    try:
        # Create a connection string
        connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key};EndpointSuffix=core.windows.net"
        
        # Create the BlobServiceClient
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        
        # Get the container client
        container_client = blob_service_client.get_container_client(container_name)
        
        # Get the blob client
        blob_client = container_client.get_blob_client(file_name)
        
        # Download the blob content
        download_stream = blob_client.download_blob()
        
        # Convert the content to a DataFrame
        content = download_stream.readall()
        df = pd.read_csv(io.BytesIO(content))
        
        return df
    
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

In [4]:
training_data = read_csv_from_blob(storage_account_name=storage_account_name,
                                      container_name=" ", 
                                      file_name="PreProcessing_final_data.csv",
                                      storage_account_key=storage_account_key)
training_data.head()

Unnamed: 0.1,Unnamed: 0,week,sales_amount,base_price,final_price,promotion_type,facebook_spend,google ads_spend,influencer marketing_spend,instagram_spend,ooh_spend,print_spend,radio_spend,tv_spend,youtube_spend,facebook_ctr,google ads_ctr,influencer marketing_ctr,instagram_ctr,youtube_ctr
0,0,2023-01-01,13516527.77,101.830513,94.148539,Percentage Discount,1152.82,810.68,1122.12,707.48,11230.47,6214.43,6723.33,11311.42,703.13,3.047174,4.333516,2.471559,2.008197,2.116972
1,1,2023-01-08,95081753.02,101.830513,94.148539,Percentage Discount,7472.53,6973.85,7179.02,6834.33,11380.75,11069.6,9505.5,14004.01,6562.0,2.636847,2.732868,2.930657,3.354279,3.570124
2,2,2023-01-15,94804406.04,101.830513,94.148539,Percentage Discount,7204.29,7383.5,7185.08,6963.4,10270.3,8861.17,7836.04,14442.24,7318.19,2.679349,3.136116,2.879586,2.938546,3.776793
3,3,2023-01-22,94833974.28,101.830513,94.148539,Percentage Discount,7726.84,6522.2,7710.31,7479.41,8335.56,11601.91,6663.51,11917.89,7745.75,3.015955,3.836348,2.983655,2.858832,2.823088
4,4,2023-01-29,94806994.45,101.830513,94.148539,Percentage Discount,6987.22,6969.68,7094.25,7294.12,9575.21,7488.18,12158.0,6753.84,7044.42,2.744554,3.511152,2.338256,2.403631,3.01939


In [5]:
# Select features for scaling and encoding
numerical_features = ['sales_amount', 'base_price', 'final_price',
                      'facebook_spend', 'google ads_spend', 'influencer marketing_spend',
                      'instagram_spend', 'ooh_spend', 'print_spend', 'radio_spend',
                      'tv_spend', 'youtube_spend', 'facebook_ctr', 'google ads_ctr',
                      'influencer marketing_ctr', 'instagram_ctr', 'youtube_ctr']

categorical_features = ['promotion_type']

In [6]:
# Step 1: Scale numerical features
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(training_data[numerical_features])

# Convert scaled numerical data to a DataFrame
scaled_numerical_df = pd.DataFrame(scaled_numerical_data, columns=numerical_features)

# Step 2: Encode categorical features
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_data = encoder.fit_transform(training_data[categorical_features])

# Get column names for encoded categorical features
encoded_categorical_columns = encoder.get_feature_names_out(categorical_features)

# Convert encoded categorical data to a DataFrame
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoded_categorical_columns)

# Step 3: Combine scaled numerical data and encoded categorical data
processed_df = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)

# Display the processed DataFrame
processed_df.head()

Unnamed: 0,sales_amount,base_price,final_price,facebook_spend,google ads_spend,influencer marketing_spend,instagram_spend,ooh_spend,print_spend,radio_spend,tv_spend,youtube_spend,facebook_ctr,google ads_ctr,influencer marketing_ctr,instagram_ctr,youtube_ctr,promotion_type_Buy One Get One Free,promotion_type_Percentage Discount
0,-7.20824,-1.888843,-1.079989,-6.138594,-6.036483,-6.258867,-6.41476,0.452694,-1.329475,-0.967204,0.269455,-6.2631,0.408029,2.567653,-1.354243,-2.366897,-2.237695,0.0,1.0
1,0.156877,-1.888843,-1.079989,0.671809,0.104967,0.342237,-0.124517,0.511463,0.407651,0.102746,1.192669,-0.319184,-0.548539,-0.479803,-0.315938,0.70132,1.459348,0.0,1.0
2,0.131833,-1.888843,-1.079989,0.382741,0.513173,0.348841,0.007995,0.077207,-0.382501,-0.539284,1.342926,0.447983,-0.449456,0.287937,-0.431442,-0.246289,1.985146,0.0,1.0
3,0.134503,-1.888843,-1.079989,0.945865,-0.345091,0.921263,0.537766,-0.6794,0.598106,-0.990209,0.477397,0.881749,0.335249,1.621101,-0.196076,-0.427986,-0.441227,0.0,1.0
4,0.132067,-1.888843,-1.079989,0.148817,0.100812,0.24985,0.347535,-0.194618,-0.873741,1.122828,-1.293211,0.170239,-0.29745,1.001963,-1.655725,-1.465558,0.058197,0.0,1.0


In [7]:
# Check if there are any zero or negative values in the numerical columns
print((processed_df[numerical_features] <= 0).sum())

sales_amount                   1
base_price                    27
final_price                   27
facebook_spend                24
google ads_spend              19
influencer marketing_spend    22
instagram_spend               19
ooh_spend                     28
print_spend                   28
radio_spend                   31
tv_spend                      24
youtube_spend                 24
facebook_ctr                  24
google ads_ctr                27
influencer marketing_ctr      31
instagram_ctr                 29
youtube_ctr                   27
dtype: int64


In [8]:
# Replace zero or negative values with a small positive value (e.g., 1e-6)
processed_df[numerical_features] = processed_df[numerical_features].applymap(lambda x: max(x, 1e-6))

# Apply log transformation to the target variable (sales_amount) and features (X)
target_column = 'sales_amount'
X = processed_df.drop(columns=[target_column])
y = processed_df[target_column]

# Apply log transformation (log-log regression)
X_log = np.log1p(X)  # log(1 + x) to handle zero and negative values
y_log = np.log1p(y)  # log(1 + y) to handle zero and negative values

In [9]:
mlflow.set_experiment("lasso-experiment")  

# Start MLflow run
with mlflow.start_run() as run:
    # Initialize Lasso model with a regularization parameter (alpha)
    lasso_model = Lasso(alpha=0.1)  
    
    # Train the model on log-transformed features and target
    lasso_model.fit(X_log, y_log)
    
    # Get the coefficients of the Lasso model
    lasso_coefficients = pd.DataFrame({
        'Feature': X_log.columns,
        'Coefficient': lasso_model.coef_
    }).sort_values(by='Coefficient', ascending=False)
    
    # Logging the model score 
    mlflow.log_metric("r_squared", lasso_model.score(X_log, y_log))
    
    # Logging coefficients 
    for feature, coef in zip(lasso_coefficients['Feature'], lasso_coefficients['Coefficient']):
        mlflow.log_metric(f"coef_{feature}", coef)  
    
    # Save the coefficients DataFrame as a CSV and log it as an artifact
    coefficients_file_path = "/tmp/lasso_coefficients.csv"
    lasso_coefficients.to_csv(coefficients_file_path, index=False)  # Saving DataFrame to CSV file
    
    # Log the CSV file as an artifact 
    mlflow.log_artifact(coefficients_file_path)  # Log the CSV file as an artifact
    
    # Logging the model 
    mlflow.log_param("model_type", "lasso") 
    mlflow.sklearn.log_model(lasso_model, "model", registered_model_name="Lasso_v1") 
    print("Model logged with ID:", run.info.run_id)

  from google.protobuf import service as _service
Registered model 'Lasso_v1' already exists. Creating a new version of this model...


Model logged with ID: 7888d892-67d3-43df-ad9d-28f953359645
🏃 View run orange_bridge_7333y9z7 at: https://eastus.api.azureml.ms/mlflow/v2.0/subscriptions/f09d7e13-c940-4565-8bb0-353e6c00bebb/resourceGroups/bug-bounty-hackathon/providers/Microsoft.MachineLearningServices/workspaces/ml-bug-bounty-hackathon/#/experiments/85b13fd6-eeea-457f-89a2-1ffd6fdefa69/runs/7888d892-67d3-43df-ad9d-28f953359645
🧪 View experiment at: https://eastus.api.azureml.ms/mlflow/v2.0/subscriptions/f09d7e13-c940-4565-8bb0-353e6c00bebb/resourceGroups/bug-bounty-hackathon/providers/Microsoft.MachineLearningServices/workspaces/ml-bug-bounty-hackathon/#/experiments/85b13fd6-eeea-457f-89a2-1ffd6fdefa69


In [10]:
# Get the URI of the model logged in MLflow
model_uri = mlflow.get_artifact_uri("log_reg_model")
print("MLflow Model URI: ", model_uri)

MLflow Model URI:  azureml://eastus.api.azureml.ms/mlflow/v2.0/subscriptions/f09d7e13-c940-4565-8bb0-353e6c00bebb/resourceGroups/bug-bounty-hackathon/providers/Microsoft.MachineLearningServices/workspaces/ML-Bug-Bounty-Hackathon/experiments/85b13fd6-eeea-457f-89a2-1ffd6fdefa69/runs/d949148e-5a14-42df-a91c-a05858544b03/artifacts/log_reg_model


In [11]:
from azureml.core import Workspace

# Load your Azure ML workspace (use a config file or pass connection details directly)
workspace = Workspace.from_config()  # This assumes you have a config file in the current directory

In [12]:
from azureml.core import Environment

# Define a custom environment for the model
myenv = Environment(name="myenv")

# Add required dependencies (e.g., mlflow, scikit-learn)
myenv.python.conda_dependencies.add_pip_package("mlflow")
myenv.python.conda_dependencies.add_pip_package("scikit-learn")
myenv.python.conda_dependencies.add_pip_package("azureml-sdk")

In [13]:
from azureml.core.model import InferenceConfig

# Define the inference configuration
inference_config = InferenceConfig(entry_script="score.py", environment=myenv)

In [14]:
from azureml.core import Model
from azureml.core.webservice import AciWebservice

# Register the model (if not already registered)
model = Model.register(workspace=workspace,
                       model_path= model_uri,
                       model_name="log_reg_model",
                       description="Log-log regression model for sales prediction")

# Define deployment configuration (e.g., ACI for testing)
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the model to ACI
service = Model.deploy(workspace=workspace,
                       name="sales-prediction-service",
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config)

# Wait for deployment to complete
service.wait_for_deployment(True)
print(service.state)


Error, provided model path "azureml:/eastus.api.azureml.ms/mlflow/v2.0/subscriptions/f09d7e13-c940-4565-8bb0-353e6c00bebb/resourceGroups/bug-bounty-hackathon/providers/Microsoft.MachineLearningServices/workspaces/ML-Bug-Bounty-Hackathon/experiments/85b13fd6-eeea-457f-89a2-1ffd6fdefa69/runs/d949148e-5a14-42df-a91c-a05858544b03/artifacts/log_reg_model" cannot be found



WebserviceException: WebserviceException:
	Message: Error, provided model path "azureml:/eastus.api.azureml.ms/mlflow/v2.0/subscriptions/f09d7e13-c940-4565-8bb0-353e6c00bebb/resourceGroups/bug-bounty-hackathon/providers/Microsoft.MachineLearningServices/workspaces/ML-Bug-Bounty-Hackathon/experiments/85b13fd6-eeea-457f-89a2-1ffd6fdefa69/runs/d949148e-5a14-42df-a91c-a05858544b03/artifacts/log_reg_model" cannot be found
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Error, provided model path \"azureml:/eastus.api.azureml.ms/mlflow/v2.0/subscriptions/f09d7e13-c940-4565-8bb0-353e6c00bebb/resourceGroups/bug-bounty-hackathon/providers/Microsoft.MachineLearningServices/workspaces/ML-Bug-Bounty-Hackathon/experiments/85b13fd6-eeea-457f-89a2-1ffd6fdefa69/runs/d949148e-5a14-42df-a91c-a05858544b03/artifacts/log_reg_model\" cannot be found"
    }
}