In [1]:
from azure.storage.filedatalake import DataLakeServiceClient
from dotenv import load_dotenv

import os

# Load environment variables from .env file
load_dotenv('notebooks/corrected/.env')

# Replace with your details
storage_account_name = "mldebugdevadls"
storage_account_key = os.getenv('AZURE_STORAGE_KEY')

# Connect to ADLS
service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net",
    credential=storage_account_key,
    api_version="2023-11-03"  # Use the correct supported API version
)

# List Containers
containers = service_client.list_file_systems()
for container in containers:
    print(container.name)


data


In [2]:
import os
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io

def read_csv_from_blob(storage_account_name, container_name, file_name, storage_account_key=None):
    """
    Read a CSV file from Azure Blob Storage using Python and return a Pandas DataFrame.

    :param storage_account_name: Azure storage account name.
    :param container_name: Blob container name.
    :param file_name: Name of the file in the container.
    :param storage_account_key: Storage account access key.
    :return: Pandas DataFrame.
    """
    
    if not storage_account_key:
        # Try to get the key from environment variables if not provided
        storage_account_key = os.environ.get('AZURE_STORAGE_KEY')
        
    if not storage_account_key:
        raise ValueError("Storage account key must be provided either as a parameter or as an environment variable 'AZURE_STORAGE_KEY'")
    
    try:
        # Create a connection string
        connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key};EndpointSuffix=core.windows.net"
        
        # Create the BlobServiceClient
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        
        # Get the container client
        container_client = blob_service_client.get_container_client(container_name)
        
        # Get the blob client
        blob_client = container_client.get_blob_client(file_name)
        
        # Download the blob content
        download_stream = blob_client.download_blob()
        
        # Convert the content to a DataFrame
        content = download_stream.readall()
        df = pd.read_csv(io.BytesIO(content))
        
        return df
    
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

# Example usage:
# df = read_csv_from_blob("your_storage_account", "optimalchannel", "city.csv", storage_account_key="your_key")
# print(df.head())

In [3]:
training_data = read_csv_from_blob(storage_account_name=storage_account_name,
                                      container_name="data", 
                                      file_name="PreProcessingfinaldata.csv",
                                      storage_account_key=storage_account_key)
training_data

Unnamed: 0.1,Unnamed: 0,week,sales_amount,base_price,final_price,promotion_type,facebook_spend,google ads_spend,influencer marketing_spend,instagram_spend,ooh_spend,print_spend,radio_spend,tv_spend,youtube_spend,facebook_ctr,google ads_ctr,influencer marketing_ctr,instagram_ctr,youtube_ctr
0,0,2023-01-01,13516527.77,101.830513,94.148539,Percentage Discount,1152.82,810.68,1122.12,707.48,11230.47,6214.43,6723.33,11311.42,703.13,3.047174,4.333516,2.471559,2.008197,2.116972
1,1,2023-01-08,95081753.02,101.830513,94.148539,Percentage Discount,7472.53,6973.85,7179.02,6834.33,11380.75,11069.6,9505.5,14004.01,6562.0,2.636847,2.732868,2.930657,3.354279,3.570124
2,2,2023-01-15,94804406.04,101.830513,94.148539,Percentage Discount,7204.29,7383.5,7185.08,6963.4,10270.3,8861.17,7836.04,14442.24,7318.19,2.679349,3.136116,2.879586,2.938546,3.776793
3,3,2023-01-22,94833974.28,101.830513,94.148539,Percentage Discount,7726.84,6522.2,7710.31,7479.41,8335.56,11601.91,6663.51,11917.89,7745.75,3.015955,3.836348,2.983655,2.858832,2.823088
4,4,2023-01-29,94806994.45,101.830513,94.148539,Percentage Discount,6987.22,6969.68,7094.25,7294.12,9575.21,7488.18,12158.0,6753.84,7044.42,2.744554,3.511152,2.338256,2.403631,3.01939
5,5,2023-02-05,94790845.6,102.034616,94.405438,Percentage Discount,6960.23,5657.36,6643.44,5966.77,12255.92,4697.08,8960.04,8881.09,6758.85,3.008475,3.412222,2.569407,2.736915,3.023114
6,6,2023-02-12,95334475.79,102.116257,94.508197,Percentage Discount,7015.83,6405.41,7177.54,7298.72,6348.65,3987.37,8863.17,11767.96,7386.22,3.062449,3.139439,3.242249,2.973659,2.551454
7,7,2023-02-19,94535798.85,102.116257,94.508197,Percentage Discount,7842.88,7183.16,6883.84,6863.05,9698.83,9648.21,13535.8,6648.19,6814.58,2.785007,2.028698,2.900965,2.731826,3.078021
8,8,2023-02-26,94881074.24,102.116257,94.508197,Percentage Discount,5973.36,8016.33,7318.09,7478.55,3847.06,12065.73,7152.14,13035.47,7083.74,3.104327,2.811252,3.173663,3.020989,2.734201
9,9,2023-03-05,94627083.0,102.083874,94.337977,Percentage Discount,7510.85,7886.7,6789.57,7380.14,10679.14,12388.97,14007.0,12165.05,6630.46,2.247487,2.868789,3.42511,2.904997,2.969358


In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Select features for scaling and encoding
numerical_features = ['sales_amount', 'base_price', 'final_price',
                      'facebook_spend', 'google ads_spend', 'influencer marketing_spend',
                      'instagram_spend', 'ooh_spend', 'print_spend', 'radio_spend',
                      'tv_spend', 'youtube_spend', 'facebook_ctr', 'google ads_ctr',
                      'influencer marketing_ctr', 'instagram_ctr', 'youtube_ctr']

categorical_features = ['promotion_type']

In [5]:
# Step 1: Scale numerical features
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(training_data[numerical_features])

# Convert scaled numerical data to a DataFrame
scaled_numerical_df = pd.DataFrame(scaled_numerical_data, columns=numerical_features)

# Step 2: Encode categorical features
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_data = encoder.fit_transform(training_data[categorical_features])


# Get column names for encoded categorical features
encoded_categorical_columns = encoder.get_feature_names_out(categorical_features)

# Convert encoded categorical data to a DataFrame
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoded_categorical_columns)

# Step 3: Combine scaled numerical data and encoded categorical data
processed_df = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)

# Display the processed DataFrame
processed_df

Unnamed: 0,sales_amount,base_price,final_price,facebook_spend,google ads_spend,influencer marketing_spend,instagram_spend,ooh_spend,print_spend,radio_spend,tv_spend,youtube_spend,facebook_ctr,google ads_ctr,influencer marketing_ctr,instagram_ctr,youtube_ctr,promotion_type_Buy One Get One Free,promotion_type_Percentage Discount
0,-7.20824,-1.888843,-1.079989,-6.138594,-6.036483,-6.258867,-6.41476,0.452694,-1.329475,-0.967204,0.269455,-6.2631,0.408029,2.567653,-1.354243,-2.366897,-2.237695,0.0,1.0
1,0.156877,-1.888843,-1.079989,0.671809,0.104967,0.342237,-0.124517,0.511463,0.407651,0.102746,1.192669,-0.319184,-0.548539,-0.479803,-0.315938,0.70132,1.459348,0.0,1.0
2,0.131833,-1.888843,-1.079989,0.382741,0.513173,0.348841,0.007995,0.077207,-0.382501,-0.539284,1.342926,0.447983,-0.449456,0.287937,-0.431442,-0.246289,1.985146,0.0,1.0
3,0.134503,-1.888843,-1.079989,0.945865,-0.345091,0.921263,0.537766,-0.6794,0.598106,-0.990209,0.477397,0.881749,0.335249,1.621101,-0.196076,-0.427986,-0.441227,0.0,1.0
4,0.132067,-1.888843,-1.079989,0.148817,0.100812,0.24985,0.347535,-0.194618,-0.873741,1.122828,-1.293211,0.170239,-0.29745,1.001963,-1.655725,-1.465558,0.058197,0.0,1.0
5,0.130609,0.330143,0.30281,0.119731,-1.206883,-0.241464,-1.015213,0.85371,-1.872366,-0.107023,-0.563837,-0.119476,0.317811,0.813612,-1.132949,-0.705881,0.06767,0.0,1.0
6,0.179697,1.217737,0.855929,0.179648,-0.46147,0.340624,0.352258,-1.456408,-2.126292,-0.144277,0.42599,0.517001,0.443638,0.294262,0.388764,-0.166255,-1.132306,0.0,1.0
7,0.107579,1.217737,0.855929,1.070914,0.313539,0.020535,-0.095031,-0.146275,-0.100906,1.652694,-1.329436,-0.062937,-0.203144,-1.820465,-0.38309,-0.71748,0.207362,0.0,1.0
8,0.138756,1.217737,0.855929,-0.943764,1.143773,0.493802,0.536883,-2.434689,0.764055,-0.802294,0.860584,0.21013,0.541265,-0.330569,0.23365,-0.05837,-0.667369,0.0,1.0
9,0.115822,0.865677,-0.060313,0.713104,1.0146,-0.082205,0.435849,0.237089,0.879707,1.833905,0.562141,-0.24973,-1.456228,-0.221025,0.802325,-0.322759,-0.069092,0.0,1.0


In [6]:
target_column = 'sales_amount'
X = processed_df.drop(columns=[target_column])
y = processed_df[target_column]

In [7]:
from sklearn.linear_model import LinearRegression

# Separate the features (X) and target variable (y)
# Assuming 'sales_amount' is the target variable, and rest are features
target_column = 'sales_amount'
X = processed_df.drop(columns=[target_column])
y = processed_df[target_column]

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X, y)

# Extract the coefficients of the model
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
}).sort_values(by='Coefficient', ascending=False)

coefficients



Unnamed: 0,Feature,Coefficient
5,instagram_spend,0.340402
10,youtube_spend,0.245266
2,facebook_spend,0.194287
4,influencer marketing_spend,0.157081
3,google ads_spend,0.140632
11,facebook_ctr,0.110103
6,ooh_spend,0.051393
14,instagram_ctr,0.034966
8,radio_spend,0.01218
17,promotion_type_Percentage Discount,0.011617


In [8]:
import statsmodels.api as sm

X = sm.add_constant(X)

# Build the additive linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the model
model.summary()

0,1,2,3
Dep. Variable:,sales_amount,R-squared:,0.952
Model:,OLS,Adj. R-squared:,0.929
Method:,Least Squares,F-statistic:,41.01
Date:,"Mon, 26 May 2025",Prob (F-statistic):,4.05e-18
Time:,15:46:23,Log-Likelihood:,5.3736
No. Observations:,53,AIC:,25.25
Df Residuals:,35,BIC:,60.72
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0028,0.028,-0.100,0.921,-0.059,0.054
base_price,-0.0177,0.055,-0.320,0.751,-0.130,0.095
final_price,0.0083,0.064,0.129,0.898,-0.122,0.138
facebook_spend,0.1943,0.068,2.857,0.007,0.056,0.332
google ads_spend,0.1406,0.084,1.672,0.104,-0.030,0.311
influencer marketing_spend,0.1571,0.084,1.874,0.069,-0.013,0.327
instagram_spend,0.3404,0.087,3.918,0.000,0.164,0.517
ooh_spend,0.0514,0.040,1.282,0.208,-0.030,0.133
print_spend,0.0012,0.042,0.029,0.977,-0.083,0.086

0,1,2,3
Omnibus:,1.524,Durbin-Watson:,1.818
Prob(Omnibus):,0.467,Jarque-Bera (JB):,0.758
Skew:,-0.103,Prob(JB):,0.684
Kurtosis:,3.549,Cond. No.,3.34e+16


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, root_mean_squared_error

# 1. Load your preprocessed DataFrame
df = training_data.copy()   # <-- replace with your actual DataFrame

# 2. Define target & feature sets
TARGET = "sales_amount"
NUM_FEATS = [
    'base_price', 'final_price',
    'facebook_spend', 'google ads_spend', 'influencer marketing_spend',
    'instagram_spend', 'ooh_spend', 'print_spend', 'radio_spend',
    'tv_spend', 'youtube_spend',
    'facebook_ctr', 'google ads_ctr', 'influencer marketing_ctr',
    'instagram_ctr', 'youtube_ctr'
]
CAT_FEATS = ["promotion_type"]

X = df[NUM_FEATS + CAT_FEATS]
y = df[TARGET]

# 3. Train/test split (no leakage)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Preprocessing transformer
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), NUM_FEATS),
        ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), CAT_FEATS),
    ]
)

# 5. Pipeline with RidgeCV (alphas chosen automatically by CV)
alphas = [0.1, 1.0, 10.0, 100.0, 150, 200.0, 250.0, 300.0, 350.0, 400.0, 450.0, 500.0]
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("reg", RidgeCV(alphas=alphas, cv=5, scoring="r2"))
])

# 6. Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# 7. Evaluate on train
y_pred_train = pipeline.predict(X_train)
r2_train = r2_score(y_train, y_pred_train)
rmse_train = root_mean_squared_error(y_train, y_pred_train)

# 8. Evaluate on test
y_pred_test = pipeline.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)

print(f"Best α found by CV: {pipeline.named_steps['reg'].alpha_}")
print(f"Train R²: {r2_train:.3f}, Train RMSE: {rmse_train:,.2f}")
print(f"Test  R²: {r2_test:.3f}, Test  RMSE: {rmse_test:,.2f}")

# 9. (Optional) Inspect regularized coefficients
import numpy as np

# get feature names in order
ohe = pipeline.named_steps["preproc"].named_transformers_["cat"]
ohe_cols = ohe.get_feature_names_out(CAT_FEATS)
feat_names = NUM_FEATS + list(ohe_cols)

coefs = pipeline.named_steps["reg"].coef_
coef_df = pd.DataFrame({
    "feature": feat_names,
    "coefficient": coefs
}).sort_values("coefficient", ascending=False)

print("\nTop coefficients after regularization:")
print(coef_df.head(10))


Best α found by CV: 300.0
Train R²: 0.616, Train RMSE: 7,684,912.82
Test  R²: -66.849, Test  RMSE: 2,440,132.51

Top coefficients after regularization:
                       feature    coefficient
5              instagram_spend  960434.464196
2               facebook_spend  934634.745154
4   influencer marketing_spend  926109.502940
10               youtube_spend  925058.755433
3             google ads_spend  898572.176478
14               instagram_ctr  350219.012240
15                 youtube_ctr  348013.657663
0                   base_price  270442.517397
7                  print_spend  212229.861095
13    influencer marketing_ctr  205822.169468


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV, ElasticNetCV
from sklearn.metrics import r2_score, root_mean_squared_error

# 1. Prepare feature lists & data (reuse from before)
df = training_data.copy()
TARGET = "sales_amount"
NUM_FEATS = [
    'base_price', 'final_price',
    'facebook_spend', 'google ads_spend', 'influencer marketing_spend',
    'instagram_spend', 'ooh_spend', 'print_spend', 'radio_spend',
    'tv_spend', 'youtube_spend',
    'facebook_ctr', 'google ads_ctr', 'influencer marketing_ctr',
    'instagram_ctr', 'youtube_ctr'
]
CAT_FEATS = ["promotion_type"]

X = df[NUM_FEATS + CAT_FEATS]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Shared preprocessor
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), NUM_FEATS),
    ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), CAT_FEATS),
])

# 3a. LassoCV pipeline (L1)
lasso_pipeline = Pipeline([
  ("preproc", preprocessor),
  ("lasso", LassoCV(
    alphas=[0.001, 0.01, 0.1, 1, 10, 50, 100, 200, 500, 1000, 2000, 5000],
    cv=5,
    random_state=42,
    max_iter=10000000
  ))
])
lasso_pipeline.fit(X_train, y_train)

# Evaluate Lasso
y_pred_train = lasso_pipeline.predict(X_train)
y_pred_test  = lasso_pipeline.predict(X_test)
print("=== LassoCV ===")
print(f"Best α: {lasso_pipeline.named_steps['lasso'].alpha_:.4f}")
print(f"Train  R²: {r2_score(y_train, y_pred_train):.3f},  RMSE: {root_mean_squared_error(y_train, y_pred_train):,.0f}")
print(f"Test   R²: {r2_score(y_test,  y_pred_test ):.3f},  RMSE: {root_mean_squared_error(y_test,  y_pred_test):,.0f}")

# 3b. ElasticNetCV pipeline (L1 + L2 mix)
elastic_pipeline = Pipeline([
  ("preproc", preprocessor),
  ("enet", ElasticNetCV(
    alphas=[0.001, 0.01, 0.1, 1, 10, 50, 100, 200, 500],
    l1_ratio=[0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95],
    cv=5,
    random_state=42,
    max_iter=10000000
  ))
])
elastic_pipeline.fit(X_train, y_train)

# Evaluate ElasticNet
y_pred_train = elastic_pipeline.predict(X_train)
y_pred_test  = elastic_pipeline.predict(X_test)
print("\n=== ElasticNetCV ===")
print(f"Best α: {elastic_pipeline.named_steps['enet'].alpha_:.4f}, l1_ratio: {elastic_pipeline.named_steps['enet'].l1_ratio_:.2f}")
print(f"Train  R²: {r2_score(y_train, y_pred_train):.3f},  RMSE: {root_mean_squared_error(y_train, y_pred_train):,.0f}")
print(f"Test   R²: {r2_score(y_test,  y_pred_test ):.3f},  RMSE: {root_mean_squared_error(y_test,  y_pred_test):,.0f}")

# 4. Inspecting coefficients (example for ElasticNet)
ohe = elastic_pipeline.named_steps["preproc"].named_transformers_["cat"]
ohe_cols = ohe.get_feature_names_out(CAT_FEATS)
feat_names = NUM_FEATS + list(ohe_cols)
coefs = elastic_pipeline.named_steps["enet"].coef_
coef_df = pd.DataFrame({"feature": feat_names, "coef": coefs})\
            .sort_values("coef", ascending=False)
print("\nTop ElasticNet coefficients:")
print(coef_df.head(10))


=== LassoCV ===
Best α: 0.0010
Train  R²: 0.970,  RMSE: 2,149,613
Test   R²: -145.811,  RMSE: 3,589,392

=== ElasticNetCV ===
Best α: 10.0000, l1_ratio: 0.30
Train  R²: 0.622,  RMSE: 7,630,018
Test   R²: -67.511,  RMSE: 2,452,011

Top ElasticNet coefficients:
                       feature           coef
5              instagram_spend  972383.764887
2               facebook_spend  946025.068599
4   influencer marketing_spend  937359.843176
10               youtube_spend  936366.793938
3             google ads_spend  909271.365845
14               instagram_ctr  353833.275457
15                 youtube_ctr  351996.746094
0                   base_price  273004.900276
7                  print_spend  214391.308035
13    influencer marketing_ctr  207938.529046


In [11]:
# BEST PERFORMER
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_squared_error

# 1. Load and prepare data
df = training_data.copy().sort_values('week')  # Ensure chronological order

# 2. Define feature lists
target = "sales_amount"
numerical_features = [
    'base_price', 'final_price',
    'facebook_spend', 'google ads_spend', 
    'influencer marketing_spend', 'instagram_spend',
    'ooh_spend', 'print_spend', 'radio_spend',
    'tv_spend', 'youtube_spend',
    'facebook_ctr', 'google ads_ctr',
    'influencer marketing_ctr', 'instagram_ctr',
    'youtube_ctr'
]
categorical_features = ["promotion_type"]

# 3. Time-based train-test split
test_size = int(len(df) * 0.2)  # Last 20% as test
X_train = df.iloc[:-test_size][numerical_features + categorical_features]
y_train = df.iloc[:-test_size][target]
X_test = df.iloc[-test_size:][numerical_features + categorical_features]
y_test = df.iloc[-test_size:][target]

# 4. Build preprocessing pipeline with proper categorical handling
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(sparse_output=False, 
                            handle_unknown="ignore", 
                            drop='first'),  # Avoid dummy trap
         categorical_features),
    ],
    remainder="drop"
)

# 5. Create pipeline with regularization
model_pipeline = Pipeline(steps=[
    ("preproc", preprocessor),
    ("reg", RidgeCV(alphas=np.logspace(-3, 3, 50),  # Regularization strengths
                   cv=TimeSeriesSplit(n_splits=3))  # Time-series cross-validation
    )
])

# 6. Fit model
model_pipeline.fit(X_train, y_train)

# 7. Evaluate performance
def evaluate_model(X, y, label):
    X_trans = model_pipeline.named_steps["preproc"].transform(X)
    y_pred = model_pipeline.named_steps["reg"].predict(X_trans)
    
    
    r2 = r2_score(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print(f"{label} R²: {r2:.3f}")
    print(f"{label} RMSE: {rmse:,.2f}\n")
    return y_pred

print("=== Training Performance ===")
_ = evaluate_model(X_train, y_train, "Train")

print("=== Test Performance ===")
y_test_pred = evaluate_model(X_test, y_test, "Test")

# Calculate MAPE (Mean Absolute Percentage Error)
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Avoid division by zero
    nonzero_idx = y_true != 0
    return np.mean(np.abs((y_true[nonzero_idx] - y_pred[nonzero_idx]) / y_true[nonzero_idx])) * 100

print(f"Train MAPE: {mean_absolute_percentage_error(y_train, model_pipeline.predict(X_train)):.2f}%")
print(f"Test  MAPE: {mean_absolute_percentage_error(y_test, y_test_pred):.2f}%")

# 8. Analyze coefficients
feature_names = numerical_features.copy()
cat_features = model_pipeline.named_steps["preproc"].named_transformers_["cat"].get_feature_names_out(categorical_features)
feature_names.extend(cat_features)

coefficients = pd.DataFrame({
    "feature": feature_names,
    "coefficient": model_pipeline.named_steps["reg"].coef_
}).sort_values("coefficient", key=abs, ascending=False)

print("Top 10 Features by Absolute Coefficient:")
print(coefficients.head(10))

print("\nOptimal alpha:", model_pipeline.named_steps["reg"].alpha_)

=== Training Performance ===
Train R²: 0.831
Train RMSE: 5,034,940.12

=== Test Performance ===
Test R²: -56.121
Test RMSE: 2,014,263.43

Train MAPE: 6.84%
Test  MAPE: 1.82%
Top 10 Features by Absolute Coefficient:
                       feature   coefficient
5              instagram_spend  1.634533e+06
10               youtube_spend  1.585825e+06
4   influencer marketing_spend  1.541603e+06
2               facebook_spend  1.524847e+06
3             google ads_spend  1.388846e+06
14               instagram_ctr  5.259657e+05
15                 youtube_ctr  5.074430e+05
12              google ads_ctr -5.009230e+05
0                   base_price  3.851755e+05
8                  radio_spend  2.828288e+05

Optimal alpha: 104.81131341546853


## Detecting Multi-collinearity

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# a) Split out just the raw numeric spends from your existing train/test split
df = training_data.copy()
NUM_SPENDS = [
    'facebook_spend','google ads_spend','influencer marketing_spend',
    'instagram_spend','ooh_spend','print_spend','radio_spend','tv_spend','youtube_spend'
]

X_train, X_test = train_test_split(df[NUM_SPENDS], test_size=0.2, random_state=42)

# b) Scale them (VIF needs no leakage concerns here)
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=NUM_SPENDS)

# c) Compute VIF for each spend variable
vif = pd.DataFrame({
    'feature': NUM_SPENDS,
    'VIF': [variance_inflation_factor(X_scaled.values, i) 
            for i in range(len(NUM_SPENDS))]
})
print(vif.sort_values('VIF', ascending=False))


                      feature       VIF
2  influencer marketing_spend  5.164607
3             instagram_spend  5.117404
1            google ads_spend  5.067880
8               youtube_spend  4.641539
0              facebook_spend  4.270088
6                 radio_spend  1.369327
5                 print_spend  1.221520
4                   ooh_spend  1.144587
7                    tv_spend  1.112809


In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, root_mean_squared_error

# 1. Create agg features in the full df
df = training_data.copy()
df['offline_spend'] = (
    df['tv_spend'] + df['radio_spend'] + df['ooh_spend'] + df['print_spend']
)
df['online_spend'] = (
    df['facebook_spend'] + df['google ads_spend'] +
    df['instagram_spend'] + df['influencer marketing_spend'] +
    df['youtube_spend']
)

# 2. Define new feature lists
TARGET = 'sales_amount'
NUM_FEATS_AGG = [
    # 'base_price','final_price',
    'offline_spend','online_spend',
    # 'facebook_ctr','google ads_ctr',
    # 'instagram_ctr','influencer marketing_ctr','youtube_ctr'
]
CAT_FEATS = ['promotion_type']

X = df[NUM_FEATS_AGG + CAT_FEATS]
y = df[TARGET]

# 3. Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Build pipeline
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), NUM_FEATS_AGG),
    ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), CAT_FEATS)
])
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("reg", RidgeCV(alphas=[0.1,1,10,100], cv=5))
])

# 5. Fit & evaluate
pipeline.fit(X_train, y_train)
for split, (X_, y_, name) in enumerate([
    (X_train, y_train, "Train"),
    (X_test,  y_test,  "Test")
]):
    y_pred = pipeline.predict(X_)
    print(f"{name}  R²: {r2_score(y_, y_pred):.3f} | RMSE: {root_mean_squared_error(y_, y_pred):,.0f}")
    
    # add MAPE
    mape = np.mean(np.abs((y_ - y_pred) / y_)) * 100
    print(f"{name}  MAPE: {mape:.2f}%\n")

# 6. Inspect new coefs
feat_names = NUM_FEATS_AGG + list(
    pipeline.named_steps["preproc"]
            .named_transformers_["cat"]
            .get_feature_names_out(CAT_FEATS)
)
coefs = pipeline.named_steps["reg"].coef_
coef_df = pd.DataFrame({"feature": feat_names, "coef": coefs})\
            .sort_values("coef", ascending=False)
print("\nCoefficients on aggregated channels:")
print(coef_df.loc[coef_df['feature'].isin(['online_spend','offline_spend'])])


Train  R²: 0.489 | RMSE: 8,870,029
Train  MAPE: 11.45%

Test  R²: -50.110 | RMSE: 2,117,859
Test  MAPE: 1.90%


Coefficients on aggregated channels:
         feature          coef
1   online_spend  3.576023e+06
0  offline_spend  3.934703e+05


In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_percentage_error

# --- 1) Adstock + Saturation Functions ---

def compute_adstock(spend, decay):
    """Geometric adstock: A_t = spend_t + decay * A_{t-1}"""
    ad = np.zeros(len(spend))
    ad[0] = spend.iloc[0]
    for t in range(1, len(spend)):
        ad[t] = spend.iloc[t] + decay * ad[t-1]
    return pd.Series(ad, index=spend.index)

def hill_saturation(adstock, alpha=1, k=None):
    """Hill curve: f(A) = A^alpha / (A^alpha + k^alpha)"""
    if k is None:
        k = np.median(adstock)
    num = adstock ** alpha
    den = num + (k ** alpha)
    return num / den

# --- 2) Prepare Data & Features ---

CHANNELS = [
    "tv_spend", "radio_spend",
    "facebook_spend", "google ads_spend",
    "instagram_spend", "influencer marketing_spend",
    "youtube_spend"
]
TARGET = "sales_amount"

df = training_data.copy().sort_values("week").set_index("week")
train, test = train_test_split(df, test_size=0.2, shuffle=False)

for decay in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    print(f"Decay: {decay:.1f}")
    for alpha in [1.0, 2.0, 3.0, 4.0, 5.0]:
        X_train = pd.DataFrame(index=train.index)
        X_test  = pd.DataFrame(index=test.index)
        # build adstock + saturated features
        for ch in CHANNELS:
            ad_t = compute_adstock(train[ch], decay)
            ad_v = compute_adstock(test[ch],  decay)
            X_train[f"{ch}_adstock"]    = ad_t
            X_train[f"{ch}_saturated"]  = hill_saturation(ad_t, alpha=alpha)
            X_test [f"{ch}_adstock"]    = ad_v
            X_test [f"{ch}_saturated"]  = hill_saturation(ad_v, alpha=alpha)
        # add prices and CTRs
        extra = [c for c in train.columns if "price" in c or "ctr" in c]
        X_train = pd.concat([X_train, train[extra]], axis=1)
        X_test  = pd.concat([X_test,  test[extra]],  axis=1)
        # one-hot promotion_type
        X_train = pd.get_dummies(X_train.join(train["promotion_type"]), drop_first=True)
        X_test  = pd.get_dummies(X_test.join(test ["promotion_type"]),  drop_first=True)
        X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)
        y_train = train[TARGET]
        y_test  = test[TARGET]

        # --- 3) Fit RidgeCV ---
        alphas = [0.1, 1, 10, 100]
        model = RidgeCV(alphas=alphas, cv=5, scoring="r2")
        model.fit(X_train, y_train)

        # --- 4) Evaluate with RMSE & MAPE ---
        for name, (X_, y_) in [("Train", (X_train, y_train)), ("Test", (X_test, y_test))]:
            y_pred = model.predict(X_)
            r2  = r2_score(y_, y_pred)
            rmse = root_mean_squared_error(y_, y_pred)
            mape = mean_absolute_percentage_error(y_, y_pred) * 100
            print(f"{name} R²: {r2:.3f} | RMSE: {rmse:,.0f} | MAPE: {mape:.2f}%")

        print("Best α:", model.alpha_)
        print("-" * 40)


Decay: 0.1
Train R²: 0.956 | RMSE: 2,606,794 | MAPE: 2.70%
Test R²: -200.022 | RMSE: 3,900,499 | MAPE: 2.99%
Best α: 10.0
----------------------------------------
Train R²: 0.956 | RMSE: 2,608,302 | MAPE: 2.70%
Test R²: -200.239 | RMSE: 3,902,600 | MAPE: 2.99%
Best α: 10.0
----------------------------------------
Train R²: 0.956 | RMSE: 2,606,383 | MAPE: 2.70%
Test R²: -199.893 | RMSE: 3,899,242 | MAPE: 2.99%
Best α: 10.0
----------------------------------------
Train R²: 0.956 | RMSE: 2,598,813 | MAPE: 2.69%
Test R²: -198.679 | RMSE: 3,887,445 | MAPE: 2.98%
Best α: 10.0
----------------------------------------
Train R²: 0.987 | RMSE: 1,423,482 | MAPE: 1.47%
Test R²: -70.890 | RMSE: 2,332,549 | MAPE: 2.00%
Best α: 0.1
----------------------------------------
Decay: 0.2
Train R²: 0.936 | RMSE: 3,147,410 | MAPE: 3.00%
Test R²: -505.323 | RMSE: 6,190,303 | MAPE: 3.93%
Best α: 100.0
----------------------------------------
Train R²: 0.936 | RMSE: 3,147,665 | MAPE: 3.00%
Test R²: -505.392 |