## Loading data

In [1]:
from azure.storage.filedatalake import DataLakeServiceClient

from dotenv import load_dotenv
import os

# Load environment variables (including AZURE_STORAGE_KEY) from the .env file
load_dotenv('notebooks/corrected/.env')

# Replace with your details
storage_account_name = "mldebugdevadls"
storage_account_key = os.getenv('AZURE_STORAGE_KEY')

# Fail fast with an actionable message: without this check a missing key is
# passed on as credential=None and only surfaces later as an opaque
# authentication error from the service.
if not storage_account_key:
    raise ValueError(
        "Environment variable 'AZURE_STORAGE_KEY' is not set; define it in "
        "notebooks/corrected/.env or export it before running this notebook."
    )

# Connect to ADLS
service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net",
    credential=storage_account_key,
    api_version="2023-11-03"  # Use the correct supported API version
)

# List the containers (file systems) visible to this account as a connectivity check
containers = service_client.list_file_systems()
for container in containers:
    print(container.name)


data


In [2]:
# Container that holds the input CSV files (it is the one listed by the
# connectivity check above); replace with your own container name.
container_name = 'data'

In [3]:
import os
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io

def read_csv_from_blob(storage_account_name, container_name, file_name, storage_account_key=None):
    """
    Download a CSV blob from Azure Blob Storage and parse it into a Pandas DataFrame.

    The service client is used as a context manager so that its underlying
    connection resources are released as soon as the download has finished.

    :param storage_account_name: Azure storage account name.
    :param container_name: Blob container name.
    :param file_name: Name of the file in the container.
    :param storage_account_key: Storage account access key. When omitted or empty,
        the 'AZURE_STORAGE_KEY' environment variable is used instead.
    :return: Pandas DataFrame, or None when connecting, downloading or parsing
        fails (the error is printed rather than raised).
    :raises ValueError: If no key is passed and 'AZURE_STORAGE_KEY' is not set.
    """
    if not storage_account_key:
        # Try to get the key from environment variables if not provided
        storage_account_key = os.environ.get('AZURE_STORAGE_KEY')

    if not storage_account_key:
        raise ValueError("Storage account key must be provided either as a parameter or as an environment variable 'AZURE_STORAGE_KEY'")

    try:
        connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key};EndpointSuffix=core.windows.net"

        # Closing the client on exit avoids leaving its HTTP session open
        # for every file that gets loaded.
        with BlobServiceClient.from_connection_string(connection_string) as blob_service_client:
            container_client = blob_service_client.get_container_client(container_name)
            blob_client = container_client.get_blob_client(file_name)

            # Read the whole blob into memory while the client is still open
            content = blob_client.download_blob().readall()

        return pd.read_csv(io.BytesIO(content))

    except Exception as e:
        # Best-effort contract kept for existing callers: report and return None.
        print(f"Error reading CSV file: {e}")
        return None

In [4]:
# Connection details shared by every download below
blob_source = dict(
    storage_account_name=storage_account_name,
    container_name=container_name,
    storage_account_key=storage_account_key,
)

online_marketing = read_csv_from_blob(file_name="OnlineMarketingData.csv", **blob_source)
offline_marketing = read_csv_from_blob(file_name="OfflineMarketingData.csv", **blob_source)
sales = read_csv_from_blob(file_name="SalesData.csv", **blob_source)
price = read_csv_from_blob(file_name="PricingData.csv", **blob_source)

# read_csv_from_blob reports a failed download by printing and returning None.
# Stop here with a clear message instead of hitting an AttributeError on None
# in a later cell.
failed_loads = [
    label
    for label, frame in {
        "OnlineMarketingData.csv": online_marketing,
        "OfflineMarketingData.csv": offline_marketing,
        "SalesData.csv": sales,
        "PricingData.csv": price,
    }.items()
    if frame is None
]
if failed_loads:
    raise RuntimeError(f"Could not load the following files from blob storage: {failed_loads}")

## Media Data Processing
* Online Marketing data 
* Offline Marketing data

In [5]:
online_marketing.head()

Unnamed: 0,date,channel,spend,impressions,clicks
0,2023-01-01,Facebook,1152.82,20117,613
1,2023-01-01,Instagram,707.48,9760,196
2,2023-01-01,YouTube,703.13,11148,236
3,2023-01-01,Google Ads,810.68,12761,553
4,2023-01-01,Influencer Marketing,1122.12,22415,554


In [6]:

# Snapshot the four freshly loaded frames so later cells can restore them
# without downloading the files again.
(
    online_marketing_backup,
    offline_marketing_backup,
    sales_backup,
    price_backup,
) = (frame.copy() for frame in (online_marketing, offline_marketing, sales, price))

In [7]:

# Reset the working frames to fresh copies of the snapshots; re-run this cell
# to undo the in-place changes made by the processing cells below.
(
    online_marketing,
    offline_marketing,
    sales,
    price,
) = (
    snapshot.copy()
    for snapshot in (online_marketing_backup, offline_marketing_backup, sales_backup, price_backup)
)

In [8]:
# Parse the daily dates, then map each day to the start of its week.
# 'W-SAT' periods end on Saturday, so every week starts on a Sunday.
online_marketing['date'] = pd.to_datetime(online_marketing['date'])
online_marketing['week'] = pd.to_datetime(
    online_marketing['date'].dt.to_period('W-SAT').dt.start_time,
    format='%Y-%m-%d',
)

# Roll the daily rows up to one row per (week, channel), summing the remaining columns
media_weekly = (
    online_marketing
    .drop(columns=['date'])
    .groupby(['week', 'channel'])
    .sum()
    .reset_index()
)
media_weekly.head(25)

Unnamed: 0,week,channel,spend,impressions,clicks
0,2023-01-01,Facebook,7658.48,122072,3445
1,2023-01-01,Google Ads,6838.9,111271,3073
2,2023-01-01,Influencer Marketing,6900.8,101912,2904
3,2023-01-01,Instagram,6542.48,85078,2559
4,2023-01-01,YouTube,6460.78,109825,3735
5,2023-01-08,Facebook,7392.16,110288,2939
6,2023-01-08,Google Ads,7631.45,114886,3744
7,2023-01-08,Influencer Marketing,7247.63,108291,2875
8,2023-01-08,Instagram,6773.16,105589,3456
9,2023-01-08,YouTube,6754.75,99524,3643


In [9]:
offline_marketing

Unnamed: 0,week,channel,spend
0,2023-01-01,TV,11311.42
1,2023-01-01,Radio,6723.33
2,2023-01-01,Print,6214.43
3,2023-01-01,OOH,11230.47
4,2023-01-08,TV,14004.01
...,...,...,...
207,2023-12-24,OOH,14623.57
208,2023-12-31,TV,12565.47
209,2023-12-31,Radio,11167.25
210,2023-12-31,Print,9546.23


In [10]:
# Convert 'week' to datetime so it matches the dtype of media_weekly['week']
offline_marketing['week'] = pd.to_datetime(offline_marketing['week'])

# Outer join keeps every (week, channel) row from both sources; the two
# 'spend' columns are told apart by the suffixes.
media = pd.merge(
    media_weekly,
    offline_marketing,
    on=['week', 'channel'],
    how='outer',
    suffixes=('_online', '_offline')
)

# Calculate CTR (Click-Through Rate) in percent.
# Rows without a positive impression count (offline channels after the outer
# join, or zero impressions) get a CTR of 0. Masking on impressions also
# catches clicks / 0 == inf, which fillna alone would let through.
media['CTR (%)'] = (
    (media['clicks'] / media['impressions'] * 100)
    .where(media['impressions'] > 0)
    .fillna(0)
)

media

Unnamed: 0,week,channel,spend_online,impressions,clicks,spend_offline,CTR (%)
0,2023-01-01,Facebook,7658.48,122072.0,3445.0,,2.822105
1,2023-01-01,Google Ads,6838.90,111271.0,3073.0,,2.761726
2,2023-01-01,Influencer Marketing,6900.80,101912.0,2904.0,,2.849517
3,2023-01-01,Instagram,6542.48,85078.0,2559.0,,3.007828
4,2023-01-01,OOH,,,,11230.47,0.000000
...,...,...,...,...,...,...,...
472,2023-12-31,OOH,,,,9881.73,0.000000
473,2023-12-31,Print,,,,9546.23,0.000000
474,2023-12-31,Radio,,,,11167.25,0.000000
475,2023-12-31,TV,,,,12565.47,0.000000


## Sales Data Preprocessing
* Sales data 
* Pricing data

In [11]:
sales.head()

Unnamed: 0,date,city,state,store_id,product_id,sales_quantity
0,2023-01-01,Mumbai,Maharashtra,MUM_01,P001,112
1,2023-01-01,Mumbai,Maharashtra,MUM_01,P002,100
2,2023-01-01,Mumbai,Maharashtra,MUM_01,P003,126
3,2023-01-01,Mumbai,Maharashtra,MUM_01,P004,108
4,2023-01-01,Mumbai,Maharashtra,MUM_01,P005,129


In [12]:
price.head()

Unnamed: 0,date,city,state,store_id,product_id,base_price,promotional_discount,promotion_type,final_price
0,01-01-2023,Mumbai,Maharashtra,MUM_01,P001,134.35,6.72,Percentage Discount,127.63
1,01-01-2023,Mumbai,Maharashtra,MUM_02,P001,135.09,13.51,Percentage Discount,121.58
2,01-01-2023,Mumbai,Maharashtra,MUM_03,P001,136.33,0.0,,136.33
3,01-01-2023,Mumbai,Maharashtra,MUM_04,P001,140.32,0.0,,140.32
4,01-01-2023,Mumbai,Maharashtra,MUM_05,P001,142.32,14.23,Buy One Get One Free,128.09


In [13]:
# The two files use different date layouts: sales is YYYY-MM-DD, pricing is DD-MM-YYYY.
# An explicit format fully determines the parsing, so no dayfirst hint or second
# pass is needed; both columns end up as datetime64.
sales['date'] = pd.to_datetime(sales['date'], format='%Y-%m-%d')
price['date'] = pd.to_datetime(price['date'], format='%d-%m-%Y')

In [14]:
sales

Unnamed: 0,date,city,state,store_id,product_id,sales_quantity
0,2023-01-01,Mumbai,Maharashtra,MUM_01,P001,112
1,2023-01-01,Mumbai,Maharashtra,MUM_01,P002,100
2,2023-01-01,Mumbai,Maharashtra,MUM_01,P003,126
3,2023-01-01,Mumbai,Maharashtra,MUM_01,P004,108
4,2023-01-01,Mumbai,Maharashtra,MUM_01,P005,129
...,...,...,...,...,...,...
554795,2023-12-31,Guwahati,Assam,GUW_05,P004,71
554796,2023-12-31,Guwahati,Assam,GUW_05,P005,100
554797,2023-12-31,Guwahati,Assam,GUW_05,P006,96
554798,2023-12-31,Guwahati,Assam,GUW_05,P007,136


In [15]:
price

Unnamed: 0,date,city,state,store_id,product_id,base_price,promotional_discount,promotion_type,final_price
0,2023-01-01,Mumbai,Maharashtra,MUM_01,P001,134.35,6.72,Percentage Discount,127.63
1,2023-01-01,Mumbai,Maharashtra,MUM_02,P001,135.09,13.51,Percentage Discount,121.58
2,2023-01-01,Mumbai,Maharashtra,MUM_03,P001,136.33,0.00,,136.33
3,2023-01-01,Mumbai,Maharashtra,MUM_04,P001,140.32,0.00,,140.32
4,2023-01-01,Mumbai,Maharashtra,MUM_05,P001,142.32,14.23,Buy One Get One Free,128.09
...,...,...,...,...,...,...,...,...,...
43315,2023-12-01,Guwahati,Assam,GUW_01,P019,120.99,6.05,Percentage Discount,114.94
43316,2023-12-01,Guwahati,Assam,GUW_02,P019,119.03,11.90,Percentage Discount,107.13
43317,2023-12-01,Guwahati,Assam,GUW_03,P019,119.97,12.00,Buy One Get One Free,107.97
43318,2023-12-01,Guwahati,Assam,GUW_04,P019,120.34,6.02,,114.32


In [16]:

# Tag every pricing row with the calendar month (monthly Period) of its date
price['month'] = price['date'].dt.to_period('M')
price

Unnamed: 0,date,city,state,store_id,product_id,base_price,promotional_discount,promotion_type,final_price,month
0,2023-01-01,Mumbai,Maharashtra,MUM_01,P001,134.35,6.72,Percentage Discount,127.63,2023-01
1,2023-01-01,Mumbai,Maharashtra,MUM_02,P001,135.09,13.51,Percentage Discount,121.58,2023-01
2,2023-01-01,Mumbai,Maharashtra,MUM_03,P001,136.33,0.00,,136.33,2023-01
3,2023-01-01,Mumbai,Maharashtra,MUM_04,P001,140.32,0.00,,140.32,2023-01
4,2023-01-01,Mumbai,Maharashtra,MUM_05,P001,142.32,14.23,Buy One Get One Free,128.09,2023-01
...,...,...,...,...,...,...,...,...,...,...
43315,2023-12-01,Guwahati,Assam,GUW_01,P019,120.99,6.05,Percentage Discount,114.94,2023-12
43316,2023-12-01,Guwahati,Assam,GUW_02,P019,119.03,11.90,Percentage Discount,107.13,2023-12
43317,2023-12-01,Guwahati,Assam,GUW_03,P019,119.97,12.00,Buy One Get One Free,107.97,2023-12
43318,2023-12-01,Guwahati,Assam,GUW_04,P019,120.34,6.02,,114.32,2023-12


In [17]:
# Build one row per Sunday, from the start of the first priced month to the
# end of the last one.
price_months = price['date'].dt.to_period('M')
week_starts = pd.date_range(
    start=price_months.min().start_time,
    end=price_months.max().end_time,
    freq='W-SUN',
)
all_weeks = pd.DataFrame({'week': week_starts})

all_weeks

Unnamed: 0,week
0,2023-01-01
1,2023-01-08
2,2023-01-15
3,2023-01-22
4,2023-01-29
5,2023-02-05
6,2023-02-12
7,2023-02-19
8,2023-02-26
9,2023-03-05


In [18]:
# Label each week with the calendar month (monthly Period) that its start date falls in
all_weeks['month'] = all_weeks['week'].dt.to_period('M')
all_weeks

Unnamed: 0,week,month
0,2023-01-01,2023-01
1,2023-01-08,2023-01
2,2023-01-15,2023-01
3,2023-01-22,2023-01
4,2023-01-29,2023-01
5,2023-02-05,2023-02
6,2023-02-12,2023-02
7,2023-02-19,2023-02
8,2023-02-26,2023-02
9,2023-03-05,2023-03


In [19]:
# Broadcast the monthly prices to every week that starts in that month, then
# drop the helper columns that were only needed for the join.
price_expanded = pd.merge(all_weeks, price, on='month', how='left').drop(
    columns=['month', 'date']
)

price_expanded

Unnamed: 0,week,city,state,store_id,product_id,base_price,promotional_discount,promotion_type,final_price
0,2023-01-01,Mumbai,Maharashtra,MUM_01,P001,134.35,6.72,Percentage Discount,127.63
1,2023-01-01,Mumbai,Maharashtra,MUM_02,P001,135.09,13.51,Percentage Discount,121.58
2,2023-01-01,Mumbai,Maharashtra,MUM_03,P001,136.33,0.00,,136.33
3,2023-01-01,Mumbai,Maharashtra,MUM_04,P001,140.32,0.00,,140.32
4,2023-01-01,Mumbai,Maharashtra,MUM_05,P001,142.32,14.23,Buy One Get One Free,128.09
...,...,...,...,...,...,...,...,...,...
191325,2023-12-31,Guwahati,Assam,GUW_01,P019,120.99,6.05,Percentage Discount,114.94
191326,2023-12-31,Guwahati,Assam,GUW_02,P019,119.03,11.90,Percentage Discount,107.13
191327,2023-12-31,Guwahati,Assam,GUW_03,P019,119.97,12.00,Buy One Get One Free,107.97
191328,2023-12-31,Guwahati,Assam,GUW_04,P019,120.34,6.02,,114.32


In [20]:
# Parse the daily date, then assign each sale to its week.
# 'W-SAT' periods end on Saturday, so the week label is the preceding Sunday.
sales['date'] = pd.to_datetime(sales['date'], format='%Y-%m-%d')
sales['week'] = sales['date'].dt.to_period('W-SAT').dt.start_time
sales

Unnamed: 0,date,city,state,store_id,product_id,sales_quantity,week
0,2023-01-01,Mumbai,Maharashtra,MUM_01,P001,112,2023-01-01
1,2023-01-01,Mumbai,Maharashtra,MUM_01,P002,100,2023-01-01
2,2023-01-01,Mumbai,Maharashtra,MUM_01,P003,126,2023-01-01
3,2023-01-01,Mumbai,Maharashtra,MUM_01,P004,108,2023-01-01
4,2023-01-01,Mumbai,Maharashtra,MUM_01,P005,129,2023-01-01
...,...,...,...,...,...,...,...
554795,2023-12-31,Guwahati,Assam,GUW_05,P004,71,2023-12-31
554796,2023-12-31,Guwahati,Assam,GUW_05,P005,100,2023-12-31
554797,2023-12-31,Guwahati,Assam,GUW_05,P006,96,2023-12-31
554798,2023-12-31,Guwahati,Assam,GUW_05,P007,136,2023-12-31


In [428]:
# Total units sold per week for every store/product combination
weekly_keys = ['week', 'city', 'state', 'store_id', 'product_id']
sales_week = sales.groupby(weekly_keys, as_index=False)['sales_quantity'].sum()

sales_week

Unnamed: 0,week,city,state,store_id,product_id,sales_quantity
0,2023-01-01,Ahmedabad,Gujarat,AHM_01,P001,748
1,2023-01-01,Ahmedabad,Gujarat,AHM_01,P002,677
2,2023-01-01,Ahmedabad,Gujarat,AHM_01,P003,677
3,2023-01-01,Ahmedabad,Gujarat,AHM_01,P004,649
4,2023-01-01,Ahmedabad,Gujarat,AHM_01,P005,688
...,...,...,...,...,...,...
80555,2023-12-31,Pune,Maharashtra,PUN_10,P004,132
80556,2023-12-31,Pune,Maharashtra,PUN_10,P005,138
80557,2023-12-31,Pune,Maharashtra,PUN_10,P006,102
80558,2023-12-31,Pune,Maharashtra,PUN_10,P007,100


In [429]:
# Attach the weekly price to each weekly sales row; the left join keeps
# sales rows even when no matching price exists.
df = pd.merge(
    sales_week,
    price_expanded,
    on=['week', 'city', 'state', 'store_id', 'product_id'],
    how='left',
)

# NOTE(review): revenue is computed from base_price, i.e. before the
# promotional discount. Confirm that final_price is not what is intended.
df['sales_amount'] = df['sales_quantity'].mul(df['base_price'])
df

Unnamed: 0,week,city,state,store_id,product_id,sales_quantity,base_price,promotional_discount,promotion_type,final_price,sales_amount
0,2023-01-01,Ahmedabad,Gujarat,AHM_01,P001,748,83.40,8.34,,75.06,62383.20
1,2023-01-01,Ahmedabad,Gujarat,AHM_01,P002,677,90.85,0.00,,90.85,61505.45
2,2023-01-01,Ahmedabad,Gujarat,AHM_01,P003,677,88.78,13.32,,75.46,60104.06
3,2023-01-01,Ahmedabad,Gujarat,AHM_01,P004,649,82.44,4.12,Percentage Discount,78.32,53503.56
4,2023-01-01,Ahmedabad,Gujarat,AHM_01,P005,688,113.86,5.69,Buy One Get One Free,108.17,78335.68
...,...,...,...,...,...,...,...,...,...,...,...
80555,2023-12-31,Pune,Maharashtra,PUN_10,P004,132,151.81,15.18,Percentage Discount,136.63,20038.92
80556,2023-12-31,Pune,Maharashtra,PUN_10,P005,138,112.37,16.86,Buy One Get One Free,95.51,15507.06
80557,2023-12-31,Pune,Maharashtra,PUN_10,P006,102,140.18,21.03,Percentage Discount,119.15,14298.36
80558,2023-12-31,Pune,Maharashtra,PUN_10,P007,100,110.28,5.51,,104.77,11028.00


In [435]:
# Collapse the store/product rows to one row per week:
#   sales_amount   -> total revenue
#   base_price     -> average base price
#   final_price    -> average final price
#   promotion_type -> most frequent promotion (None when the week has no promotion at all)
# The mode is computed once per group; next(iter(...), None) takes its first entry.
sales_amt = df.groupby('week').agg({
    'sales_amount': 'sum',
    'base_price': 'mean',
    'final_price': 'mean',
    'promotion_type': lambda x: next(iter(x.mode()), None),
}).reset_index()

In [436]:
sales_amt

Unnamed: 0,week,sales_amount,base_price,final_price,promotion_type
0,2023-01-01,102526100.0,101.830513,94.148539,Percentage Discount
1,2023-01-08,102800200.0,101.830513,94.148539,Percentage Discount
2,2023-01-15,102630500.0,101.830513,94.148539,Percentage Discount
3,2023-01-22,102505800.0,101.830513,94.148539,Percentage Discount
4,2023-01-29,102221500.0,101.830513,94.148539,Percentage Discount
5,2023-02-05,102908700.0,102.116257,94.508197,Percentage Discount
6,2023-02-12,102358800.0,102.116257,94.508197,Percentage Discount
7,2023-02-19,102389100.0,102.116257,94.508197,Percentage Discount
8,2023-02-26,102362800.0,102.116257,94.508197,Percentage Discount
9,2023-03-05,102518600.0,102.070921,94.269888,Percentage Discount


## Final data

In [437]:
# Attach the per-channel media rows to the weekly sales figures. The left join
# on `week` repeats each sales row once per matching media row.
merged_df = sales_amt.merge(media, how='left', on='week')

In [438]:
merged_df

Unnamed: 0,week,sales_amount,base_price,final_price,promotion_type,channel,spend_online,impressions,clicks,spend_offline,CTR (%)
0,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Facebook,7658.48,122072.0,3445.0,,2.822105
1,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Google Ads,6838.90,111271.0,3073.0,,2.761726
2,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Influencer Marketing,6900.80,101912.0,2904.0,,2.849517
3,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Instagram,6542.48,85078.0,2559.0,,3.007828
4,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,OOH,,,,11230.47,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
472,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,OOH,,,,9881.73,0.000000
473,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,Print,,,,9546.23,0.000000
474,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,Radio,,,,11167.25,0.000000
475,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,TV,,,,12565.47,0.000000


In [439]:
# Replace every missing value with 0. This applies to all columns, so a missing
# text value (e.g. promotion_type) would also become 0.
merged_df = merged_df.fillna(0)

merged_df

Unnamed: 0,week,sales_amount,base_price,final_price,promotion_type,channel,spend_online,impressions,clicks,spend_offline,CTR (%)
0,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Facebook,7658.48,122072.0,3445.0,0.00,2.822105
1,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Google Ads,6838.90,111271.0,3073.0,0.00,2.761726
2,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Influencer Marketing,6900.80,101912.0,2904.0,0.00,2.849517
3,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,Instagram,6542.48,85078.0,2559.0,0.00,3.007828
4,2023-01-01,1.025261e+08,101.830513,94.148539,Percentage Discount,OOH,0.00,0.0,0.0,11230.47,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
472,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,OOH,0.00,0.0,0.0,9881.73,0.000000
473,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,Print,0.00,0.0,0.0,9546.23,0.000000
474,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,Radio,0.00,0.0,0.0,11167.25,0.000000
475,2023-12-31,1.482525e+07,102.178921,94.656625,Buy One Get One Free,TV,0.00,0.0,0.0,12565.47,0.000000


In [440]:
# Persist the merged weekly sales/media table; the row index is not written.
merged_df.to_csv('data/preprocessed_data.csv', index=False)

In [449]:
# Work on a copy of the merged table. Note that this rebinds `df`, which
# previously held the weekly sales/price frame.
df = merged_df.copy()

In [450]:
def _channel_pivot(frame, value_col, prefix, aggfunc="sum"):
    """Pivot the rows where `value_col` > 0 into one `prefix`-ed column per channel, indexed by week."""
    positive_rows = frame[frame[value_col] > 0]
    wide = positive_rows.pivot_table(
        index="week", columns="channel", values=value_col, aggfunc=aggfunc
    )
    return wide.add_prefix(prefix)


# Spend per channel, online and offline
online_spend = _channel_pivot(df, "spend_online", "spend_online_")
offline_spend = _channel_pivot(df, "spend_offline", "spend_offline_")

# Impressions, clicks and CTR for the online channels
impressions = _channel_pivot(df, "impressions", "impr_")
clicks = _channel_pivot(df, "clicks", "clicks_")
ctr = _channel_pivot(df, "CTR (%)", "ctr_", aggfunc="mean")

# Sales and price values are repeated on every channel row of a week,
# so the first row of each week is taken.
agg_sales = df.groupby("week")[["sales_amount", "base_price", "final_price"]].first()

# Put all weekly feature blocks side by side; a channel with no rows in a
# given week becomes 0.
feature_blocks = [agg_sales, online_spend, offline_spend, impressions, clicks, ctr]
features = pd.concat(feature_blocks, axis=1).fillna(0)



In [451]:
features

Unnamed: 0_level_0,sales_amount,base_price,final_price,spend_online_Facebook,spend_online_Google Ads,spend_online_Influencer Marketing,spend_online_Instagram,spend_online_YouTube,spend_offline_OOH,spend_offline_Print,...,clicks_Facebook,clicks_Google Ads,clicks_Influencer Marketing,clicks_Instagram,clicks_YouTube,ctr_Facebook,ctr_Google Ads,ctr_Influencer Marketing,ctr_Instagram,ctr_YouTube
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-01,102526100.0,101.830513,94.148539,7658.48,6838.9,6900.8,6542.48,6460.78,11230.47,6214.43,...,3445.0,3073.0,2904.0,2559.0,3735.0,2.822105,2.761726,2.849517,3.007828,3.400865
2023-01-08,102800200.0,101.830513,94.148539,7392.16,7631.45,7247.63,6773.16,6754.75,11380.75,11069.6,...,2939.0,3744.0,2875.0,3456.0,3643.0,2.664841,3.258883,2.654884,3.273068,3.660424
2023-01-15,102630500.0,101.830513,94.148539,7356.63,6004.55,7953.68,7765.96,7846.51,10270.3,8861.17,...,3206.0,3812.0,3659.0,3040.0,3516.0,2.904643,3.904498,3.011498,2.6122,2.78048
2023-01-22,102505800.0,101.830513,94.148539,7237.25,7145.31,7162.25,7300.0,7395.38,8335.56,11601.91,...,3227.0,3575.0,2733.0,2461.0,4369.0,2.858079,3.280539,2.662419,2.305257,3.36644
2023-01-29,102221500.0,101.830513,94.148539,6386.66,5924.19,6804.07,6157.35,6489.35,9575.21,7488.18,...,3197.0,2834.0,2617.0,2944.0,2852.0,3.150561,3.464632,2.570878,3.197359,2.978403
2023-02-05,102908700.0,102.116257,94.508197,7564.3,6068.33,6883.89,6921.18,7269.2,12255.92,4697.08,...,3053.0,2447.0,3927.0,3022.0,3352.0,2.776489,3.058635,3.356009,2.854094,2.63346
2023-02-12,102358800.0,102.116257,94.508197,7832.26,7136.55,7285.74,6805.3,6840.69,6348.65,3987.37,...,3441.0,2579.0,2698.0,2649.0,2525.0,2.790143,2.161886,2.563908,2.648629,2.66457
2023-02-19,102389100.0,102.116257,94.508197,6347.22,7954.75,7052.08,7359.33,7395.3,9698.83,9648.21,...,2774.0,3233.0,3781.0,3149.0,3323.0,2.992546,2.72466,3.438805,2.937555,3.268739
2023-02-26,102362800.0,102.116257,94.508197,6957.77,7747.18,7191.43,7461.92,6829.39,3847.06,12065.73,...,2761.0,3251.0,3453.0,3301.0,2591.0,2.293761,3.099674,3.173334,2.949428,2.501496
2023-03-05,102518600.0,102.070921,94.269888,7091.48,5944.91,6524.63,7677.08,6908.3,10679.14,12388.97,...,3260.0,2342.0,3956.0,4142.0,2769.0,3.199937,2.753642,4.119031,3.879113,3.119438


In [452]:
# Most frequent promotion type per week (None when a week has no mode).
# The mode is computed once per group; next(iter(...), None) takes its first entry.
promo = (
    df.groupby("week")["promotion_type"]
      .agg(lambda x: next(iter(x.mode()), None))
      .to_frame()
)
promo


Unnamed: 0_level_0,promotion_type
week,Unnamed: 1_level_1
2023-01-01,Percentage Discount
2023-01-08,Percentage Discount
2023-01-15,Percentage Discount
2023-01-22,Percentage Discount
2023-01-29,Percentage Discount
2023-02-05,Percentage Discount
2023-02-12,Percentage Discount
2023-02-19,Percentage Discount
2023-02-26,Percentage Discount
2023-03-05,Percentage Discount


In [453]:
# Add the weekly promotion label to the feature table, matching rows on `week`
features = pd.merge(features, promo, how="left", on="week")
features

Unnamed: 0_level_0,sales_amount,base_price,final_price,spend_online_Facebook,spend_online_Google Ads,spend_online_Influencer Marketing,spend_online_Instagram,spend_online_YouTube,spend_offline_OOH,spend_offline_Print,...,clicks_Google Ads,clicks_Influencer Marketing,clicks_Instagram,clicks_YouTube,ctr_Facebook,ctr_Google Ads,ctr_Influencer Marketing,ctr_Instagram,ctr_YouTube,promotion_type
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-01,102526100.0,101.830513,94.148539,7658.48,6838.9,6900.8,6542.48,6460.78,11230.47,6214.43,...,3073.0,2904.0,2559.0,3735.0,2.822105,2.761726,2.849517,3.007828,3.400865,Percentage Discount
2023-01-08,102800200.0,101.830513,94.148539,7392.16,7631.45,7247.63,6773.16,6754.75,11380.75,11069.6,...,3744.0,2875.0,3456.0,3643.0,2.664841,3.258883,2.654884,3.273068,3.660424,Percentage Discount
2023-01-15,102630500.0,101.830513,94.148539,7356.63,6004.55,7953.68,7765.96,7846.51,10270.3,8861.17,...,3812.0,3659.0,3040.0,3516.0,2.904643,3.904498,3.011498,2.6122,2.78048,Percentage Discount
2023-01-22,102505800.0,101.830513,94.148539,7237.25,7145.31,7162.25,7300.0,7395.38,8335.56,11601.91,...,3575.0,2733.0,2461.0,4369.0,2.858079,3.280539,2.662419,2.305257,3.36644,Percentage Discount
2023-01-29,102221500.0,101.830513,94.148539,6386.66,5924.19,6804.07,6157.35,6489.35,9575.21,7488.18,...,2834.0,2617.0,2944.0,2852.0,3.150561,3.464632,2.570878,3.197359,2.978403,Percentage Discount
2023-02-05,102908700.0,102.116257,94.508197,7564.3,6068.33,6883.89,6921.18,7269.2,12255.92,4697.08,...,2447.0,3927.0,3022.0,3352.0,2.776489,3.058635,3.356009,2.854094,2.63346,Percentage Discount
2023-02-12,102358800.0,102.116257,94.508197,7832.26,7136.55,7285.74,6805.3,6840.69,6348.65,3987.37,...,2579.0,2698.0,2649.0,2525.0,2.790143,2.161886,2.563908,2.648629,2.66457,Percentage Discount
2023-02-19,102389100.0,102.116257,94.508197,6347.22,7954.75,7052.08,7359.33,7395.3,9698.83,9648.21,...,3233.0,3781.0,3149.0,3323.0,2.992546,2.72466,3.438805,2.937555,3.268739,Percentage Discount
2023-02-26,102362800.0,102.116257,94.508197,6957.77,7747.18,7191.43,7461.92,6829.39,3847.06,12065.73,...,3251.0,3453.0,3301.0,2591.0,2.293761,3.099674,3.173334,2.949428,2.501496,Percentage Discount
2023-03-05,102518600.0,102.070921,94.269888,7091.48,5944.91,6524.63,7677.08,6908.3,10679.14,12388.97,...,2342.0,3956.0,4142.0,2769.0,3.199937,2.753642,4.119031,3.879113,3.119438,Percentage Discount


In [454]:
# One-hot encode the promotion label into integer 0/1 columns named promo_<label>
features = pd.get_dummies(
    data=features,
    prefix="promo",
    columns=["promotion_type"],
    dtype=int,
)

In [455]:
features

Unnamed: 0_level_0,sales_amount,base_price,final_price,spend_online_Facebook,spend_online_Google Ads,spend_online_Influencer Marketing,spend_online_Instagram,spend_online_YouTube,spend_offline_OOH,spend_offline_Print,...,clicks_Influencer Marketing,clicks_Instagram,clicks_YouTube,ctr_Facebook,ctr_Google Ads,ctr_Influencer Marketing,ctr_Instagram,ctr_YouTube,promo_Buy One Get One Free,promo_Percentage Discount
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-01,102526100.0,101.830513,94.148539,7658.48,6838.9,6900.8,6542.48,6460.78,11230.47,6214.43,...,2904.0,2559.0,3735.0,2.822105,2.761726,2.849517,3.007828,3.400865,0,1
2023-01-08,102800200.0,101.830513,94.148539,7392.16,7631.45,7247.63,6773.16,6754.75,11380.75,11069.6,...,2875.0,3456.0,3643.0,2.664841,3.258883,2.654884,3.273068,3.660424,0,1
2023-01-15,102630500.0,101.830513,94.148539,7356.63,6004.55,7953.68,7765.96,7846.51,10270.3,8861.17,...,3659.0,3040.0,3516.0,2.904643,3.904498,3.011498,2.6122,2.78048,0,1
2023-01-22,102505800.0,101.830513,94.148539,7237.25,7145.31,7162.25,7300.0,7395.38,8335.56,11601.91,...,2733.0,2461.0,4369.0,2.858079,3.280539,2.662419,2.305257,3.36644,0,1
2023-01-29,102221500.0,101.830513,94.148539,6386.66,5924.19,6804.07,6157.35,6489.35,9575.21,7488.18,...,2617.0,2944.0,2852.0,3.150561,3.464632,2.570878,3.197359,2.978403,0,1
2023-02-05,102908700.0,102.116257,94.508197,7564.3,6068.33,6883.89,6921.18,7269.2,12255.92,4697.08,...,3927.0,3022.0,3352.0,2.776489,3.058635,3.356009,2.854094,2.63346,0,1
2023-02-12,102358800.0,102.116257,94.508197,7832.26,7136.55,7285.74,6805.3,6840.69,6348.65,3987.37,...,2698.0,2649.0,2525.0,2.790143,2.161886,2.563908,2.648629,2.66457,0,1
2023-02-19,102389100.0,102.116257,94.508197,6347.22,7954.75,7052.08,7359.33,7395.3,9698.83,9648.21,...,3781.0,3149.0,3323.0,2.992546,2.72466,3.438805,2.937555,3.268739,0,1
2023-02-26,102362800.0,102.116257,94.508197,6957.77,7747.18,7191.43,7461.92,6829.39,3847.06,12065.73,...,3453.0,3301.0,2591.0,2.293761,3.099674,3.173334,2.949428,2.501496,0,1
2023-03-05,102518600.0,102.070921,94.269888,7091.48,5944.91,6524.63,7677.08,6908.3,10679.14,12388.97,...,3956.0,4142.0,2769.0,3.199937,2.753642,4.119031,3.879113,3.119438,0,1


In [456]:
# `features` is indexed by `week`, so the index has to be written out:
# with index=False the saved file would lose its only date column.
features.to_csv('data/features.csv', index=True)