In [1]:
import pandas as pd
# Read every pre-processed input table into its own DataFrame.
# The names on the left are bound in the same order as the paths below.
sales, products, cities, competitor, market_influence, customers = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/sales_processed.csv',
        '../../data/processed/products_processed.csv',
        '../../data/processed/city_processed.csv',
        '../../data/processed/competitors_processed.csv',
        '../../data/processed/market_influencers_processed.csv',
        '../../data/processed/consumer_behavior_processed.csv',
    )
)

In [2]:

# Parse the date column of each time-keyed table into pandas datetimes
for frame, date_col in (
    (sales, 'Date'),
    (competitor, 'Date'),
    (market_influence, 'Week_Start_Date'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [3]:
# Merge all datasets
# Enrich every sales row with its product attributes, then its city attributes.
#   * how='inner' is the pandas default, now spelled out: sales rows whose
#     SKU_ID / City_ID has no match in the lookup table are dropped.
#   * validate='many_to_one' makes pandas raise MergeError if a lookup table
#     repeats a key, instead of silently duplicating sales rows.
merged = (
    sales
    # Base sales data with products (the product key is renamed to match sales)
    .merge(
        products.rename(columns={'SKU Identification Number': 'SKU_ID'}),
        on='SKU_ID',
        how='inner',
        validate='many_to_one',
    )
    # Add city data
    .merge(cities, on='City_ID', how='inner', validate='many_to_one')
)



In [4]:


def align_and_fill_weekly(daily_df, weekly_df, group_cols=('City_ID',),
                          date_col='Date', direction='nearest',
                          tolerance=pd.Timedelta('7D')):
    """Attach weekly-level columns to daily rows, then forward-fill the gaps.

    Each row of ``daily_df`` is matched (per group) to the ``weekly_df`` row
    whose ``date_col`` is closest according to ``direction`` and within
    ``tolerance``. Rows left unmatched are then forward-filled, in date order,
    from earlier rows of the same group.

    Parameters
    ----------
    daily_df : pandas.DataFrame
        Left table; must contain ``date_col`` and every column in ``group_cols``.
    weekly_df : pandas.DataFrame
        Right table; must contain ``date_col`` and every column in ``group_cols``.
    group_cols : str or iterable of str, default ('City_ID',)
        Column(s) that must match exactly between the two tables.
    date_col : str, default 'Date'
        Name of the datetime column present in both tables.
    direction : {'backward', 'forward', 'nearest'}, default 'nearest'
        Passed to ``pandas.merge_asof``.
        NOTE(review): 'nearest' can pick a weekly row dated *after* the daily
        row; if these features feed a forecasting model, confirm this
        look-ahead is intended or pass ``direction='backward'``.
    tolerance : pandas.Timedelta, default 7 days
        Maximum distance between a daily date and its matched weekly date.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by ``date_col`` (ties keep their input order) with
        the weekly columns appended. The inputs are not modified.
    """
    # Accept a bare column name as well as a list/tuple of names
    group_cols = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Columns that only the weekly table contributes; these get forward-filled
    weekly_cols = list(weekly_df.columns.difference(daily_df.columns))

    # merge_asof requires both sides sorted on the key. A stable sort keeps
    # rows that share a date in their original relative order, so the output
    # row order is reproducible from run to run.
    aligned = pd.merge_asof(
        daily_df.sort_values(date_col, kind='stable'),
        weekly_df.sort_values(date_col, kind='stable'),
        on=date_col,
        by=group_cols,
        direction=direction,
        tolerance=tolerance,
    )

    # Forward fill weekly values within groups (rows are already in date order)
    if weekly_cols:
        aligned[weekly_cols] = aligned.groupby(group_cols)[weekly_cols].ffill()

    return aligned
# align_and_fill_weekly joins on a column named 'Date', so give the weekly
# table that name. Rebinding replaces the inplace=True rename.
market_influence = market_influence.rename(columns={'Week_Start_Date': 'Date'})

# Collapse market influencers to one row per city and 'W-MON' weekly bin
market_weekly = (
    market_influence
    .groupby(['City_ID', pd.Grouper(key='Date', freq='W-MON')])
    .agg({
        'Avg_Temperature': 'mean',
        # Most frequent weather type in the bin. mode() is empty when every
        # value is missing; .get(0, nan) then yields NaN instead of raising
        # KeyError as mode()[0] did.
        'Weather_Type': lambda x: x.mode().get(0, float('nan')),
        # NOTE(review): 'first' keeps only the first non-null festival label
        # of the bin - confirm that is the intended weekly summary.
        'Festival': 'first',
    })
    .reset_index()
)

merged = align_and_fill_weekly(
    merged,
    market_weekly,
    group_cols=['City_ID']
)

merged.head(1000)

Unnamed: 0,Date,City_ID,SKU_ID,Channel,Units_Sold,Sales,Product Name,Flavor Variant,Launch Date,Pack Size (L),...,Modern Trade,HoReCa,Q Commerce,City_Name,City_tier,Population_Density(persons/km),Per_Capita_Income (INR),Avg_Temperature,Weather_Type,Festival
0,2023-01-01,CT001,SKU1002,Q Commerce,268,80.4,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,1,1,1,Delhi,Tier 1,14893,461910,21.0,Cold,No Festival
1,2023-01-01,CT014,SKU1007,General Trade,428,32100.0,Minute Maid 250ml Pulpy Orange,Orange,2015-04-23,0.25,...,1,1,0,Chandigarh,Tier 2,9250,349000,19.7,Cold,No Festival
2,2023-01-01,CT014,SKU1002,General Trade,1125,337.5,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,1,1,1,Chandigarh,Tier 2,9250,349000,19.7,Cold,No Festival
3,2023-01-01,CT014,SKU1001,General Trade,938,281.4,Minute Maid Apple Juice - Honey Infused,Apple,2022-02-19,1.00,...,0,0,0,Chandigarh,Tier 2,9250,349000,19.7,Cold,No Festival
4,2023-01-01,CT014,SKU1008,Modern Trade,207,15525.0,Minute Maid 250ml Mixed Fruit Juice,Mixed Fruit,2015-04-23,0.25,...,1,1,0,Chandigarh,Tier 2,9250,349000,19.7,Cold,No Festival
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-01-03,CT018,SKU1007,General Trade,288,21600.0,Minute Maid 250ml Pulpy Orange,Orange,2015-04-23,0.25,...,1,1,0,Patna,Tier 2,2805,173305,19.8,Cold,No Festival
996,2023-01-03,CT016,SKU1002,Q Commerce,187,56.1,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,1,1,1,Nagpur,Tier 2,9664,296607,21.3,Cold,No Festival
997,2023-01-03,CT018,SKU1002,General Trade,288,86.4,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,1,1,1,Patna,Tier 2,2805,173305,19.8,Cold,No Festival
998,2023-01-03,CT018,SKU1008,Modern Trade,92,6900.0,Minute Maid 250ml Mixed Fruit Juice,Mixed Fruit,2015-04-23,0.25,...,1,1,0,Patna,Tier 2,2805,173305,19.8,Cold,No Festival


In [5]:


# Summarise competitor activity to one row per channel and 'W-MON' weekly bin:
# total mentions, mean sentiment and mean share of voice.
competitor_weekly = (
    competitor
    .groupby(['Channel', pd.Grouper(key='Date', freq='W-MON')])
    .agg(
        Mentions_Count=('Mentions_Count', 'sum'),
        Sentiment_Score=('Sentiment_Score', 'mean'),
        Share_of_Voice=('Share_of_Voice', 'mean'),
    )
    .reset_index()
)

# Attach the weekly competitor summary to each sales row of the same channel
merged = align_and_fill_weekly(merged, competitor_weekly, group_cols=['Channel'])

merged.head(1000)

Unnamed: 0,Date,City_ID,SKU_ID,Channel,Units_Sold,Sales,Product Name,Flavor Variant,Launch Date,Pack Size (L),...,City_Name,City_tier,Population_Density(persons/km),Per_Capita_Income (INR),Avg_Temperature,Weather_Type,Festival,Mentions_Count,Sentiment_Score,Share_of_Voice
0,2023-01-01,CT001,SKU1002,Q Commerce,268,80.4,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,Delhi,Tier 1,14893,461910,21.0,Cold,No Festival,531,69.000000,50.000000
1,2023-01-01,CT010,SKU1008,HoReCa,8,600.0,Minute Maid 250ml Mixed Fruit Juice,Mixed Fruit,2015-04-23,0.25,...,Lucknow,Tier 2,3814,105000,18.4,Cold,No Festival,543,67.666667,33.333333
2,2023-01-01,CT001,SKU1001,E Commerce,521,156.3,Minute Maid Apple Juice - Honey Infused,Apple,2022-02-19,1.00,...,Delhi,Tier 1,14893,461910,21.0,Cold,No Festival,1044,67.400000,33.366667
3,2023-01-01,CT001,SKU1002,E Commerce,247,74.1,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,Delhi,Tier 1,14893,461910,21.0,Cold,No Festival,1044,67.400000,33.366667
4,2023-01-01,CT001,SKU1007,E Commerce,161,12075.0,Minute Maid 250ml Pulpy Orange,Orange,2015-04-23,0.25,...,Delhi,Tier 1,14893,461910,21.0,Cold,No Festival,1044,67.400000,33.366667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-01-03,CT007,SKU1001,E Commerce,229,68.7,Minute Maid Apple Juice - Honey Infused,Apple,2022-02-19,1.00,...,Ahmedabad,Tier 1,12984,302588,21.0,Cold,No Festival,1044,67.400000,33.366667
996,2023-01-03,CT003,SKU1004,Modern Trade,95,28.5,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,Kolkata,Tier 1,24252,171184,19.8,Cold,No Festival,884,73.400000,50.000000
997,2023-01-03,CT006,SKU1004,Q Commerce,85,25.5,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,Hyderabad,Tier 1,10477,400000,18.8,Cold,No Festival,531,69.000000,50.000000
998,2023-01-03,CT008,SKU1004,Modern Trade,209,62.7,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,Pune,Tier 1,6034,336503,20.4,Cold,No Festival,884,73.400000,50.000000


In [6]:

# Add customer preferences
# Aggregate consumer-behaviour records to one row per city.
customer_agg = customers.groupby('City_ID').agg({
    # Most common value per city. mode() is empty when every value is missing;
    # .get(0, nan) then yields NaN instead of raising KeyError as mode()[0] did.
    'Preferred_Channel': lambda x: x.mode().get(0, float('nan')),
    'Income_Level': 'mean',
    'Price_Sensitivity': lambda x: x.mode().get(0, float('nan')),
    'Purchase_Frequency': 'mean'
}).reset_index().add_prefix('Customer_')

# add_prefix also prefixed the join key, so restore its name before merging.
# how='left' keeps every sales row; cities without customer records get NaN
# in the Customer_* columns.
merged = merged.merge(
    customer_agg.rename(columns={'Customer_City_ID': 'City_ID'}),
    on='City_ID',
    how='left'
)

merged.head(1000)

Unnamed: 0,Date,City_ID,SKU_ID,Channel,Units_Sold,Sales,Product Name,Flavor Variant,Launch Date,Pack Size (L),...,Avg_Temperature,Weather_Type,Festival,Mentions_Count,Sentiment_Score,Share_of_Voice,Customer_Preferred_Channel,Customer_Income_Level,Customer_Price_Sensitivity,Customer_Purchase_Frequency
0,2023-01-01,CT001,SKU1002,Q Commerce,268,80.4,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,21.0,Cold,No Festival,531,69.000000,50.000000,E-commerce,520661.105099,Low,3.111607
1,2023-01-01,CT010,SKU1008,HoReCa,8,600.0,Minute Maid 250ml Mixed Fruit Juice,Mixed Fruit,2015-04-23,0.25,...,18.4,Cold,No Festival,543,67.666667,33.333333,E-commerce,120715.451706,High,3.057965
2,2023-01-01,CT001,SKU1001,E Commerce,521,156.3,Minute Maid Apple Juice - Honey Infused,Apple,2022-02-19,1.00,...,21.0,Cold,No Festival,1044,67.400000,33.366667,E-commerce,520661.105099,Low,3.111607
3,2023-01-01,CT001,SKU1002,E Commerce,247,74.1,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,21.0,Cold,No Festival,1044,67.400000,33.366667,E-commerce,520661.105099,Low,3.111607
4,2023-01-01,CT001,SKU1007,E Commerce,161,12075.0,Minute Maid 250ml Pulpy Orange,Orange,2015-04-23,0.25,...,21.0,Cold,No Festival,1044,67.400000,33.366667,E-commerce,520661.105099,Low,3.111607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-01-03,CT007,SKU1001,E Commerce,229,68.7,Minute Maid Apple Juice - Honey Infused,Apple,2022-02-19,1.00,...,21.0,Cold,No Festival,1044,67.400000,33.366667,E-commerce,341307.248214,High,3.062574
996,2023-01-03,CT003,SKU1004,Modern Trade,95,28.5,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,19.8,Cold,No Festival,884,73.400000,50.000000,E-commerce,194478.497198,High,3.026316
997,2023-01-03,CT006,SKU1004,Q Commerce,85,25.5,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,18.8,Cold,No Festival,531,69.000000,50.000000,E-commerce,457865.997231,Low,3.069527
998,2023-01-03,CT008,SKU1004,Modern Trade,209,62.7,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,20.4,Cold,No Festival,884,73.400000,50.000000,E-commerce,379895.103012,High,3.033946


In [7]:
# Derive product-age and calendar features from the sale date
sale_date = merged['Date']
merged = merged.assign(
    # Whole days between the product launch and the sale
    Days_Since_Launch=(sale_date - pd.to_datetime(merged['Launch Date'])).dt.days,
    # Temporal features
    Day_of_week=sale_date.dt.dayofweek,  # Monday=0 ... Sunday=6
    Month=sale_date.dt.month,
    Is_month_end=sale_date.dt.is_month_end.astype(int),  # 1 on the last day of a month, else 0
)

merged.head(1000)

Unnamed: 0,Date,City_ID,SKU_ID,Channel,Units_Sold,Sales,Product Name,Flavor Variant,Launch Date,Pack Size (L),...,Sentiment_Score,Share_of_Voice,Customer_Preferred_Channel,Customer_Income_Level,Customer_Price_Sensitivity,Customer_Purchase_Frequency,Days_Since_Launch,Day_of_week,Month,Is_month_end
0,2023-01-01,CT001,SKU1002,Q Commerce,268,80.4,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,69.000000,50.000000,E-commerce,520661.105099,Low,3.111607,2810,6,1,0
1,2023-01-01,CT010,SKU1008,HoReCa,8,600.0,Minute Maid 250ml Mixed Fruit Juice,Mixed Fruit,2015-04-23,0.25,...,67.666667,33.333333,E-commerce,120715.451706,High,3.057965,2810,6,1,0
2,2023-01-01,CT001,SKU1001,E Commerce,521,156.3,Minute Maid Apple Juice - Honey Infused,Apple,2022-02-19,1.00,...,67.400000,33.366667,E-commerce,520661.105099,Low,3.111607,316,6,1,0
3,2023-01-01,CT001,SKU1002,E Commerce,247,74.1,Minute Maid Mixed Fruit Juice,Mixed Fruit,2015-04-23,1.00,...,67.400000,33.366667,E-commerce,520661.105099,Low,3.111607,2810,6,1,0
4,2023-01-01,CT001,SKU1007,E Commerce,161,12075.0,Minute Maid 250ml Pulpy Orange,Orange,2015-04-23,0.25,...,67.400000,33.366667,E-commerce,520661.105099,Low,3.111607,2810,6,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-01-03,CT007,SKU1001,E Commerce,229,68.7,Minute Maid Apple Juice - Honey Infused,Apple,2022-02-19,1.00,...,67.400000,33.366667,E-commerce,341307.248214,High,3.062574,318,1,1,0
996,2023-01-03,CT003,SKU1004,Modern Trade,95,28.5,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,73.400000,50.000000,E-commerce,194478.497198,High,3.026316,2812,1,1,0
997,2023-01-03,CT006,SKU1004,Q Commerce,85,25.5,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,69.000000,50.000000,E-commerce,457865.997231,Low,3.069527,2812,1,1,0
998,2023-01-03,CT008,SKU1004,Modern Trade,209,62.7,Minute Maid Pulpy Orange,Orange,2015-04-23,1.00,...,73.400000,50.000000,E-commerce,379895.103012,High,3.033946,2812,1,1,0


In [8]:
# Write the assembled feature table to disk, omitting the row index
merged.to_csv(
    path_or_buf='../../data/training/training_feature_table.csv',
    index=False,
)