In [8]:
from pathlib import Path

import pandas as pd
from eda_utils import get_null_percentage

In [9]:
# Locations of the six raw CSV extracts, keyed by the table they feed.
raw_csv_files = {
    "consumer_behavior": "../data/42fac176-29cd-40d4-bd77-3c4709063d91_consumerbehavior.csv",
    "products": "../data/a72d47ec-694f-4744-8ffd-ffd875fe7d30_products.csv",
    "competitor": "../data/b1e476b3-c4ea-41b6-8957-b1954434dd26_competitivelandscape.csv",
    "distribution": "../data/e7486a22-3dda-40cc-a091-42c71ead2b81_retailsalesdistribution.csv",
    "marketing": "../data/df7b0066-eb64-4bfd-86eb-7e54c93882e6_externalmarketinfluencers.csv",
    "city": "../data/c37e916d-3656-41fe-97da-f0bc860d6711_city.csv",
}

# Read each extract into its own DataFrame (same read order as before).
consumer_behavior_df = pd.read_csv(raw_csv_files["consumer_behavior"])
products_df = pd.read_csv(raw_csv_files["products"])
competitor_df = pd.read_csv(raw_csv_files["competitor"])
distribution_df = pd.read_csv(raw_csv_files["distribution"])
marketing_df = pd.read_csv(raw_csv_files["marketing"])
city_df = pd.read_csv(raw_csv_files["city"])

### Merging city_df and consumer_behavior_df

In [10]:
# Per-column null percentage of the raw city table. The output below shows the
# trailing "Unnamed: 5" through "Unnamed: 12" columns are 100% null.
get_null_percentage(city_df)

City_ID                             0.0
City_Name                           0.0
City_tier                           0.0
Population_Density(persons/km)      0.0
Per_Capita_Income (INR)             0.0
Unnamed: 5                        100.0
Unnamed: 6                        100.0
Unnamed: 7                        100.0
Unnamed: 8                        100.0
Unnamed: 9                        100.0
Unnamed: 10                       100.0
Unnamed: 11                       100.0
Unnamed: 12                       100.0
dtype: float64

In [11]:
# Drop the empty placeholder columns named "Unnamed: N" (the null check above shows
# they are 100% null). They are selected by prefix and the result is reassigned,
# instead of dropping a hard-coded list in place, so this cell can be re-run safely:
# on a second run the list is simply empty, whereas the in-place drop raised KeyError
# because the columns were already gone.
unnamed_cols = [col for col in city_df.columns if str(col).startswith("Unnamed: ")]
city_df = city_df.drop(columns=unnamed_cols)

In [12]:
# Re-check after the cleanup: null share of each remaining column, plus the row count.
(get_null_percentage(city_df), city_df.shape[0])

(City_ID                           0.0
 City_Name                         0.0
 City_tier                         0.0
 Population_Density(persons/km)    0.0
 Per_Capita_Income (INR)           0.0
 dtype: float64,
 20)

In [13]:
# Null share per column and row count of the raw consumer table. In the output
# below, Purchase_Frequency is the only column with gaps (4.976%).
(get_null_percentage(consumer_behavior_df), consumer_behavior_df.shape[0])

(Customer_ID           0.000
 Age_Group             0.000
 Income_Level          0.000
 City_ID               0.000
 Preferred_Flavor      0.000
 Purchase_Frequency    4.976
 Price_Sensitivity     0.000
 Preferred_Channel     0.000
 Active                0.000
 dtype: float64,
 50000)

In [14]:
# Inspect the consumer rows whose Purchase_Frequency is missing.
missing_frequency_mask = consumer_behavior_df["Purchase_Frequency"].isna()
consumer_behavior_df.loc[missing_frequency_mask]

Unnamed: 0,Customer_ID,Age_Group,Income_Level,City_ID,Preferred_Flavor,Purchase_Frequency,Price_Sensitivity,Preferred_Channel,Active
49,cust_000050,18-25,503117.231695,CT017,Mixed Fruit,,Low,E-commerce,False
86,cust_000087,36-45,891863.316358,CT007,Mixed Fruit,,Low,Modern Trade,False
93,cust_000094,46-55,151474.652644,CT019,Mixed Fruit,,High,Modern Trade,True
127,cust_000128,46-55,164625.361879,CT003,Orange,,Low,General Trade,False
134,cust_000135,18-25,87430.053326,CT010,Apple,,Medium,E-commerce,False
...,...,...,...,...,...,...,...,...,...
49881,cust_049882,55+,184569.895496,CT016,Guava,,Low,General Trade,True
49907,cust_049908,46-55,180924.727277,CT003,Apple,,High,General Trade,True
49947,cust_049948,18-25,88678.249637,CT014,Mixed Fruit,,Medium,Q-commerce,True
49951,cust_049952,36-45,725479.425574,CT012,Mixed Fruit,,Low,Q-commerce,False


In [15]:
# Attach the city attributes to every consumer row.
# - how="left" keeps all consumers, even if a City_ID has no match in city_df.
# - validate="many_to_one" makes pandas raise MergeError if City_ID is duplicated in
#   city_df, which would otherwise silently multiply consumer rows.
consumer_df = consumer_behavior_df.merge(
    city_df,
    on="City_ID",
    how="left",
    validate="many_to_one",
)

In [16]:
# Enforce the invariants this summary was previously only eyeballed for:
# the left merge must keep exactly one row per consumer, and every City_ID must
# have found its city (an unmatched key would leave City_Name null).
assert len(consumer_df) == len(consumer_behavior_df), "merge changed the consumer row count"
assert consumer_df["City_Name"].notna().all(), "some City_ID values have no match in city_df"

# Null share per column and row count of the merged table.
get_null_percentage(consumer_df), len(consumer_df)

(Customer_ID                       0.000
 Age_Group                         0.000
 Income_Level                      0.000
 City_ID                           0.000
 Preferred_Flavor                  0.000
 Purchase_Frequency                4.976
 Price_Sensitivity                 0.000
 Preferred_Channel                 0.000
 Active                            0.000
 City_Name                         0.000
 City_tier                         0.000
 Population_Density(persons/km)    0.000
 Per_Capita_Income (INR)           0.000
 dtype: float64,
 50000)

In [18]:
# Save the consumer + city table as an intermediate CSV (row index omitted).
# The output folder is created first so the export also works when
# ../intermediate_data does not exist yet; to_csv itself does not create folders.
consumer_city_path = Path("../intermediate_data/consumer_city.csv")
consumer_city_path.parent.mkdir(parents=True, exist_ok=True)
consumer_df.to_csv(consumer_city_path, index=False)

### Merging city_df and marketing_df

In [19]:
# Null share per column and row count of the external market-influencer table.
# In the output below only Festival has gaps (about 94% null).
(get_null_percentage(marketing_df), marketing_df.shape[0])

(Week_Start_Date     0.000000
 City_ID             0.000000
 Avg_Temperature     0.000000
 Weather_Type        0.000000
 Festival           94.285714
 dtype: float64,
 2100)

In [20]:
# Distinct Festival labels, including the missing marker.
festival_labels = marketing_df["Festival"]
festival_labels.unique()

array([nan, 'Pongal', 'Holi', 'Diwali'], dtype=object)

In [21]:
# Show only the rows that carry a Festival label.
has_festival_mask = marketing_df["Festival"].notna()
marketing_df.loc[has_festival_mask]

Unnamed: 0,Week_Start_Date,City_ID,Avg_Temperature,Weather_Type,Festival
20,2023-01-09,CT001,20.6,Cold,Pongal
21,2023-01-09,CT002,20.2,Cold,Pongal
22,2023-01-09,CT003,20.1,Cold,Pongal
23,2023-01-09,CT004,20.7,Cold,Pongal
24,2023-01-09,CT005,20.7,Cold,Pongal
...,...,...,...,...,...
1915,2024-10-28,CT016,24.9,Mild,Diwali
1916,2024-10-28,CT017,24.9,Mild,Diwali
1917,2024-10-28,CT018,26.3,Mild,Diwali
1918,2024-10-28,CT019,25.5,Mild,Diwali


In [22]:
# Attach the city attributes to every weekly market-influencer row.
# - how="left" keeps all marketing rows, even if a City_ID has no match in city_df.
# - validate="many_to_one" makes pandas raise MergeError if City_ID is duplicated in
#   city_df, which would otherwise silently multiply the weekly rows.
city_marketing_df = marketing_df.merge(
    city_df,
    on="City_ID",
    how="left",
    validate="many_to_one",
)

In [23]:
# Preview the merged market-influencer + city table (the rendered output is
# truncated by pandas to the first and last rows).
city_marketing_df

Unnamed: 0,Week_Start_Date,City_ID,Avg_Temperature,Weather_Type,Festival,City_Name,City_tier,Population_Density(persons/km),Per_Capita_Income (INR)
0,2023-01-02,CT001,21.0,Cold,,Delhi,Tier 1,14893,461910
1,2023-01-02,CT002,19.4,Cold,,Mumbai,Tier 1,20518,400000
2,2023-01-02,CT003,19.8,Cold,,Kolkata,Tier 1,24252,171184
3,2023-01-02,CT004,20.8,Cold,,Chennai,Tier 1,14456,585501
4,2023-01-02,CT005,21.6,Cold,,Bengaluru,Tier 1,4378,352000
...,...,...,...,...,...,...,...,...,...
2095,2024-12-30,CT016,19.6,Cold,,Nagpur,Tier 2,9664,296607
2096,2024-12-30,CT017,20.3,Cold,,Surat,Tier 2,8617,389810
2097,2024-12-30,CT018,18.7,Cold,,Patna,Tier 2,2805,173305
2098,2024-12-30,CT019,19.3,Cold,,Ludhiana,Tier 2,9750,123882


In [24]:
# Enforce the invariants this summary was previously only eyeballed for:
# the left merge must keep exactly one row per marketing record, and every City_ID
# must have found its city (an unmatched key would leave City_Name null).
assert len(city_marketing_df) == len(marketing_df), "merge changed the marketing row count"
assert city_marketing_df["City_Name"].notna().all(), "some City_ID values have no match in city_df"

# Null share per column and row count of the merged table.
get_null_percentage(city_marketing_df), len(city_marketing_df)

(Week_Start_Date                    0.000000
 City_ID                            0.000000
 Avg_Temperature                    0.000000
 Weather_Type                       0.000000
 Festival                          94.285714
 City_Name                          0.000000
 City_tier                          0.000000
 Population_Density(persons/km)     0.000000
 Per_Capita_Income (INR)            0.000000
 dtype: float64,
 2100)

In [26]:
# Save the market-influencer + city table as an intermediate CSV (row index omitted).
# The output folder is created first so the export also works when
# ../intermediate_data does not exist yet; to_csv itself does not create folders.
marketing_city_path = Path("../intermediate_data/marketing_city.csv")
marketing_city_path.parent.mkdir(parents=True, exist_ok=True)
city_marketing_df.to_csv(marketing_city_path, index=False)

### Merging retail sales with products and city data

In [27]:
# Give the product key the same name the retail-sales table uses (SKU_ID) so the
# two tables can be joined on it. Reassigned rather than renamed in place.
products_df = products_df.rename(columns={"SKU Identification Number": "SKU_ID"})

# Null share per column and row count of the product table.
(get_null_percentage(products_df), products_df.shape[0])

(Product Name                0.0
 Flavor Variant              0.0
 SKU_ID                      0.0
 Launch Date (MM/DD/YYYY)    0.0
 Pack Size (ml/L)            0.0
 Distribution Coverage       0.0
 dtype: float64,
 7)

In [28]:
# Null share per column and row count of the retail sales distribution table.
(get_null_percentage(distribution_df), distribution_df.shape[0])

(Date          0.0
 City_ID       0.0
 SKU_ID        0.0
 Channel       0.0
 Units_Sold    0.0
 Sales         0.0
 dtype: float64,
 334300)

In [29]:
# Enrich every retail-sales row with its product attributes, then its city attributes.
# Both joins are left joins so no sales row is dropped. validate="many_to_one" makes
# pandas raise MergeError if SKU_ID (in products_df) or City_ID (in city_df) is
# duplicated on the lookup side, which would otherwise silently multiply sales rows.
product_retail_sales_df = (
    distribution_df
    .merge(products_df, on="SKU_ID", how="left", validate="many_to_one")
    .merge(city_df, on="City_ID", how="left", validate="many_to_one")
)

In [30]:
# Preview the sales + product + city table (the rendered output is truncated by
# pandas to the first and last rows).
# NOTE(review): in the previewed rows Sales / Units_Sold is 0.3 for the "1 L" SKUs
# but 75.0 for the "250 ml Bottle" SKUs -- confirm the Sales column uses one
# consistent unit across SKUs before aggregating it.
product_retail_sales_df

Unnamed: 0,Date,City_ID,SKU_ID,Channel,Units_Sold,Sales,Product Name,Flavor Variant,Launch Date (MM/DD/YYYY),Pack Size (ml/L),Distribution Coverage,City_Name,City_tier,Population_Density(persons/km),Per_Capita_Income (INR)
0,2023-01-01,CT001,SKU1002,Q Commerce,268,80.4,Minute Maid Mixed Fruit Juice,Mixed Fruit,04/23/2015,1 L,General Trade-Retail; E-commerce; Q-Commerce; ...,Delhi,Tier 1,14893,461910
1,2023-01-01,CT001,SKU1004,Q Commerce,168,50.4,Minute Maid Pulpy Orange,Orange,04/23/2015,1 L,General Trade-Retail; E-commerce; Q-Commerce; ...,Delhi,Tier 1,14893,461910
2,2023-01-01,CT001,SKU1001,E Commerce,521,156.3,Minute Maid Apple Juice - Honey Infused,Apple,02/19/2022,1 L,General Trade-Retail; E-commerce,Delhi,Tier 1,14893,461910
3,2023-01-01,CT001,SKU1002,E Commerce,247,74.1,Minute Maid Mixed Fruit Juice,Mixed Fruit,04/23/2015,1 L,General Trade-Retail; E-commerce; Q-Commerce; ...,Delhi,Tier 1,14893,461910
4,2023-01-01,CT001,SKU1007,E Commerce,161,12075.0,Minute Maid 250ml Pulpy Orange,Orange,04/23/2015,250 ml Bottle,General Trade-Retail; E-commerce; HoReCa; Mode...,Delhi,Tier 1,14893,461910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334295,2024-12-31,CT020,SKU1008,General Trade,179,13425.0,Minute Maid 250ml Mixed Fruit Juice,Mixed Fruit,04/23/2015,250 ml Bottle,General Trade-Retail; E-commerce; HoReCa; Mode...,Jammu,Tier 2,3571,186000
334296,2024-12-31,CT020,SKU1002,HoReCa,144,43.2,Minute Maid Mixed Fruit Juice,Mixed Fruit,04/23/2015,1 L,General Trade-Retail; E-commerce; Q-Commerce; ...,Jammu,Tier 2,3571,186000
334297,2024-12-31,CT020,SKU1007,HoReCa,123,9225.0,Minute Maid 250ml Pulpy Orange,Orange,04/23/2015,250 ml Bottle,General Trade-Retail; E-commerce; HoReCa; Mode...,Jammu,Tier 2,3571,186000
334298,2024-12-31,CT020,SKU1004,HoReCa,44,13.2,Minute Maid Pulpy Orange,Orange,04/23/2015,1 L,General Trade-Retail; E-commerce; Q-Commerce; ...,Jammu,Tier 2,3571,186000


In [31]:
# Enforce the invariants this summary was previously only eyeballed for:
# the two left merges must keep exactly one row per sales record, and every SKU_ID
# and City_ID must have found its lookup row (an unmatched key would leave the
# joined columns null).
assert len(product_retail_sales_df) == len(distribution_df), "merge changed the sales row count"
assert product_retail_sales_df["Product Name"].notna().all(), "some SKU_ID values have no match in products_df"
assert product_retail_sales_df["City_Name"].notna().all(), "some City_ID values have no match in city_df"

# Null share per column and row count of the merged table.
get_null_percentage(product_retail_sales_df), len(product_retail_sales_df)

(Date                              0.0
 City_ID                           0.0
 SKU_ID                            0.0
 Channel                           0.0
 Units_Sold                        0.0
 Sales                             0.0
 Product Name                      0.0
 Flavor Variant                    0.0
 Launch Date (MM/DD/YYYY)          0.0
 Pack Size (ml/L)                  0.0
 Distribution Coverage             0.0
 City_Name                         0.0
 City_tier                         0.0
 Population_Density(persons/km)    0.0
 Per_Capita_Income (INR)           0.0
 dtype: float64,
 334300)

In [32]:
# Save the sales + product + city table as an intermediate CSV (row index omitted).
# The output folder is created first so the export also works when
# ../intermediate_data does not exist yet; to_csv itself does not create folders.
retail_sales_path = Path("../intermediate_data/retail_sales_city_product.csv")
retail_sales_path.parent.mkdir(parents=True, exist_ok=True)
product_retail_sales_df.to_csv(retail_sales_path, index=False)

### Exploring competitive data

In [34]:
# Null share per column and row count of the competitive-landscape table.
(get_null_percentage(competitor_df), competitor_df.shape[0])

(Date               0.0
 Channel            0.0
 Brand              0.0
 Mentions_Count     0.0
 Sentiment_Score    0.0
 Share_of_Voice     0.0
 dtype: float64,
 1260)

In [37]:
# Preview the competitive-landscape table (Date, Channel, Brand, Mentions_Count,
# Sentiment_Score, Share_of_Voice); the rendered output is truncated by pandas.
# NOTE(review): Channel is spelled "E Commerce" / "Q Commerce" here, while the
# consumer table's Preferred_Channel shows "E-commerce" / "Q-commerce" -- the labels
# would need normalising before any join or comparison on channel.
competitor_df

Unnamed: 0,Date,Channel,Brand,Mentions_Count,Sentiment_Score,Share_of_Voice
0,2023-01-02,E Commerce,Amazon Solimo,156,61.3,15.0
1,2023-01-02,E Commerce,Minute Maid,346,70.0,33.2
2,2023-01-02,E Commerce,Real Fruit Juice,542,70.9,51.9
3,2023-01-02,General Trade,Minute Maid,551,72.4,53.8
4,2023-01-02,General Trade,Paper Boat,474,67.0,46.2
...,...,...,...,...,...,...
1255,2024-12-30,HoReCa,Paper Boat,232,65.3,31.6
1256,2024-12-30,Modern Trade,B Natural,247,69.8,40.8
1257,2024-12-30,Modern Trade,Minute Maid,359,69.4,59.2
1258,2024-12-30,Q Commerce,Minute Maid,430,60.5,37.4


In [35]:
# Summary statistics for the numeric competitor metrics.
# NOTE(review): in the output, the Mentions_Count maximum (1575) sits far above the
# 75th percentile (521.25) -- worth checking for outliers.
competitor_df.describe()

Unnamed: 0,Mentions_Count,Sentiment_Score,Share_of_Voice
count,1260.0,1260.0,1260.0
mean,409.838889,70.063571,41.667302
std,207.061264,5.547975,12.816972
min,101.0,50.1,13.4
25%,255.0,66.6,31.1
50%,367.0,70.1,40.1
75%,521.25,73.8,51.7
max,1575.0,86.1,67.6
