# Overview
Exploratory data analysis of the ifood dataset detailing customer spending habits and campaign response patterns.
The general process of EDA is as follows:
- variable type analysis/transformations
- variable distributional analysis
- anomaly detection (outliers, separation issues, missing data)
- association analysis (check for high correlation)

data source: https://www.kaggle.com/datasets/jackdaoud/marketing-data

In [None]:
# import needed packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# read in marketing data
ifood = pd.read_csv('ifood_df.csv')

# Variable Type Analysis

In [3]:
# data conversions
# numeric -> categorical (binary/ordinal indicators)

# convert target variable to binary
ifood['Response'] = ifood['Response'].astype('category')

# list all categorical predictor variables
ifood_cat = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmpOverall',
             'education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD',
             'marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow',
             'Kidhome', 'Teenhome', 'Complain'
            ]

# convert all variables in list to categorical
ifood[ifood_cat] = ifood[ifood_cat].apply(pd.Categorical)

In [4]:
print(ifood.dtypes)

Income                   float64
Kidhome                 category
Teenhome                category
Recency                    int64
MntWines                   int64
MntFruits                  int64
MntMeatProducts            int64
MntFishProducts            int64
MntSweetProducts           int64
MntGoldProds               int64
NumDealsPurchases          int64
NumWebPurchases            int64
NumCatalogPurchases        int64
NumStorePurchases          int64
NumWebVisitsMonth          int64
AcceptedCmp3            category
AcceptedCmp4            category
AcceptedCmp5            category
AcceptedCmp1            category
AcceptedCmp2            category
Complain                category
Z_CostContact              int64
Z_Revenue                  int64
Response                category
Age                        int64
Customer_Days              int64
marital_Divorced        category
marital_Married         category
marital_Single          category
marital_Together        category
marital_Wi

# Variable Distributions

In [5]:
desc = ifood.describe()
print(desc)

              Income      Recency     MntWines    MntFruits  MntMeatProducts  \
count    2205.000000  2205.000000  2205.000000  2205.000000      2205.000000   
mean    51622.094785    49.009070   306.164626    26.403175       165.312018   
std     20713.063826    28.932111   337.493839    39.784484       217.784507   
min      1730.000000     0.000000     0.000000     0.000000         0.000000   
25%     35196.000000    24.000000    24.000000     2.000000        16.000000   
50%     51287.000000    49.000000   178.000000     8.000000        68.000000   
75%     68281.000000    74.000000   507.000000    33.000000       232.000000   
max    113734.000000    99.000000  1493.000000   199.000000      1725.000000   

       MntFishProducts  MntSweetProducts  MntGoldProds  NumDealsPurchases  \
count      2205.000000       2205.000000   2205.000000        2205.000000   
mean         37.756463         27.128345     44.057143           2.318367   
std          54.824635         41.130468     51.

# Variable Anomolies

In [6]:
# check for missing data
missing_values = ifood.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print(missing_columns)

Series([], dtype: int64)


In [7]:
# check for separation issues
for var in ifood_cat :
    print(pd.crosstab(ifood['Response'], ifood[var]))

AcceptedCmp1     0   1
Response              
0             1809  63
1              254  79
AcceptedCmp2     0   1
Response              
0             1862  10
1              313  20
AcceptedCmp3     0   1
Response              
0             1786  86
1              256  77
AcceptedCmp4     0    1
Response               
0             1770  102
1              271   62
AcceptedCmp5     0   1
Response              
0             1802  70
1              242  91
AcceptedCmpOverall     0    1   2   3   4
Response                                 
0                   1601  222  39   9   1
1                    146  100  42  35  10
education_2n Cycle     0    1
Response                     
0                   1696  176
1                    311   22
education_Basic     0   1
Response                 
0                1820  52
1                 331   2
education_Graduation    0    1
Response                      
0                     911  961
1                     181  152
education_Master    

In [8]:
# outlier detection

# save all numeric variables as a list
ifood_num = desc.columns.tolist()

# loop through all numeric variables to identify outliers with IQR
for var in ifood_num :
    # calculate 25th and 75th percentile
    Q1 = ifood[var].quantile(0.25)
    Q3 = ifood[var].quantile(0.75)
    IQR = Q3 - Q1

    # define outlier bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # identify outliers
    outliers = ifood[(ifood[var] < lower_bound) | (ifood[var] > upper_bound)]
    print(f"{var} outliers:")
    print(outliers)

Income outliers:
Empty DataFrame
Columns: [Income, Kidhome, Teenhome, Recency, MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds, NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases, NumWebVisitsMonth, AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, AcceptedCmp1, AcceptedCmp2, Complain, Z_CostContact, Z_Revenue, Response, Age, Customer_Days, marital_Divorced, marital_Married, marital_Single, marital_Together, marital_Widow, education_2n Cycle, education_Basic, education_Graduation, education_Master, education_PhD, MntTotal, MntRegularProds, AcceptedCmpOverall]
Index: []

[0 rows x 39 columns]
Recency outliers:
Empty DataFrame
Columns: [Income, Kidhome, Teenhome, Recency, MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds, NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases, NumWebVisitsMonth, AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, AcceptedCmp1, AcceptedCmp2, Complain, Z_Cos

# Correlations

In [None]:
# idenfity highly correlated variables (above 0.75 or below -0.75)
correlation_matrix = ifood.corr()
pos_threshold = 0.75
neg_threshold = -0.75
high_corr_pairs = correlation_matrix.where(
    ((correlation_matrix >= pos_threshold) | (correlation_matrix <= neg_threshold)) & (correlation_matrix != 1.0)).stack()
print("\nHighly Correlated Pairs:")
print(high_corr_pairs)


Highly Correlated Pairs:
Income               MntTotal               0.823066
                     MntRegularProds        0.816879
MntWines             MntTotal               0.902310
                     MntRegularProds        0.901848
MntMeatProducts      MntTotal               0.861392
                     MntRegularProds        0.860663
NumCatalogPurchases  MntTotal               0.791187
                     MntRegularProds        0.778742
MntTotal             Income                 0.823066
                     MntWines               0.902310
                     MntMeatProducts        0.861392
                     NumCatalogPurchases    0.791187
                     MntRegularProds        0.996569
MntRegularProds      Income                 0.816879
                     MntWines               0.901848
                     MntMeatProducts        0.860663
                     NumCatalogPurchases    0.778742
                     MntTotal               0.996569
dtype: float64
