In [1]:
import pandas as pd

In [2]:
# load the campaign data (the file is semicolon-delimited, hence sep=";") and preview the first two rows
ifood_data = pd.read_csv("data/marketing_campaign.csv", sep=";")
ifood_data.head(2)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0


In [3]:
# (number of rows, number of columns) of the loaded frame
ifood_data.shape

(2240, 29)

## Creating the Training, Testing, and Validation Datasets

This dataset contains 2240 customer responses to a marketing campaign whose success rate was only 15%. Because the two response classes are so imbalanced, I decided to train the model on a balanced training dataset. The data remaining after the training set is drawn is then split into a testing set and two validation sets. I have taken this approach for demonstration purposes, given the limited size of the dataset. This may also mean that our model performance may be swayed in either direction. After confirming the data integrity, the datasets are saved.

In [4]:
# check success rate: proportion of rows per Response value
# (normalize=True returns fractions instead of raw counts)
ifood_data['Response'].value_counts(normalize=True)

Response
0    0.850893
1    0.149107
Name: proportion, dtype: float64

In [5]:
SEED = 2  # fixed random_state so every sample/shuffle below is reproducible


def _print_class_counts(label, df):
    """Print how many rows of ``df`` have Response == 0 and Response == 1."""
    # Count each label explicitly. value_counts() orders its result by
    # frequency (not by label) and omits absent classes, so indexing its list
    # positionally can swap the two numbers or raise IndexError.
    count_0 = int((df['Response'] == 0).sum())
    count_1 = int((df['Response'] == 1).sum())
    print(
        f"{label}\n"
        f"    response 0 values: {count_0} \n"
        f"    response 1 values: {count_1}\n"
        f"    "
    )


def split_data(original_df, num_0, num_1):
    """Split ``original_df`` into a sampled frame and the rows left over.

    Samples ``num_0`` rows with ``Response == 0`` and ``num_1`` rows with
    ``Response == 1`` (both with ``random_state=SEED``), concatenates and
    shuffles them, and returns them together with every row of
    ``original_df`` that was not sampled. A short summary of the class counts
    of both frames and an index-overlap check is printed.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame with a ``Response`` column holding 0/1 labels. Rows are removed
        from the remainder by index label, so the index should be unique.
    num_0 : int
        Number of ``Response == 0`` rows to sample.
    num_1 : int
        Number of ``Response == 1`` rows to sample.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(sampled_df, remaining_df)``; the original index labels are kept.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``num_0`` or ``num_1`` exceeds the
        number of rows available for that class.
    """
    # get each response value
    is_positive = original_df['Response'] == 1
    response_0 = original_df[original_df['Response'] == 0]
    response_1 = original_df[is_positive]

    # randomly sample the requested number of rows from each class
    response_0 = response_0.sample(n=num_0, random_state=SEED)
    response_1 = response_1.sample(n=num_1, random_state=SEED)

    # join then shuffle the data
    train_df = pd.concat([response_0, response_1])
    train_df = train_df.sample(frac=1, random_state=SEED)
    _print_class_counts("first df", train_df)

    # get remaining data: everything whose index label was not sampled
    remaining_df = original_df.drop(train_df.index)
    _print_class_counts("second df", remaining_df)

    # check that no index label ended up in both frames; compare against the
    # remaining frame's *index* (passing the frame itself would compare
    # against its column names)
    overlap = int(train_df.index.isin(remaining_df.index).sum())
    print(f"duplicate indices: {overlap}")

    return train_df, remaining_df

In [6]:
# draw a balanced training set from the ifood dataset (200 rows per Response class);
# every row that was not sampled is kept in remaining_df1
train_df, remaining_df1 = split_data(ifood_data, 200, 200)

first df
    response 0 values: 200 
    response 1 values: 200
    
second df
    response 0 values: 1706 
    response 1 values: 134
    
duplicate indices: 0


In [7]:
# we then get our testing dataset from the remaining data
# (580 rows with Response == 0 and 51 rows with Response == 1); the rest goes to remaining_df2
test_df, remaining_df2 = split_data(remaining_df1, 580, 51)

first df
    response 0 values: 580 
    response 1 values: 51
    
second df
    response 0 values: 1126 
    response 1 values: 83
    
duplicate indices: 0


In [8]:
# finally, we get two validation datasets: val_df1 is sampled (563 / 43 rows per class)
# and val_df2 receives whatever is left over
val_df1, val_df2 = split_data(remaining_df2, 563, 43)

first df
    response 0 values: 563 
    response 1 values: 43
    
second df
    response 0 values: 563 
    response 1 values: 40
    
duplicate indices: 0


In [9]:
# saving the datasets (the calls below are commented out; uncomment them to write the CSV files)

# train_df.to_csv("data/train_df.csv", index=False)
# test_df.to_csv("data/test_df.csv", index=False)
# val_df1.to_csv("data/val_df1.csv", index=False)
# val_df2.to_csv("data/val_df2.csv", index=False)