In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the semicolon-delimited campaign file and preview its first two rows.
df = pd.read_csv("marketing_campaign.csv", delimiter=";")
df.head(2)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0


In [3]:
# dimensions of the freshly loaded data as (rows, columns)
df.shape

(2240, 29)

In [4]:
# remove irrelevant columns given that they have only a common value
constant_columns = ['Z_CostContact', 'Z_Revenue']

for column in constant_columns:
    # Verify the claim instead of only displaying the first value: a column
    # holding more than one distinct value must not be dropped silently.
    assert df[column].nunique(dropna=False) == 1, f"{column} is not constant"
    print(f"Unique values for {column}: {df[column].unique()[0]}")

df = df.drop(columns=constant_columns)

Unique values for Z_CostContact: 3
Unique values for Z_Revenue: 11


In [5]:
# re-check the dimensions after dropping Z_CostContact and Z_Revenue
df.shape

(2240, 27)

## Creating the Training, Testing, and Validation Datasets

This dataset contains 2,240 customer responses to a marketing campaign whose success rate was only about 15%. The classes are therefore imbalanced, but for now we simply split the data at random into three parts: training, testing, and validation sets.

In [6]:
# check success rate: fraction of rows for each Response value (class balance of the target)
df['Response'].value_counts(normalize=True)

0    0.850893
1    0.149107
Name: Response, dtype: float64

### Using Scikit-Learn

In [7]:
# Separate the label column from the predictors.
target = df['Response']
features = df.drop(columns=['Response'])

# Both pieces should report the same number of rows.
features.shape, target.shape

((2240, 26), (2240,))

In [8]:
# split data (85% training, 11.25% testing, 3.75% validation)
SEED = 0

# First hold out 15% of the rows as a temporary pool ...
X_train, X_temp, y_train, y_temp = train_test_split(
    features,
    target,
    test_size=0.15,
    random_state=SEED,
)

# ... then give a quarter of that pool to validation and the rest to testing.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=SEED,
)

# Report the shapes of the feature splits, then of the target splits.
for train_part, test_part, val_part, temp_part in (
    (X_train, X_test, X_val, X_temp),
    (y_train, y_test, y_val, y_temp),
):
    print(train_part.shape, test_part.shape, val_part.shape, temp_part.shape)

(1904, 26) (252, 26) (84, 26) (336, 26)
(1904,) (252,) (84,) (336,)


In [9]:
# Write every split to its own CSV file. The row index is not written
# (index=False), so features and targets line up by row order only.
split_files = {
    'training_features.csv': X_train,
    'testing_features.csv': X_test,
    'validation_features.csv': X_val,
    'training_targets.csv': y_train,
    'testing_targets.csv': y_test,
    'validation_targets.csv': y_val,
}

for csv_name, split_part in split_files.items():
    split_part.to_csv(csv_name, index=False)

### For a Balanced Dataset

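An alternative split that uses an equally balanced training set (the same number of rows for each response value), though the results obtained this way tend to be poor.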

In [10]:

def _print_class_counts(label, frame):
    """Print how many rows of ``frame`` carry each ``Response`` value."""
    # Count each class explicitly. ``value_counts()`` orders its result by
    # frequency and omits absent classes, so indexing it positionally can
    # swap the labels or raise IndexError.
    count_0 = int((frame['Response'] == 0).sum())
    count_1 = int((frame['Response'] == 1).sum())
    print(f"{label}\n    response 0 values: {count_0} \n    response 1 values: {count_1}\n    ")


def split_data(original_df, num_0, num_1, seed=None):
    """Draw a random subset with a fixed number of rows per response class.

    Samples ``num_0`` rows where ``Response == 0`` and ``num_1`` rows where
    ``Response == 1`` (without replacement), shuffles them together, and
    returns that subset together with every row of ``original_df`` that was
    not drawn. The class counts of both frames and the number of index labels
    shared between them are printed.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame with a ``Response`` column holding 0/1 values. The undrawn rows
        are found by dropping the drawn index labels, so repeated labels in
        the index would be removed together.
    num_0 : int
        Number of ``Response == 0`` rows to draw.
    num_1 : int
        Number of ``Response == 1`` rows to draw.
    seed : int, optional
        Random state used for sampling and shuffling. When omitted, the
        notebook-level ``SEED`` is used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, remaining_df)``: the shuffled sample and the rest.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when more rows are requested
        than a class contains.
    """
    if seed is None:
        seed = SEED

    # randomly draw the requested number of rows from each response class
    response_0 = original_df[original_df['Response'] == 0].sample(n=num_0, random_state=seed)
    response_1 = original_df[original_df['Response'] == 1].sample(n=num_1, random_state=seed)

    # join then shuffle so the two classes are interleaved
    train_df = pd.concat([response_0, response_1]).sample(frac=1, random_state=seed)
    _print_class_counts("first df", train_df)

    # everything that was not drawn
    remaining_df = original_df.drop(train_df.index)
    _print_class_counts("second df", remaining_df)

    # Compare index labels with index labels. Passing the frame itself to
    # ``isin`` would compare against its column names instead.
    overlap = int(train_df.index.isin(remaining_df.index).sum())
    print(f"duplicate indices: {overlap}")

    return train_df, remaining_df

In [11]:
# split the ifood dataset to get an equally balanced training set (200 rows per response value);
# the rows that were not sampled are kept in remaining_df1
train_df, remaining_df1 = split_data(df, 200, 200)

first df
    response 0 values: 200 
    response 1 values: 200
    
second df
    response 0 values: 1706 
    response 1 values: 134
    
duplicate indices: 0


In [12]:
# we then get our testing dataset (580 response-0 rows, 51 response-1 rows) from the remaining dataset
test_df, remaining_df2 = split_data(remaining_df1, 580, 51)

first df
    response 0 values: 580 
    response 1 values: 51
    
second df
    response 0 values: 1126 
    response 1 values: 83
    
duplicate indices: 0


In [13]:
# finally, we get two validation datasets from what is left:
# val_df1 is the sampled part (563 response-0 rows, 43 response-1 rows), val_df2 is the remainder
val_df1, val_df2 = split_data(remaining_df2, 563, 43)

first df
    response 0 values: 563 
    response 1 values: 43
    
second df
    response 0 values: 563 
    response 1 values: 40
    
duplicate indices: 0


In [14]:
# saving the datasets (currently disabled: uncomment the lines below to write the four splits to data/)

# train_df.to_csv("data/train_df.csv", index=False)
# test_df.to_csv("data/test_df.csv", index=False)
# val_df1.to_csv("data/val_df1.csv", index=False)
# val_df2.to_csv("data/val_df2.csv", index=False)