### Import libraries and load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [2]:
# The source file is tab-separated despite its .csv extension.
data = pd.read_csv('marketing_campaign.csv', delimiter='\t')

In [3]:
# Preview the first five rows to check that the tab-separated file parsed into the expected columns.
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


### Drop duplicates and NAs

In [4]:
# Work on a de-duplicated copy so the raw `data` frame is left untouched.
# NOTE(review): no `subset` is given, so every column (including `ID`) takes part in the
# comparison; rows that differ only by `ID` are kept — confirm that is intended.
data_new = data.drop_duplicates().copy()

In [5]:
# Column dtypes and non-null counts (shows which columns contain missing values).
data_new.info()

# Frequency of each value of the `Complain` flag; the trailing expression is what the cell displays.
complain_counts = data_new['Complain'].value_counts()
complain_counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

Complain
0    2219
1      21
Name: count, dtype: int64

In [6]:
# Keep only the rows that have no missing value in any column.
data_new = data_new.dropna(axis=0, how='any')

In [7]:
# Re-check the non-null counts now that rows with missing values have been removed.
data_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2216 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2216 non-null   int64  
 1   Year_Birth           2216 non-null   int64  
 2   Education            2216 non-null   object 
 3   Marital_Status       2216 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2216 non-null   int64  
 6   Teenhome             2216 non-null   int64  
 7   Dt_Customer          2216 non-null   object 
 8   Recency              2216 non-null   int64  
 9   MntWines             2216 non-null   int64  
 10  MntFruits            2216 non-null   int64  
 11  MntMeatProducts      2216 non-null   int64  
 12  MntFishProducts      2216 non-null   int64  
 13  MntSweetProducts     2216 non-null   int64  
 14  MntGoldProds         2216 non-null   int64  
 15  NumDealsPurchases    2216 non-null   int64 

### Change data types and create new columns

In [8]:
# Enrolment dates are stored as day-month-year strings (e.g. "04-09-2012").
DATE_FORMAT = "%d-%m-%Y"
data_new['Dt_Customer'] = pd.to_datetime(data_new['Dt_Customer'], format=DATE_FORMAT)

In [9]:
# Dependants = Kidhome + Teenhome; the combined column is used for the EDA.
data_new['Dependants'] = data_new['Kidhome'].add(data_new['Teenhome'])

In [10]:
# Age is measured against a fixed reference year.
# NOTE(review): 2014 is presumably the year the data was collected — confirm.
REFERENCE_YEAR = 2014
data_new['Age'] = REFERENCE_YEAR - data_new['Year_Birth']

### Drop outliers in Age and Income columns, and absurd values in Marital Status

In [11]:
# Remove implausible rows in a single pass:
#   * an Age of 90 or more, or an Income of 600000 or more, is treated as an outlier;
#   * 'Absurd' and 'YOLO' do not describe a clear marital status, and it is unclear whether
#     'Alone' belongs with Single or with Divorced.
# Only a few rows are affected, so little information is lost.
plausible_age = data_new["Age"] < 90
plausible_income = data_new["Income"] < 600000
unclear_status = data_new['Marital_Status'].isin(['Absurd', 'Alone', 'YOLO'])
data_new = data_new[plausible_age & plausible_income & ~unclear_status]

### Create sub-dataframes for EDA and cluster analysis

In [12]:
# Campaign columns: the accepted-campaign flags, the two Z_* columns and the Response flag.
campaign_cols = [
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
    'Z_CostContact', 'Z_Revenue', 'Response',
]
data_campain = data_new[campaign_cols]

In [13]:
# Purchase-count columns per channel, plus the monthly web-visit count.
purchase_count_cols = [
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]
num_purchases = data_new[purchase_count_cols]

In [14]:
# Amount-spent columns, one per product category.
amount_cols = ['MntWines', 'MntFruits', 'MntMeatProducts',
               'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
data_monetary = data_new[amount_cols]

### Drop columns for the basic scenario

In [15]:
# Basic scenario: initially we decided to keep only the demographics and the amount spent per
# product category, as they seemed the most useful features for clustering.
basic_scenario_dropped = [
    'ID', 'Year_Birth',
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
    'Z_CostContact', 'Z_Revenue', 'Response',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth', 'Complain',
    'Dt_Customer', 'Recency',
]

# Re-assign instead of dropping in place, and skip columns that are already gone, so that
# re-running this cell does not raise a KeyError.
data_new = data_new.drop(columns=basic_scenario_dropped, errors='ignore')

In [16]:
# Preview the columns that remain after the drop above.
data_new.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,Dependants,Age
0,Graduation,Single,58138.0,0,0,635,88,546,172,88,88,0,57
1,Graduation,Single,46344.0,1,1,11,1,6,2,1,6,2,60
2,Graduation,Together,71613.0,0,0,426,49,127,111,21,42,0,49
3,Graduation,Together,26646.0,1,0,11,4,20,10,3,5,1,30
4,PhD,Married,58293.0,1,0,173,43,118,46,27,15,1,33


### One hot encoding

In [17]:
# One-hot encode the two categorical columns; each category becomes its own indicator column.
categorical_cols = ['Education', 'Marital_Status']
data_encoded = pd.get_dummies(data_new, columns=categorical_cols)

In [18]:
# Display the encoded frame (the notebook renders a truncated view of its first and last rows).
data_encoded

Unnamed: 0,Income,Kidhome,Teenhome,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,Dependants,Age,Education_2n Cycle,Education_Basic,Education_Graduation,Education_Master,Education_PhD,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow
0,58138.0,0,0,635,88,546,172,88,88,0,57,False,False,True,False,False,False,False,True,False,False
1,46344.0,1,1,11,1,6,2,1,6,2,60,False,False,True,False,False,False,False,True,False,False
2,71613.0,0,0,426,49,127,111,21,42,0,49,False,False,True,False,False,False,False,False,True,False
3,26646.0,1,0,11,4,20,10,3,5,1,30,False,False,True,False,False,False,False,False,True,False
4,58293.0,1,0,173,43,118,46,27,15,1,33,False,False,False,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,61223.0,0,1,709,43,182,42,118,247,1,47,False,False,True,False,False,False,True,False,False,False
2236,64014.0,2,1,406,0,30,0,0,8,3,68,False,False,False,False,True,False,False,False,True,False
2237,56981.0,0,0,908,48,217,32,12,24,0,33,False,False,True,False,False,True,False,False,False,False
2238,69245.0,0,1,428,30,214,80,30,61,1,58,False,False,False,True,False,False,False,False,True,False


In [19]:
# Write every frame built in this notebook to its own CSV file, without the index column.
frames_to_export = {
    'data_encoded.csv': data_encoded,
    'data_new.csv': data_new,
    'data.csv': data,
    'data_campain.csv': data_campain,
    'num_purchases.csv': num_purchases,
    'data_monetary.csv': data_monetary,
}
for filename, frame in frames_to_export.items():
    frame.to_csv(filename, index=False)