# 2. Data Pre-Processing & Dimensionality Reduction

In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
df = pd.read_csv('../Data/data_clean_for_analysis.csv', encoding = "ISO-8859-1")

In [3]:
df.head(3)

Unnamed: 0,DaysBeingCustomer,Recency,Age,Income,Education,Marital_Status,Kids,MntWines,MntFruits,MntMeatProducts,...,MntGoldProds,NumDealsPurchases,NumStorePurchases,NumCatalogPurchases,NumWebPurchases,NumWebVisitsMonth,TotalProductsL2Y,TotalPurchasesL2Y,AvgProductsPerTrip,SharePurchasesOnDeal
0,1000,58,57,58138.0,Graduate,Single,0,635,88,546,...,88,3,4,10,8,7,1617,22,73.5,13.6
1,154,38,60,46344.0,Graduate,Single,2,11,1,6,...,6,2,2,1,1,5,27,4,6.8,50.0
2,501,26,49,71613.0,Graduate,Together,0,426,49,127,...,42,1,10,2,8,4,776,20,38.8,5.0


### I. Data Pre-Processing

As we have 2 categorical columns in our dataset (`Education` & `Marital_Status`) we need to transform them into ordinal to carry out our models later. I will do so with the method of One Hot Encoder to avoid making the algorithm think there is a specific hierarchy in those variables. 

#### 1. Education

In [4]:
df['Education'].unique()

array(['Graduate', 'Postgraduate', 'Undergraduate'], dtype=object)

In [5]:
labels = ["Ed_" + str(i) for i in list(df['Education'].unique())]

In [6]:
onehotencoder = preprocessing.OneHotEncoder()

In [7]:
education = onehotencoder.fit_transform(df['Education'].values.reshape(-1,1)).toarray()

In [8]:
education

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [9]:
df[labels] = pd.DataFrame(education, index = df.index)

In [10]:
df.sample(3)

Unnamed: 0,DaysBeingCustomer,Recency,Age,Income,Education,Marital_Status,Kids,MntWines,MntFruits,MntMeatProducts,...,NumCatalogPurchases,NumWebPurchases,NumWebVisitsMonth,TotalProductsL2Y,TotalPurchasesL2Y,AvgProductsPerTrip,SharePurchasesOnDeal,Ed_Graduate,Ed_Postgraduate,Ed_Undergraduate
2181,292,18,57,53233.0,Undergraduate,Married,1,28,0,9,...,0,1,4,40,4,10.0,25.0,0.0,0.0,1.0
1889,707,62,29,31158.0,Graduate,Married,1,25,6,16,...,0,2,8,72,5,14.4,40.0,1.0,0.0,0.0
1295,319,14,68,78569.0,Graduate,Married,0,558,79,622,...,6,4,1,1736,14,124.0,7.1,1.0,0.0,0.0


#### 2. Marital Status

In [15]:
df['Marital_Status'].unique()

array(['Single', 'Together', 'Married', 'Divorced', 'Widow'], dtype=object)

In [16]:
labels = ["MS_" + str(i) for i in list(df['Marital_Status'].unique())]

In [17]:
marital_status = onehotencoder.fit_transform(df['Marital_Status'].values.reshape(-1,1)).toarray()

In [18]:
df[labels] = pd.DataFrame(marital_status, index = df.index)

In [19]:
df.sample(2)

Unnamed: 0,DaysBeingCustomer,Recency,Age,Income,Education,Marital_Status,Kids,MntWines,MntFruits,MntMeatProducts,...,AvgProductsPerTrip,SharePurchasesOnDeal,Ed_Graduate,Ed_Postgraduate,Ed_Undergraduate,MS_Single,MS_Together,MS_Married,MS_Divorced,MS_Widow
1108,731,1,31,81698.0,Undergraduate,Single,0,709,45,115,...,73.6,6.7,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1634,693,23,69,69755.0,Graduate,Married,0,217,77,373,...,42.6,4.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
df.drop(columns = ['Education', 'Marital_Status'], inplace = True)

In [22]:
df.sample()

Unnamed: 0,DaysBeingCustomer,Recency,Age,Income,Kids,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,...,AvgProductsPerTrip,SharePurchasesOnDeal,Ed_Graduate,Ed_Postgraduate,Ed_Undergraduate,MS_Single,MS_Together,MS_Married,MS_Divorced,MS_Widow
409,645,1,59,66465.0,1,1200,0,204,38,29,...,46.4,9.4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [23]:
df.dtypes

DaysBeingCustomer         int64
Recency                   int64
Age                       int64
Income                  float64
Kids                      int64
MntWines                  int64
MntFruits                 int64
MntMeatProducts           int64
MntFishProducts           int64
MntSweetProducts          int64
MntGoldProds              int64
NumDealsPurchases         int64
NumStorePurchases         int64
NumCatalogPurchases       int64
NumWebPurchases           int64
NumWebVisitsMonth         int64
TotalProductsL2Y          int64
TotalPurchasesL2Y         int64
AvgProductsPerTrip      float64
SharePurchasesOnDeal    float64
Ed_Graduate             float64
Ed_Postgraduate         float64
Ed_Undergraduate        float64
MS_Single               float64
MS_Together             float64
MS_Married              float64
MS_Divorced             float64
MS_Widow                float64
dtype: object

### II. Dimensionality Reduction - PCA