## **Encoding**

All variables need to be numeric before modeling, so categorical variables must first be encoded as numbers.


In [None]:
import pandas as pd
import numpy as np

## Read pva97nk.csv from the GitHub repository into a DataFrame
DATA_URL = 'https://raw.githubusercontent.com/martinwg/ISA591/main/data/pva97nk.csv'
df = pd.read_csv(DATA_URL)

In [None]:
## Preview the first rows to see how the raw values are formatted
df.head()

Unnamed: 0,TARGET_B,ID,TARGET_D,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,...,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome
0,0,14974,,2,4,1,3,$17.00,$13.50,$9.25,...,13,A,0,0,,F,U,$0,0,$0
1,0,6294,,1,8,0,3,$20.00,$20.00,$15.88,...,24,A,0,23,67.0,F,U,$186800,85,$0
2,1,46110,$4.00,6,41,3,20,$6.00,$5.17,$3.73,...,22,S,1,0,,M,U,$87600,36,$38750
3,1,185937,$10.00,3,12,3,8,$10.00,$8.67,$8.50,...,16,E,1,0,,M,U,$139200,27,$38942
4,0,29637,,1,1,1,1,$20.00,$20.00,$20.00,...,6,F,0,35,53.0,M,U,$168100,37,$71509


In [None]:
## Skewness of every numeric column
##   positive value -> right-skewed
##   negative value -> left-skewed
##   between -1 and 1 -> fairly symmetric, should be ok
##   |skew| > 1 -> we may need to transform
##                 (linear models: assumptions; non-linear models: outliers and anomalies)
numeric_df = df.select_dtypes(include = "number")
numeric_df.skew()

Unnamed: 0,0
TARGET_B,0.0
ID,-0.057613
GiftCnt36,1.288353
GiftCntAll,1.863109
GiftCntCard36,1.172452
GiftCntCardAll,1.331353
GiftTimeLast,-0.778047
GiftTimeFirst,0.195399
PromCnt12,2.873723
PromCnt36,0.261958


In [None]:
## Categorical (object-dtype) columns:
## count the number of distinct levels in each one
object_cols = df.select_dtypes(include = 'O')
object_cols.nunique()

Unnamed: 0,0
TARGET_D,70
GiftAvgLast,90
GiftAvg36,654
GiftAvgAll,1584
GiftAvgCard36,399
StatusCat96NK,6
DemGender,3
DemHomeOwner,2
DemMedHomeValue,2533
DemMedIncome,4463


In [None]:
## Column dtypes and non-null counts; object columns still need cleaning or encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9686 entries, 0 to 9685
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TARGET_B          9686 non-null   int64  
 1   ID                9686 non-null   int64  
 2   TARGET_D          4843 non-null   object 
 3   GiftCnt36         9686 non-null   int64  
 4   GiftCntAll        9686 non-null   int64  
 5   GiftCntCard36     9686 non-null   int64  
 6   GiftCntCardAll    9686 non-null   int64  
 7   GiftAvgLast       9686 non-null   object 
 8   GiftAvg36         9686 non-null   object 
 9   GiftAvgAll        9686 non-null   object 
 10  GiftAvgCard36     7906 non-null   object 
 11  GiftTimeLast      9686 non-null   int64  
 12  GiftTimeFirst     9686 non-null   int64  
 13  PromCnt12         9686 non-null   int64  
 14  PromCnt36         9686 non-null   int64  
 15  PromCntAll        9686 non-null   int64  
 16  PromCntCard12     9686 non-null   int64  


In [None]:
## Fix the currency columns: they were read as strings such as '$17.00',
## so strip the dollar sign and convert to float.
## regex = False makes '$' a literal character. As a regular expression '$' is
## the end-of-string anchor, and the default value of `regex` is not the same
## in every pandas version, so it is set explicitly here.
## Missing values stay missing (NaN) through both steps.
currency_cols = [
    'DemMedIncome',
    'TARGET_D',
    'GiftAvgLast',
    'GiftAvg36',
    'GiftAvgAll',
    'GiftAvgCard36',
    'DemMedHomeValue',
]

for col in currency_cols:
    df[col] = df[col].str.replace('$', '', regex = False).astype('float')

In [None]:
## DemCluster is read as numeric, but its values identify a cluster;
## cast it to object so it is handled as a categorical variable
df['DemCluster'] = df['DemCluster'].astype(object)

In [None]:
## Re-check the number of distinct levels per object column after the fixes above
object_cols = df.select_dtypes(include = 'O')
object_cols.nunique()

Unnamed: 0,0
StatusCat96NK,6
DemCluster,54
DemGender,3
DemHomeOwner,2


In [None]:
## Numeric variables
## Associations (correlation matrix)
## Encode the categorical variables first, before computing a correlation matrix or correlation heatmap

In [None]:
## Three independent copies of df, one per encoding strategy:
## dummy-encoding, one-hot encoding, and mixed encoding
df_dummy, df_onehot, df_mixed = (df.copy() for _ in range(3))

In [None]:
## Frequency of each StatusCat96NK level
df.StatusCat96NK.value_counts()

Unnamed: 0_level_0,count
StatusCat96NK,Unnamed: 1_level_1
A,5826
S,2365
F,660
N,574
E,227
L,34


In [None]:
## DUMMY ENCODING
## drop_first = True drops the first level of each variable,
## so a variable with k levels becomes k - 1 indicator columns
dummy_cols = ['StatusCat96NK', 'DemGender', 'DemHomeOwner']
df_dummy = pd.get_dummies(df_dummy, columns = dummy_cols, drop_first = True)
df_dummy.head()

Unnamed: 0,TARGET_B,ID,TARGET_D,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,...,DemPctVeterans,DemMedIncome,StatusCat96NK_E,StatusCat96NK_F,StatusCat96NK_L,StatusCat96NK_N,StatusCat96NK_S,DemGender_M,DemGender_U,DemHomeOwner_U
0,0,14974,,2,4,1,3,17.0,13.5,9.25,...,0,0.0,False,False,False,False,False,False,False,True
1,0,6294,,1,8,0,3,20.0,20.0,15.88,...,85,0.0,False,False,False,False,False,False,False,True
2,1,46110,4.0,6,41,3,20,6.0,5.17,3.73,...,36,38750.0,False,False,False,False,True,True,False,True
3,1,185937,10.0,3,12,3,8,10.0,8.67,8.5,...,27,38942.0,True,False,False,False,False,True,False,True
4,0,29637,,1,1,1,1,20.0,20.0,20.0,...,37,71509.0,False,True,False,False,False,True,False,True


In [None]:
## ONE-HOT ENCODING
## every level gets its own indicator column (nothing is dropped)
onehot_cols = ['StatusCat96NK', 'DemGender', 'DemHomeOwner']
df_onehot = pd.get_dummies(df_onehot, columns = onehot_cols)
df_onehot.head()

Unnamed: 0,TARGET_B,ID,TARGET_D,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,...,StatusCat96NK_E,StatusCat96NK_F,StatusCat96NK_L,StatusCat96NK_N,StatusCat96NK_S,DemGender_F,DemGender_M,DemGender_U,DemHomeOwner_H,DemHomeOwner_U
0,0,14974,,2,4,1,3,17.0,13.5,9.25,...,False,False,False,False,False,True,False,False,False,True
1,0,6294,,1,8,0,3,20.0,20.0,15.88,...,False,False,False,False,False,True,False,False,False,True
2,1,46110,4.0,6,41,3,20,6.0,5.17,3.73,...,False,False,False,False,True,False,True,False,False,True
3,1,185937,10.0,3,12,3,8,10.0,8.67,8.5,...,True,False,False,False,False,False,True,False,False,True
4,0,29637,,1,1,1,1,20.0,20.0,20.0,...,False,True,False,False,False,False,True,False,False,True


In [None]:
## Level counts for the two variables that get different encodings in the mixed example.
## NOTE: a notebook cell only displays its last expression, so just the
## StatusCat96NK counts appear in the output.
df.DemHomeOwner.value_counts()  ## dummy-encoding
df.StatusCat96NK.value_counts()  ## label encoding changes the variable to num (1 model df)

Unnamed: 0_level_0,count
StatusCat96NK,Unnamed: 1_level_1
A,5826
S,2365
F,660
N,574
E,227
L,34


In [None]:
## MIXED ENCODING
## Combine different encodings in the same data frame:
##   - label encoding for StatusCat96NK
##   - indicator (dummy) columns for DemHomeOwner

from sklearn.preprocessing import LabelEncoder

## encoder instance
le = LabelEncoder()

## .fit_transform() learns the levels and returns one integer code per row
## NOTE: THIS IS JUST AN EXAMPLE - for a categorical variable like this one
## we normally do not want label encoding
status_codes = le.fit_transform(df_mixed['StatusCat96NK'])
df_mixed['StatusCat96NK'] = status_codes
df_mixed.head()

Unnamed: 0,TARGET_B,ID,TARGET_D,GiftCnt36,GiftCntAll,GiftCntCard36,GiftCntCardAll,GiftAvgLast,GiftAvg36,GiftAvgAll,...,PromCntCardAll,StatusCat96NK,StatusCatStarAll,DemCluster,DemAge,DemGender,DemHomeOwner,DemMedHomeValue,DemPctVeterans,DemMedIncome
0,0,14974,,2,4,1,3,17.0,13.5,9.25,...,13,0,0,0,,F,U,0.0,0,0.0
1,0,6294,,1,8,0,3,20.0,20.0,15.88,...,24,0,0,23,67.0,F,U,186800.0,85,0.0
2,1,46110,4.0,6,41,3,20,6.0,5.17,3.73,...,22,5,1,0,,M,U,87600.0,36,38750.0
3,1,185937,10.0,3,12,3,8,10.0,8.67,8.5,...,16,1,1,0,,M,U,139200.0,27,38942.0
4,0,29637,,1,1,1,1,20.0,20.0,20.0,...,6,2,0,35,53.0,M,U,168100.0,37,71509.0


In [None]:
## Indicator columns for the remaining categorical variables;
## drop_first is not set, so every level keeps its own column
indicator_cols = ['DemHomeOwner', "DemGender"]
df_mixed = pd.get_dummies(df_mixed, columns = indicator_cols)

In [None]:
## Check the column dtypes of the mixed-encoded data frame
df_mixed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9686 entries, 0 to 9685
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TARGET_B          9686 non-null   int64  
 1   ID                9686 non-null   int64  
 2   TARGET_D          4843 non-null   float64
 3   GiftCnt36         9686 non-null   int64  
 4   GiftCntAll        9686 non-null   int64  
 5   GiftCntCard36     9686 non-null   int64  
 6   GiftCntCardAll    9686 non-null   int64  
 7   GiftAvgLast       9686 non-null   float64
 8   GiftAvg36         9686 non-null   float64
 9   GiftAvgAll        9686 non-null   float64
 10  GiftAvgCard36     7906 non-null   float64
 11  GiftTimeLast      9686 non-null   int64  
 12  GiftTimeFirst     9686 non-null   int64  
 13  PromCnt12         9686 non-null   int64  
 14  PromCnt36         9686 non-null   int64  
 15  PromCntAll        9686 non-null   int64  
 16  PromCntCard12     9686 non-null   int64  


In [None]:
## Keep the dummy-encoded version and write it to disk without the index column
## DemCluster is still an object column and still needs fixing (dimension reduction)
output_path = 'vet_data_clean.csv'
df_dummy.to_csv(output_path, index = False)