In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# Raise the display limits so wide and long frames are shown in full.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [14]:
# Debug switch: when True, NROWS is set to 500000 and only that many rows are loaded.
DEBUG = False

In [15]:
# Row cap handed to pd.read_csv: 500000 rows in debug mode, None (no cap) otherwise.
NROWS = 500000 if DEBUG else None

In [16]:
%%time
# Load the train and test CSVs; nrows=NROWS caps the number of rows read (None reads everything).
train = pd.read_csv('../input/porto-seguro-safe-driver-prediction/train.csv', nrows=NROWS)
test = pd.read_csv('../input/porto-seguro-safe-driver-prediction/test.csv', nrows=NROWS)

CPU times: user 6.25 s, sys: 747 ms, total: 6.99 s
Wall time: 7.02 s


In [17]:
# Peek at the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10,1,-1,0,1,4,1,0,0,1,12,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7,1,0,0,1,11,1,1,3,1,104,1,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11,1,-1,0,-1,14,1,1,2,1,82,3,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0


In [18]:
# Names of all columns whose name contains the substring 'cat'.
cat_cols = list(filter(lambda name: 'cat' in name, train.columns))

In [19]:
# Print every 'cat' column next to its number of distinct values.
for name, n_unique in train[cat_cols].nunique().items():
    print(name, n_unique)

ps_ind_02_cat 5
ps_ind_04_cat 3
ps_ind_05_cat 8
ps_car_01_cat 13
ps_car_02_cat 3
ps_car_03_cat 3
ps_car_04_cat 10
ps_car_05_cat 3
ps_car_06_cat 18
ps_car_07_cat 3
ps_car_08_cat 2
ps_car_09_cat 6
ps_car_10_cat 3
ps_car_11_cat 104


In [20]:
# drop_duplicates() removes duplicate *rows* and returns a new frame (it is not
# in-place), so the result must be assigned back for the call to have any effect.
train = train.drop_duplicates()
train.shape

(595212, 59)

In [21]:
# (rows, columns) of the test frame.
test.shape

(892816, 58)

In [22]:
train.info()
# Missing values are encoded as -1 in this dataset, so every column reports only non-null entries.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              595212 non-null  int64  
 1   target          595212 non-null  int64  
 2   ps_ind_01       595212 non-null  int64  
 3   ps_ind_02_cat   595212 non-null  int64  
 4   ps_ind_03       595212 non-null  int64  
 5   ps_ind_04_cat   595212 non-null  int64  
 6   ps_ind_05_cat   595212 non-null  int64  
 7   ps_ind_06_bin   595212 non-null  int64  
 8   ps_ind_07_bin   595212 non-null  int64  
 9   ps_ind_08_bin   595212 non-null  int64  
 10  ps_ind_09_bin   595212 non-null  int64  
 11  ps_ind_10_bin   595212 non-null  int64  
 12  ps_ind_11_bin   595212 non-null  int64  
 13  ps_ind_12_bin   595212 non-null  int64  
 14  ps_ind_13_bin   595212 non-null  int64  
 15  ps_ind_14       595212 non-null  int64  
 16  ps_ind_15       595212 non-null  int64  
 17  ps_ind_16_

- Metadata

In [23]:
# Build one metadata record per column of `train`: its role, its measurement
# level, whether it should be kept, and its dtype. `data` ends up as a list of
# dicts with the keys varname / role / level / keep / dtype.
data = []

for f in train.columns:
    col_dtype = train[f].dtype

    # Role of the column in the modelling problem.
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Measurement level. The dtype checks go through pandas' helpers instead of
    # `dtype == float` / `dtype == int`: the builtin comparison only matches the
    # platform-default width, so e.g. float32 or (on some platforms) int64
    # columns would fall through and silently reuse the previous column's level.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(col_dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(col_dtype):
        level = 'ordinal'
    else:
        # Fail loudly rather than recording a stale or undefined level.
        raise ValueError(
            'Cannot infer a level for column {!r} with dtype {}'.format(f, col_dtype)
        )

    data.append({
        'varname': f,
        'role': role,
        'level': level,
        'keep': f != 'id',  # every column is kept except the id
        'dtype': col_dtype,
    })

In [24]:
# Assemble the per-variable records into a frame with a fixed column order.
meta_columns = ['varname', 'role', 'level', 'keep', 'dtype']
meta = pd.DataFrame(data=data, columns=meta_columns)

In [25]:
# Index the metadata by variable name so rows can be addressed with meta.loc[<varname>].
meta = meta.set_index('varname')

In [26]:
# Display the metadata table (indexed by variable name).
meta

Unnamed: 0_level_0,role,level,keep,dtype
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id,id,nominal,False,int64
target,target,binary,True,int64
ps_ind_01,input,ordinal,True,int64
ps_ind_02_cat,input,nominal,True,int64
ps_ind_03,input,ordinal,True,int64
ps_ind_04_cat,input,nominal,True,int64
ps_ind_05_cat,input,nominal,True,int64
ps_ind_06_bin,input,binary,True,int64
ps_ind_07_bin,input,binary,True,int64
ps_ind_08_bin,input,binary,True,int64


In [27]:
# Names of the nominal variables that are still flagged to be kept.
meta[(meta['level'] == 'nominal') & meta['keep']].index

Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_10_cat', 'ps_car_11_cat'],
      dtype='object', name='varname')

In [28]:
# Number of variables per (role, level) combination, as a flat table with a 'count' column.
meta.groupby(['role', 'level']).size().reset_index(name='count')

Unnamed: 0,role,level,count
0,id,nominal,1
1,input,binary,17
2,input,interval,10
3,input,nominal,14
4,input,ordinal,16
5,target,binary,1


In [29]:
# Number of variables per (role, level) combination, as a MultiIndex Series.
role_by_level = meta.groupby(['role', 'level'])['role']
role_by_level.size()

role    level   
id      nominal      1
input   binary      17
        interval    10
        nominal     14
        ordinal     16
target  binary       1
Name: role, dtype: int64

In [30]:
# Summary statistics of the kept interval-level variables.
v = meta[(meta['level'] == 'interval') & meta['keep']].index
train.loc[:, v].describe()

Unnamed: 0,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,0.610991,0.439184,0.551102,0.379945,0.813265,0.276256,3.065899,0.449756,0.449589,0.449849
std,0.287643,0.404264,0.793506,0.058327,0.224588,0.357154,0.731366,0.287198,0.286893,0.287153
min,0.0,0.0,-1.0,-1.0,0.250619,-1.0,0.0,0.0,0.0,0.0
25%,0.4,0.2,0.525,0.316228,0.670867,0.333167,2.828427,0.2,0.2,0.2
50%,0.7,0.3,0.720677,0.374166,0.765811,0.368782,3.316625,0.5,0.4,0.5
75%,0.9,0.6,1.0,0.4,0.90619,0.396485,3.605551,0.7,0.7,0.7
max,0.9,1.8,4.037945,1.264911,3.720626,0.636396,3.741657,0.9,0.9,0.9


- 선형 모델과 같이 스케일링이 필요한 모델의 경우 StandardScaler로 스케일링 해야한다.

- 트리 모델이나 XGBoost와 같은 모델은 스케일링 하지 않아도 된다.

- 모델의 다양성을 얻고 싶다면 스케일링을 하면 된다.

In [31]:
# Summary statistics of the kept ordinal-level variables.
v = meta[(meta['level'] == 'ordinal') & meta['keep']].index
train.loc[:, v].describe()

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_car_11,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,1.900378,4.423318,0.012451,7.299922,2.346072,2.372081,1.885886,7.689445,3.005823,9.225904,2.339034,8.43359,5.441382,1.441918,2.872288,7.539026
std,1.983789,2.699902,0.127545,3.546042,0.832548,1.117219,1.134927,1.334312,1.414564,1.459672,1.246949,2.904597,2.332871,1.202963,1.694887,2.746652
min,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,5.0,2.0,2.0,1.0,7.0,2.0,8.0,1.0,6.0,4.0,1.0,2.0,6.0
50%,1.0,4.0,0.0,7.0,3.0,2.0,2.0,8.0,3.0,9.0,2.0,8.0,5.0,1.0,3.0,7.0
75%,3.0,6.0,0.0,10.0,3.0,3.0,3.0,9.0,4.0,10.0,3.0,10.0,7.0,2.0,4.0,9.0
max,7.0,11.0,4.0,13.0,3.0,5.0,6.0,10.0,9.0,12.0,7.0,25.0,19.0,10.0,13.0,23.0


In [32]:
# Summary statistics of the kept binary variables (this includes the target).
v = meta[(meta['level'] == 'binary') & meta['keep']].index
train.loc[:, v].describe()

Unnamed: 0,target,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,0.036448,0.393742,0.257033,0.163921,0.185304,0.000373,0.001692,0.009439,0.000948,0.660823,0.121081,0.153446,0.122427,0.62784,0.554182,0.287182,0.349024,0.153318
std,0.187401,0.488579,0.436998,0.370205,0.388544,0.019309,0.041097,0.096693,0.030768,0.47343,0.326222,0.360417,0.327779,0.483381,0.497056,0.452447,0.476662,0.360295
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


- train data의 target을 보면 3.645%로 imbalanced!!

## Handling imbalanced classes

- 얼마나 1을 잘 맞추는가가 중요하다!

- 모두 0이라고 하면 정확도가 96.4%가 나오겠지만, 1을 맞출 확률은 0%!

- gini, rocauc를 쓰는 이유!

- 두 가지 방법이 있음

1. oversampling records with target == 1 // 4 -> 1을 대충 50개로 부풀리기 -- 좋았던 적이 없었다고 함...

2. undersampling records with target == 0 // 96 -> 0을 대충 50개로 줄이기

- **SMOTE 알고리즘**

- 같은 샘플을 12개씩 더 추가한다면 모델이 배울 정보가 늘어난 것은 아니기 때문에

- minor class(target == 1) 주변에 유사한 K-nearest를 찾아서 둘 사이의 어느 값으로 새로운 샘플을 만든다!

In [33]:
# Share of target == 1 records wanted in the training data after undersampling.
desired_apriori = 0.1

In [34]:
# Row labels of the negative (target == 0) and positive (target == 1) records.
idx_0 = train.loc[train['target'] == 0].index
idx_1 = train.loc[train['target'] == 1].index

In [35]:
# Class counts. The length of each index is already the number of rows, so there
# is no need to materialise the filtered frames with train.loc[...] just to count them.
nb_0 = len(idx_0)
nb_1 = len(idx_1)
nb_0, nb_1

(573518, 21694)

In [36]:
# target == 0인 것을 undersampling 하자!

In [37]:
# Fraction of target == 0 records to keep so that target == 1 makes up
# `desired_apriori` of the result, and the corresponding record count.
undersampling_rate = ((1 - desired_apriori) * nb_1) / (nb_0 * desired_apriori)
undersampled_nb_0 = int(undersampling_rate * nb_0)
for template, value in (
    ('Rate to undersample records with target == 0: {}', undersampling_rate),
    ('Number of records with target == 0 after undersampling: {}', undersampled_nb_0),
):
    print(template.format(value))

Rate to undersample records with target == 0: 0.34043569687437886
Number of records with target == 0 after undersampling: 195246


In [38]:
# Randomly select `undersampled_nb_0` of the target == 0 labels (fixed seed).
undersampled_idx = shuffle(idx_0, n_samples=undersampled_nb_0, random_state=37)

In [39]:
# Labels to keep: the sampled negatives followed by all positives.
idx_list = [*undersampled_idx, *idx_1]

In [40]:
# Keep only the selected rows and renumber them from 0.
train = train.loc[idx_list, :].reset_index(drop=True)

- Data Quality Checks

- Checking missing values

- Missing are represented as -1

In [41]:
# Missing values are stored as -1: count them per column and report every affected column.
vars_with_missing = []
n_rows = train.shape[0]

for f in train.columns:
    missings = (train[f] == -1).sum()
    if missings <= 0:
        continue
    vars_with_missing.append(f)
    missings_perc = missings / n_rows
    print('Variable {} has {} records ({:.2%}) with missing values'.format(f, missings, missings_perc))

print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))

Variable ps_ind_02_cat has 103 records (0.05%) with missing values
Variable ps_ind_04_cat has 51 records (0.02%) with missing values
Variable ps_ind_05_cat has 2256 records (1.04%) with missing values
Variable ps_reg_03 has 38580 records (17.78%) with missing values
Variable ps_car_01_cat has 62 records (0.03%) with missing values
Variable ps_car_02_cat has 2 records (0.00%) with missing values
Variable ps_car_03_cat has 148367 records (68.39%) with missing values
Variable ps_car_05_cat has 96026 records (44.26%) with missing values
Variable ps_car_07_cat has 4431 records (2.04%) with missing values
Variable ps_car_09_cat has 230 records (0.11%) with missing values
Variable ps_car_11 has 1 records (0.00%) with missing values
Variable ps_car_14 has 15726 records (7.25%) with missing values
In total, there are 12 variables with missing values


- ps_car_03_cat, ps_car_05_cat은 missing value가 너무 많아서 지우지만, 원래 함부로 지우면 안된다.

- missing value는 원래 함부로 채우면 안된다.(모델 학습 이후에 채우는 라이브러리도 있다.)

- ps_reg_03(continuous)은 평균값으로 채우겠다.

- ps_car_11(ordinal)은 최빈값으로 채우겠다.

- ps_car_12(continuous)은 평균값으로 채우겠다.

- ps_car_14(continuous)은 평균값으로 채우겠다.

In [42]:
# Mean target for each ps_car_03_cat value, including the -1 missing marker.
train.groupby('ps_car_03_cat')[['target']].mean()

Unnamed: 0_level_0,target
ps_car_03_cat,Unnamed: 1_level_1
-1,0.090654
0,0.106983
1,0.128862


- 위에서 보면 -1이 특별한 의미가 없기 때문에 지워도 된다고 판단 가능하다.

In [43]:
# Remove the variables with too many missing values and flag them as dropped in the metadata.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train = train.drop(columns=vars_to_drop)
meta.loc[vars_to_drop, 'keep'] = False

In [46]:
# Replace the -1 missing markers: column mean for the continuous variables,
# most frequent value for the ordinal ps_car_11. Each column is fitted on its own.
mean_imp = SimpleImputer(missing_values=-1, strategy='mean')
mode_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')

for mean_col in ('ps_reg_03', 'ps_car_12', 'ps_car_14'):
    # fit_transform returns an (n, 1) array; ravel() flattens it for column assignment.
    train[mean_col] = mean_imp.fit_transform(train[[mean_col]]).ravel()
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']]).ravel()

In [49]:
# fit_transform returns a 2-D array with a single column.
mean_imp.fit_transform(train[['ps_reg_03']]).shape

(216940, 1)

In [47]:
mean_imp.fit_transform(train[['ps_reg_03']]).ravel()  # ravel() flattens the (n, 1) result to 1-D

array([0.83815273, 0.72844011, 0.86926693, ..., 0.71195154, 0.57716982,
       1.09515981])

- Checking the cardinality of the categorical variables

- 원-핫 인코딩 좋지만 카테고리가 너무 많아지면 column이 너무 많아지는 문제 -> 중요한 feature를 놓칠 수도 있고, 시간이 오래걸릴 수 있는 등 문제가 발생한다.

- mean 인코딩 : target값을 알고 있을 때, 각 카테고리가 가지고 있는 target값의 평균을 취해서(여기서는 target일 확률) 카테고리 값을 결정한다. -> 약점은 과적합 가능성이 높다. -> 일부러 노이즈를 넣어준다.

    원-핫 인코딩보다는 mean 인코딩이 성능이 잘 나올때가 많다.

- frequency 인코딩 : 각 category의 출현 횟수(.size()) / 데이터 총 길이

- **인코딩은 다양하게 시도를 해봐서 최적화를 해야한다!! 다 하고 나서 ensemble 해도 무방하다!**

In [52]:
# Report how many distinct values each kept nominal variable takes.
v = meta[(meta['level'] == 'nominal') & meta['keep']].index

for f in v:
    dist_values = train[f].nunique()
    print('Varialbe {} has {} distinct values'.format(f, dist_values))

Varialbe ps_ind_02_cat has 5 distinct values
Varialbe ps_ind_04_cat has 3 distinct values
Varialbe ps_ind_05_cat has 8 distinct values
Varialbe ps_car_01_cat has 13 distinct values
Varialbe ps_car_02_cat has 3 distinct values
Varialbe ps_car_04_cat has 10 distinct values
Varialbe ps_car_06_cat has 18 distinct values
Varialbe ps_car_07_cat has 3 distinct values
Varialbe ps_car_08_cat has 2 distinct values
Varialbe ps_car_09_cat has 6 distinct values
Varialbe ps_car_10_cat has 3 distinct values
Varialbe ps_car_11_cat has 104 distinct values


In [53]:
# Cardinality of ps_car_11_cat. The column is named explicitly instead of going
# through the loop variable `f` left over from the previous cell, so this cell
# no longer depends on hidden kernel state.
train['ps_car_11_cat'].value_counts().shape[0]

104

In [64]:
# Frequency-encoding example: share of rows that fall in each ps_ind_02_cat category.
train.groupby('ps_ind_02_cat').size().div(len(train))

ps_ind_02_cat
-1    0.000475
 1    0.724274
 2    0.208689
 3    0.047525
 4    0.019038
dtype: float64

In [82]:
def add_noise(series, noise_level, random_state=None):
    """Multiply every value of ``series`` by ``1 + noise_level * z`` with ``z ~ N(0, 1)``.

    series : pd.Series of numeric values
    noise_level : scale of the relative Gaussian noise; 0 leaves the values numerically unchanged
    random_state : None (default) draws from NumPy's global random state, as before;
        an int seed or an np.random.RandomState instance makes the noise reproducible
    """
    if random_state is None:
        draws = np.random.randn(len(series))
    else:
        if not isinstance(random_state, np.random.RandomState):
            random_state = np.random.RandomState(random_state)
        draws = random_state.standard_normal(len(series))
    return series * (1 + noise_level * draws)


def _apply_encoding(series, encoding, prior, feature_name):
    """Look up the encoded value for every entry of ``series``; unmatched entries get ``prior``."""
    # `encoding` is indexed by category and its index carries the feature name,
    # so reset_index() yields the key column that the merge joins on.
    lookup = encoding.rename('average').reset_index()
    encoded = pd.merge(
        series.to_frame(series.name),
        lookup,
        on=series.name,
        how='left')['average'].rename(feature_name).fillna(prior)
    # pd.merge does not keep the index so restore it
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0,
                  random_state=None):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (must have the same name as trn_series)
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (number) : smoothing effect to balance categorical average vs prior
    noise_level (float) : scale of the multiplicative Gaussian noise applied to both outputs
    random_state : None (default) uses NumPy's global random state; an int seed or an
        np.random.RandomState makes the added noise reproducible

    Returns a tuple (encoded train series, encoded test series). Both are named
    ``<feature name>_mean`` and keep the index of their input series.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and number of training rows.
    averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean', 'count'])
    # Sigmoid weight in (0, 1): approaches 1 when a category has many more rows than
    # min_samples_leaf. Kept in its own name so the `smoothing` argument is not shadowed.
    weight = 1 / (1 + np.exp(-(averages['count'] - min_samples_leaf) / smoothing))
    # Global target mean: blended in for small categories and used for unseen ones.
    prior = target.mean()
    encoding = prior * (1 - weight) + averages['mean'] * weight

    feature_name = trn_series.name + '_mean'
    ft_trn_series = _apply_encoding(trn_series, encoding, prior, feature_name)
    ft_tst_series = _apply_encoding(tst_series, encoding, prior, feature_name)

    # One shared generator for both calls, so a seed gives reproducible noise
    # without the train and test draws repeating the same sequence.
    rng = random_state
    if rng is not None and not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)
    return add_noise(ft_trn_series, noise_level, rng), add_noise(ft_tst_series, noise_level, rng)

In [83]:
# Target-encode ps_car_11_cat (104 distinct values), then replace the raw
# column with the encoded one in both frames and update the metadata.
train_encoded, test_encoded = target_encode(
    trn_series=train['ps_car_11_cat'],
    tst_series=test['ps_car_11_cat'],
    target=train.target,
    min_samples_leaf=100,
    smoothing=10,
    noise_level=0.01,
)

train['ps_car_11_cat_te'] = train_encoded
test['ps_car_11_cat_te'] = test_encoded

train = train.drop(columns='ps_car_11_cat')
test = test.drop(columns='ps_car_11_cat')
meta.loc['ps_car_11_cat', 'keep'] = False  # the raw column is no longer used

## EDA