# 1. Primary data analysis


## 1.1 Connecting libraries and importing data


In [1]:
%run "../../Oleksandr Zakharchuk Handbook.ipynb"

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

In [3]:
#!pip install category_encoders

In [4]:
df_xp = pd.read_csv('Mushroom Data Set/Data Folder/expanded.data', header=None, names=[
    "Classification",
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises?",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat"
])

## 1.2 General information


In [5]:
df_xp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8416 entries, 0 to 8415
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Classification            8416 non-null   object
 1   cap-shape                 8416 non-null   object
 2   cap-surface               8416 non-null   object
 3   cap-color                 8416 non-null   object
 4   bruises?                  8416 non-null   object
 5   odor                      8416 non-null   object
 6   gill-attachment           8416 non-null   object
 7   gill-spacing              8416 non-null   object
 8   gill-size                 8416 non-null   object
 9   gill-color                8416 non-null   object
 10  stalk-shape               8416 non-null   object
 11  stalk-root                8416 non-null   object
 12  stalk-surface-above-ring  8416 non-null   object
 13  stalk-surface-below-ring  8416 non-null   object
 14  stalk-color-above-ring  

In [6]:
df_xp

Unnamed: 0,Classification,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
3,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
4,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES
8412,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES
8413,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES
8414,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES


Classification column is a target, everything else is a feature


All columns are categorical


## 2. Type conversion and value adjustment


In [7]:
get_count_by_column(df_xp, 0)
get_count_by_column(df_xp, np.nan)
get_count_by_column(df_xp, None)
get_count_by_column(df_xp, [np.inf, -np.inf])

Search 0 value:  {}
Search nan value:  {}
Search None value:  {}
Search [inf, -inf] value:  {}


All data is correct


# 3. Coding 

## 3.1 Encoders without target binding

### 3.1.1 OneHotEncoder

In [8]:
enc = OneHotEncoder(handle_unknown='ignore')
X = df_xp['bruises?']

In [9]:
enc.fit(X)

In [10]:
enc.transform(X)

Unnamed: 0,bruises?_1,bruises?_2
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
8411,0,1
8412,0,1
8413,0,1
8414,0,1


We have two columns: in the 'bruises?_1' column, the unit corresponds to the BRUISES value, and in the 'bruises?_2' column, the unit corresponds to the NO value


In [11]:
enc.get_feature_names_out()

array(['bruises?_1', 'bruises?_2'], dtype=object)

### 3.1.2 LabelEncoder

#### 3.1.1 Encode one column


In [12]:
label = LabelEncoder()
df_le = df_xp.copy()
df_le['bruises?_le'] = label.fit_transform(df_le['bruises?'])
df_le

Unnamed: 0,Classification,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,bruises?_le
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,0
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,0
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,0
3,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,0
4,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,...,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES,1
8412,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES,1
8413,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES,1
8414,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES,1


We see a new column 'bruises?_le' at the end


#### 3.1.1 Encode the entire dataset


In [13]:
le_df_xp = df_xp.apply(LabelEncoder().fit_transform)
le_df_xp

Unnamed: 0,Classification,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,6,4,6
1,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,1,4,6
2,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,6,4,6
3,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,1,4,6
4,0,2,3,8,0,0,1,1,1,1,...,3,7,7,0,2,1,4,6,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,0,4,3,0,1,6,0,0,0,1,...,3,4,4,0,0,1,4,1,1,1
8412,0,4,3,0,1,6,0,0,0,1,...,3,4,4,0,0,1,4,5,4,1
8413,0,4,3,0,1,6,0,0,0,1,...,3,4,4,0,0,1,4,5,1,1
8414,0,4,3,0,1,6,0,0,0,1,...,3,4,4,0,0,1,4,2,4,1


### 3.1.3 get_dummies

In [14]:
dummies = pd.get_dummies(df_xp, columns=['bruises?'], sparse=True)
dummies

Unnamed: 0,Classification,cap-shape,cap-surface,cap-color,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,bruises?_BRUISES,bruises?_NO
0,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,1,0
1,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,1,0
2,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,1,0
3,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,1,0
4,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,BROWN,TAPERING,...,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES,0,1
8412,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES,0,1
8413,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES,0,1
8414,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES,0,1


We have two new columns: 'bruises?_BRUISES' and 'bruises?_NO'. If you set drop_first=True then the 'bruises?_BRUISES' column will be dropped


### 3.1.4 SumEncoder

Let's take a look at the number of unique values ​​for the 'bruises?' column.


In [15]:
df_xp['bruises?'].value_counts()

NO         5040
BRUISES    3376
Name: bruises?, dtype: int64

Only two unique values


In [16]:
SE_encoder = SumEncoder(['bruises?'])
df_se = SE_encoder.fit_transform(df_xp[['bruises?']], df_xp['Classification'])
df_se



Unnamed: 0,intercept,bruises?_0
0,1,1.0
1,1,1.0
2,1,1.0
3,1,1.0
4,1,1.0
...,...,...
8411,1,-1.0
8412,1,-1.0
8413,1,-1.0
8414,1,-1.0


### 3.1.5 HelmertEncoder

In [17]:
HE_encoder = HelmertEncoder(['bruises?'])  # OHE and Sum Encoding.
df_he = HE_encoder.fit_transform(df_xp[['bruises?']], df_xp['Classification'])
df_he



Unnamed: 0,intercept,bruises?_0
0,1,-1.0
1,1,-1.0
2,1,-1.0
3,1,-1.0
4,1,-1.0
...,...,...
8411,1,1.0
8412,1,1.0
8413,1,1.0
8414,1,1.0


## 3.2 Target-bound encoders


### 3.2.1 Target preparation


Pre-encode the target:


In [18]:
label_target_df_xp = df_xp.copy()

In [19]:
label_target = LabelEncoder()
label_target_df_xp['Classification'] = label_target.fit_transform(label_target_df_xp['Classification'])

### 3.2.2 TargetEncoder

Let's start coding the TargetEncoder:


In [20]:
TE_encoder = TargetEncoder()
df_te = TE_encoder.fit_transform(label_target_df_xp[['bruises?']], label_target_df_xp['Classification'])
df_te

Unnamed: 0,bruises?
0,0.184834
1,0.184834
2,0.184834
3,0.184834
4,0.184834
...,...
8411,0.655556
8412,0.655556
8413,0.655556
8414,0.655556


### 3.2.3 MEstimateEncoder

In [21]:
MEE_encoder = MEstimateEncoder()  # Simplified version of Target Encoder
df_mee = MEE_encoder.fit_transform(label_target_df_xp[['bruises?']], label_target_df_xp['Classification'])
df_mee

Unnamed: 0,bruises?
0,0.184918
1,0.184918
2,0.184918
3,0.184918
4,0.184918
...,...
8411,0.655518
8412,0.655518
8413,0.655518
8414,0.655518


### 3.2.4 WOEEncoder

In [22]:
WOE_encoder = WOEEncoder()   # Weight Of Evidence is a commonly used target-based encoder in credit scoring.
df_woe = WOE_encoder.fit_transform(label_target_df_xp[['bruises?']], label_target_df_xp['Classification']) # for binary classification only
df_woe

Unnamed: 0,bruises?
0,-1.349482
1,-1.349482
2,-1.349482
3,-1.349482
4,-1.349482
...,...
8411,0.776490
8412,0.776490
8413,0.776490
8414,0.776490


### 3.2.5 JamesSteinEncoder

In [23]:
JSE_encoder = JamesSteinEncoder()
df_jse = JSE_encoder.fit_transform(label_target_df_xp[['bruises?']], label_target_df_xp['Classification'])
df_jse

Unnamed: 0,bruises?
0,0.184834
1,0.184834
2,0.184834
3,0.184834
4,0.184834
...,...
8411,0.655556
8412,0.655556
8413,0.655556
8414,0.655556


### 3.2.6 LeaveOneOutEncoder

In [24]:
LOOE_encoder = LeaveOneOutEncoder()
df_looe = LOOE_encoder.fit_transform(label_target_df_xp[['bruises?']], label_target_df_xp['Classification'])
df_looe

Unnamed: 0,bruises?
0,0.184889
1,0.184889
2,0.184889
3,0.184889
4,0.184889
...,...
8411,0.655686
8412,0.655686
8413,0.655686
8414,0.655686


### 3.2.7 CatBoostEncoder

In [25]:
CBE_encoder = CatBoostEncoder()
df_cbe = CBE_encoder.fit_transform(label_target_df_xp[['bruises?']], label_target_df_xp['Classification'])
df_cbe.head(100)

Unnamed: 0,bruises?
0,0.466730
1,0.233365
2,0.155577
3,0.116683
4,0.093346
...,...
95,0.004862
96,0.004812
97,0.004763
98,0.004714
