In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy.stats import kurtosis, skew
import math
import random
import time
import datetime

In [2]:
# Location of the raw CSV; pandas fetches it over HTTPS when given a URL.
DATA_URL = 'https://github.com/kaopanboonyuen/Python-Data-Science/raw/master/Dataset/hed2020_dataset.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Dimensions of the frame as loaded, shown as (rows, columns).
df.shape

(5824, 24)

In [4]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

id                               0
label                           60
cap-shape                        0
cap-surface                     27
bruises                         99
odor                            99
gill-attachment                 99
gill-spacing                   130
gill-size                      121
stalk-shape                    121
stalk-root                      31
stalk-surface-above-ring        31
stalk-surface-below-ring        31
veil-type                       62
ring-number                     62
ring-type                       62
spore-print-color               56
population                      56
habitat                         31
cap-color-rate                  27
gill-color-rate                121
veil-color-rate                 62
stalk-color-above-ring-rate     31
stalk-color-below-ring-rate     62
dtype: int64

In [5]:
# Keep only the rows that have a value in the 'label' column; rows with a
# missing label are discarded. The resulting dimensions are displayed.
df = df.dropna(subset=['label'])
df.shape

(5764, 24)

In [6]:
# Columns removed from the frame before any further processing.
drop_list = [
    'id',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color-rate',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring-rate',
    'stalk-color-below-ring-rate',
    'veil-color-rate',
    'veil-type',
]
df = df.drop(columns=drop_list)

In [7]:
# Dimensions after the column drop, shown as (rows, columns).
df.shape

(5764, 12)

In [8]:
# Missing-value count for each of the remaining columns.
df.isna().sum()

label                  0
cap-shape              0
cap-surface           27
bruises               99
odor                  99
stalk-shape          121
ring-number           62
ring-type             62
spore-print-color     56
population            56
habitat               31
cap-color-rate        27
dtype: int64

In [9]:
# Impute missing values in two passes.
#
# Pass 1: numeric columns receive their column mean. numeric_only=True is
# required because the frame still contains string-valued columns; without it
# pandas emits a FutureWarning on older versions and raises TypeError on
# pandas >= 2.0.
# The result is reassigned instead of using inplace=True so the cell does not
# mutate an object that an earlier cell produced.
df = df.fillna(df.mean(numeric_only=True))

# Pass 2: anything still missing (the non-numeric columns) receives that
# column's most frequent value; .iloc[0] selects the first row of the mode
# table, i.e. the first mode when several values tie.
# NOTE(review): both statistics are computed on the full frame, before the
# train/test split that happens further down the notebook.
df = df.fillna(df.mode().iloc[0])

  df.fillna(df.mean(), inplace=True)


In [10]:
# Re-count missing values per column after imputation.
df.isna().sum()

label                0
cap-shape            0
cap-surface          0
bruises              0
odor                 0
stalk-shape          0
ring-number          0
ring-type            0
spore-print-color    0
population           0
habitat              0
cap-color-rate       0
dtype: int64

In [11]:
# Encode the target column as integers: "e" -> 1, "p" -> 0.
# Series.map followed by an explicit astype replaces DataFrame.replace, which
# relied on implicit object -> int downcasting (deprecated in pandas 2.2;
# without it the column would remain object dtype).
# Any label other than "e"/"p" maps to NaN, so the int64 cast then raises
# instead of letting an unexpected value pass through silently.
df = df.assign(label=df["label"].map({"e": 1, "p": 0}).astype("int64"))

In [12]:
# With the label encoded as 0/1, the column sum is the number of rows labelled 1
# and the remainder of the non-null count is the number of rows labelled 0.
n_labelled = df['label'].count()
n_ones = df['label'].sum()
print(f"p:{n_labelled - n_ones}")
print(f"e:{n_ones}")

p:3660
e:2104


In [13]:
# Print the per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   int64  
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5764 non-null   object 
 3   bruises            5764 non-null   object 
 4   odor               5764 non-null   object 
 5   stalk-shape        5764 non-null   object 
 6   ring-number        5764 non-null   object 
 7   ring-type          5764 non-null   object 
 8   spore-print-color  5764 non-null   object 
 9   population         5764 non-null   object 
 10  habitat            5764 non-null   object 
 11  cap-color-rate     5764 non-null   float64
dtypes: float64(1), int64(1), object(10)
memory usage: 585.4+ KB


In [14]:
# Categorical (string-valued) feature columns to be one-hot encoded.
nominal_columns = [
    "cap-shape",
    "cap-surface",
    "bruises",
    "odor",
    "stalk-shape",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
]

# One indicator column per category level; drop_first=True omits the first
# level of every column.
dummy_df = pd.get_dummies(df[nominal_columns], drop_first=True)

# Final modelling table: the columns that were not encoded, followed by the
# indicator columns (raw categorical columns are left out).
remaining_df = df.drop(nominal_columns, axis=1)
tw_dummy_df = pd.concat([remaining_df, dummy_df], axis=1)
print(tw_dummy_df.shape)

(5764, 43)


In [15]:
from sklearn.model_selection import train_test_split
# Seed Python's built-in RNG; the split below is additionally pinned through
# its own random_state argument.
random.seed(2020)

# pop() removes 'label' from tw_dummy_df in place, so X (the very same frame
# object) is left holding only feature columns. Because of that mutation this
# cell raises KeyError if it is run a second time without rebuilding
# tw_dummy_df first.
y = tw_dummy_df.pop('label')
X = tw_dummy_df

# 80/20 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2020,
)

In [16]:
print(X_train.shape, X_test.shape)

(4611, 42) (1153, 42)


In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Base estimator handed to the grid search.
rfc = RandomForestClassifier(random_state=2020)

# Search space: 2 * 3 * 3 * 2 * 1 = 36 parameter combinations.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 6],
    min_samples_leaf=[2, 5, 10],
    n_estimators=[100, 200],
    random_state=[2020],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split only.
CV_rfc = GridSearchCV(rfc, param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

In [18]:
# Parameter combination selected by the grid search.
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 6,
 'min_samples_leaf': 2,
 'n_estimators': 100,
 'random_state': 2020}

In [26]:
# Take the estimator the grid search selected and predict labels for the
# held-out test features.
model = CV_rfc.best_estimator_
y_pred = model.predict(X_test)

In [31]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Rows are true classes, columns are predicted classes, both in the order [0, 1].
print(confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1]))

[[731   1]
 [  5 416]]


In [32]:
# Per-class precision / recall / F1 on the test split, printed to 4 decimals.
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9932    0.9986    0.9959       732
           1     0.9976    0.9881    0.9928       421

    accuracy                         0.9948      1153
   macro avg     0.9954    0.9934    0.9944      1153
weighted avg     0.9948    0.9948    0.9948      1153

