In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Read the mushroom data set (CSV served from raw.githubusercontent.com) into a DataFrame.
df = pd.read_csv(
	filepath_or_buffer='https://raw.githubusercontent.com/pvateekul/2110446_DSDE_2023s2/main/code/Week03_ML/mushroom2020_dataset.csv',
)

In [3]:
# Inspect the column dtypes and non-null counts of the freshly loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5824 entries, 0 to 5823
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5824 non-null   int64  
 1   label                        5764 non-null   object 
 2   cap-shape                    5824 non-null   object 
 3   cap-surface                  5797 non-null   object 
 4   bruises                      5725 non-null   object 
 5   odor                         5725 non-null   object 
 6   gill-attachment              5725 non-null   object 
 7   gill-spacing                 5694 non-null   object 
 8   gill-size                    5703 non-null   object 
 9   stalk-shape                  5703 non-null   object 
 10  stalk-root                   5793 non-null   object 
 11  stalk-surface-above-ring     5793 non-null   object 
 12  stalk-surface-below-ring     5793 non-null   object 
 13  veil-type         

In [4]:
# Discard every row whose target value (label) is missing; all other columns
# may still contain gaps at this point.
df = df.dropna(axis='index', how='any', subset=['label'])

In [5]:
# Confirm that the label column no longer contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5764 non-null   int64  
 1   label                        5764 non-null   object 
 2   cap-shape                    5764 non-null   object 
 3   cap-surface                  5737 non-null   object 
 4   bruises                      5665 non-null   object 
 5   odor                         5665 non-null   object 
 6   gill-attachment              5665 non-null   object 
 7   gill-spacing                 5634 non-null   object 
 8   gill-size                    5643 non-null   object 
 9   stalk-shape                  5643 non-null   object 
 10  stalk-root                   5733 non-null   object 
 11  stalk-surface-above-ring     5733 non-null   object 
 12  stalk-surface-below-ring     5733 non-null   object 
 13  veil-type              

In [6]:
# Drop irrelevant variables: the row id plus the gill, stalk and veil columns listed below.
df = df.drop(
	columns=[
		'id',
		'gill-attachment',
		'gill-spacing',
		'gill-size',
		'gill-color-rate',
		'stalk-root',
		'stalk-surface-above-ring',
		'stalk-surface-below-ring',
		'stalk-color-above-ring-rate',
		'stalk-color-below-ring-rate',
		'veil-color-rate',
		'veil-type',
	]
)

In [7]:
# Check which columns remain and how many values each is still missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5737 non-null   object 
 3   bruises            5665 non-null   object 
 4   odor               5665 non-null   object 
 5   stalk-shape        5643 non-null   object 
 6   ring-number        5702 non-null   object 
 7   ring-type          5702 non-null   object 
 8   spore-print-color  5708 non-null   object 
 9   population         5708 non-null   object 
 10  habitat            5733 non-null   object 
 11  cap-color-rate     5737 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB


In [8]:
# Fill missing feature values: the mean for numeric variables and the mode
# (most frequent value) for nominal variables.
# NOTE(review): the imputers are fitted on the whole data set, i.e. before the
# train/test split further down, so test rows contribute to the fill values.
# Fit on the training portion only if that leakage matters for the evaluation.
meanImputer = SimpleImputer(strategy='mean')
modeImputer = SimpleImputer(strategy='most_frequent')

# The target is deliberately skipped: a missing label has to be dropped, never
# replaced by the majority class.
for col in [c for c in df.columns if c != 'label']:
	# is_numeric_dtype is used instead of comparing against 'object' so that
	# string/categorical columns stored under a non-object dtype are still
	# routed to the mode imputer rather than the mean imputer.
	if pd.api.types.is_numeric_dtype(df[col]):
		df[[col]] = meanImputer.fit_transform(df[[col]])
	else:
		df[[col]] = modeImputer.fit_transform(df[[col]])

In [9]:
# Verify that every column is fully populated after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5764 non-null   object 
 3   bruises            5764 non-null   object 
 4   odor               5764 non-null   object 
 5   stalk-shape        5764 non-null   object 
 6   ring-number        5764 non-null   object 
 7   ring-type          5764 non-null   object 
 8   spore-print-color  5764 non-null   object 
 9   population         5764 non-null   object 
 10  habitat            5764 non-null   object 
 11  cap-color-rate     5764 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB


In [10]:
# Convert the label variable: e (edible) -> 1 and p (poisonous) -> 0.
# Series.map is used instead of Series.replace because replacing strings with
# ints only yields an integer column through silent object->int downcasting,
# which pandas 2.2 deprecates (FutureWarning). Without that downcast the label
# would stay object-typed and be one-hot encoded by get_dummies later on.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run.
encoded_label = df['label'].map({'e': 1, 'p': 0, 1: 1, 0: 0})

# map() produces NaN for any value outside the mapping; stop here instead of
# training on an unknown class.
if encoded_label.isna().any():
	raise ValueError(
		"Unexpected label values: "
		+ repr(sorted(df.loc[encoded_label.isna(), 'label'].astype(str).unique()))
	)

df['label'] = encoded_label.astype('int64')

df.head()

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,0,x,s,t,p,e,o,p,k,s,u,1.0
1,1,x,s,t,a,e,o,p,n,n,g,2.0
2,1,b,s,t,l,e,o,p,n,n,m,3.0
3,0,x,y,t,p,e,o,p,k,s,u,3.0
4,1,x,s,f,n,t,o,e,n,a,g,4.0


In [11]:
# Class balance of the encoded target (1 = edible, 0 = poisonous).
df['label'].value_counts()

label
0    3660
1    2104
Name: count, dtype: int64

In [12]:
# Dummy-code every nominal variable. With drop_first=True the first level of
# each variable gets no indicator column of its own.
df = pd.get_dummies(data=df, drop_first=True)

df.head()

Unnamed: 0,label,cap-color-rate,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,bruises_t,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1.0,False,False,False,True,False,True,False,True,...,False,True,False,False,False,False,False,False,True,False
1,1,2.0,False,False,False,True,False,True,False,True,...,True,False,False,False,True,False,False,False,False,False
2,1,3.0,False,False,False,False,False,True,False,True,...,True,False,False,False,False,False,True,False,False,False
3,0,3.0,False,False,False,True,False,False,True,True,...,False,True,False,False,False,False,False,False,True,False
4,1,4.0,False,False,False,True,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False


In [13]:
# Inspect the columns and dtypes of the dummy-encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   label                5764 non-null   int64  
 1   cap-color-rate       5764 non-null   float64
 2   cap-shape_c          5764 non-null   bool   
 3   cap-shape_f          5764 non-null   bool   
 4   cap-shape_k          5764 non-null   bool   
 5   cap-shape_x          5764 non-null   bool   
 6   cap-surface_g        5764 non-null   bool   
 7   cap-surface_s        5764 non-null   bool   
 8   cap-surface_y        5764 non-null   bool   
 9   bruises_t            5764 non-null   bool   
 10  odor_c               5764 non-null   bool   
 11  odor_f               5764 non-null   bool   
 12  odor_l               5764 non-null   bool   
 13  odor_m               5764 non-null   bool   
 14  odor_n               5764 non-null   bool   
 15  odor_p               5764 non-null   bool  

In [16]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# label and seeded so that it is repeatable.
train, test = train_test_split(
	df,
	test_size=0.2,
	random_state=2020,
	stratify=df['label'],
)

train.info()

test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4611 entries, 3526 to 2942
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   label                4611 non-null   int64  
 1   cap-color-rate       4611 non-null   float64
 2   cap-shape_c          4611 non-null   bool   
 3   cap-shape_f          4611 non-null   bool   
 4   cap-shape_k          4611 non-null   bool   
 5   cap-shape_x          4611 non-null   bool   
 6   cap-surface_g        4611 non-null   bool   
 7   cap-surface_s        4611 non-null   bool   
 8   cap-surface_y        4611 non-null   bool   
 9   bruises_t            4611 non-null   bool   
 10  odor_c               4611 non-null   bool   
 11  odor_f               4611 non-null   bool   
 12  odor_l               4611 non-null   bool   
 13  odor_m               4611 non-null   bool   
 14  odor_n               4611 non-null   bool   
 15  odor_p               4611 non-null   boo

In [17]:
# Class proportions of the training set, then of the test set, to confirm the
# stratified split kept them aligned.
print(
	train['label'].value_counts(normalize=True),
	test['label'].value_counts(normalize=True),
	sep='\n',
)

label
0    0.635003
1    0.364997
Name: proportion, dtype: float64
label
0    0.634866
1    0.365134
Name: proportion, dtype: float64


In [18]:
# create random forest with grid search on training data with 5 CV
param_grid = {
	'criterion': ['gini', 'entropy'],
	'max_depth': [2, 3, 6],
	'min_samples_leaf': [2, 5, 10],
	'n_estimators': [100, 200],
	'random_state': [2020]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

grid_search.fit(train.drop(columns=['label']), train['label'])

grid_search.best_params_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


{'criterion': 'gini',
 'max_depth': 6,
 'min_samples_leaf': 2,
 'n_estimators': 100,
 'random_state': 2020}

In [19]:
# Score the held-out test set with the fitted grid search, then report the
# confusion matrix followed by the per-class classification report.
y_pred = grid_search.predict(test.drop('label', axis=1))

print(confusion_matrix(y_true=test['label'], y_pred=y_pred))

print(classification_report(y_true=test['label'], y_pred=y_pred))

[[731   1]
 [  5 416]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       732
           1       1.00      0.99      0.99       421

    accuracy                           0.99      1153
   macro avg       1.00      0.99      0.99      1153
weighted avg       0.99      0.99      0.99      1153

