In [62]:
import pandas as pd
import numpy as np

# 1. Load the "ModifiedEdibleMushroom.csv" data from the link below (note: this data set has already been preliminarily prepared).

In [63]:
# Read the prepared mushroom dataset directly from the course repository.
data = pd.read_csv(
    'https://raw.githubusercontent.com/pvateekul/2110446_DSDE_2023s2/main/code/Week03_ML/mushroom2020_dataset.csv'
)

In [64]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0


In [65]:
# Count the missing values in each column of the raw data.
data.isnull().sum()

id                               0
label                           60
cap-shape                        0
cap-surface                     27
bruises                         99
odor                            99
gill-attachment                 99
gill-spacing                   130
gill-size                      121
stalk-shape                    121
stalk-root                      31
stalk-surface-above-ring        31
stalk-surface-below-ring        31
veil-type                       62
ring-number                     62
ring-type                       62
spore-print-color               56
population                      56
habitat                         31
cap-color-rate                  27
gill-color-rate                121
veil-color-rate                 62
stalk-color-above-ring-rate     31
stalk-color-below-ring-rate     62
dtype: int64

# 2. Drop rows where the target (label) variable is missing.

In [66]:
# Remove every row whose 'label' is missing.
# inplace=True mutates `data` directly instead of returning a new frame.
data.dropna(subset=['label'], inplace=True)

# 3. Drop the following variables

In [67]:
# Columns the exercise asks to discard: the row identifier plus the gill, stalk
# and veil attributes listed below.
columns_to_drop = [
    'id',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color-rate',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring-rate',
    'stalk-color-below-ring-rate',
    'veil-color-rate',
    'veil-type',
]
data = data.drop(columns=columns_to_drop)

In [68]:
# Confirm which columns remain after the drop.
data.columns

Index(['label', 'cap-shape', 'cap-surface', 'bruises', 'odor', 'stalk-shape',
       'ring-number', 'ring-type', 'spore-print-color', 'population',
       'habitat', 'cap-color-rate'],
      dtype='object')

# 4. Examine the number of rows, the number of columns, and whether any values are missing.

In [69]:
# (number of rows, number of columns) at this point in the pipeline.
data.shape

(5764, 12)

In [70]:
# Per-column dtype and non-null count; columns whose count is below the row
# total still contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5737 non-null   object 
 3   bruises            5665 non-null   object 
 4   odor               5665 non-null   object 
 5   stalk-shape        5643 non-null   object 
 6   ring-number        5702 non-null   object 
 7   ring-type          5702 non-null   object 
 8   spore-print-color  5708 non-null   object 
 9   population         5708 non-null   object 
 10  habitat            5733 non-null   object 
 11  cap-color-rate     5737 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB


# 5. Fill missing values with the mean for numeric variables and the mode for nominal variables.

In [71]:
# numeric variable: replace missing entries with the column mean
cap_color_mean = data['cap-color-rate'].mean()
data['cap-color-rate'] = data['cap-color-rate'].fillna(cap_color_mean)

In [72]:
# nominal variables
# data.mode() returns one row per tied mode; row 0 holds the first mode of
# every column, which fillna then applies column by column.
mode_values = data.mode().iloc[0]
data = data.fillna(mode_values)

In [73]:
# Verify that no missing values remain after imputation.
data.isnull().sum()

label                0
cap-shape            0
cap-surface          0
bruises              0
odor                 0
stalk-shape          0
ring-number          0
ring-type            0
spore-print-color    0
population           0
habitat              0
cap-color-rate       0
dtype: int64

# 6. Convert the label variable: e (edible) to 1 and p (poisonous) to 0, then check the class counts (class 0 : class 1).

In [74]:
def convert_label(val):
  """Encode a mushroom label as an integer.

  Returns 1 when ``val`` equals 'e' (edible); every other value, including
  'p' (poisonous), is encoded as 0.
  """
  return 1 if val == 'e' else 0

In [75]:
# Encode the target element-wise. NOTE: once the column holds 0/1, running this
# cell again would turn every value into 0, because neither 0 nor 1 equals 'e'.
data['label'] = data['label'].map(convert_label)

In [76]:
# Class balance after encoding (0 = poisonous, 1 = edible).
data.label.value_counts()

label
0    3660
1    2104
Name: count, dtype: int64

# 7. Convert the nominal variables to numeric using dummy coding with drop_first=True.

In [77]:
# Inspect the frame before the nominal columns are dummy-encoded.
data

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,0,x,s,t,p,e,o,p,k,s,u,1.0
1,1,x,s,t,a,e,o,p,n,n,g,2.0
2,1,b,s,t,l,e,o,p,n,n,m,3.0
3,0,x,y,t,p,e,o,p,k,s,u,3.0
4,1,x,s,f,n,t,o,e,n,a,g,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5819,1,k,s,f,n,e,o,p,b,c,l,1.0
5820,1,x,s,f,n,e,o,p,b,v,l,1.0
5821,1,f,s,f,n,e,o,p,b,c,l,1.0
5822,0,k,y,f,y,t,o,e,w,v,l,1.0


In [78]:
# Everything except the target and the numeric feature is treated as nominal.
non_nominal = ['label', 'cap-color-rate']
selected_columns = data.loc[:, [col for col in data.columns if col not in non_nominal]]

In [79]:
# Dummy-encode the nominal columns (dropping the first level of each), attach
# the indicator columns to the frame, then remove the original nominal columns.
nominal_cols = list(selected_columns.columns)
dummy_data = pd.get_dummies(data[nominal_cols], drop_first=True)
data_with_dummy = data.join(dummy_data)
data = data_with_dummy.drop(columns=nominal_cols)
print(data.shape)

(5764, 43)


In [80]:
# Inspect the frame after dummy encoding.
data

Unnamed: 0,label,cap-color-rate,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,bruises_t,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1.0,False,False,False,True,False,True,False,True,...,False,True,False,False,False,False,False,False,True,False
1,1,2.0,False,False,False,True,False,True,False,True,...,True,False,False,False,True,False,False,False,False,False
2,1,3.0,False,False,False,False,False,True,False,True,...,True,False,False,False,False,False,True,False,False,False
3,0,3.0,False,False,False,True,False,False,True,True,...,False,True,False,False,False,False,False,False,True,False
4,1,4.0,False,False,False,True,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819,1,1.0,False,False,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
5820,1,1.0,False,False,False,True,False,True,False,False,...,False,False,True,False,False,True,False,False,False,False
5821,1,1.0,False,True,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
5822,0,1.0,False,False,True,False,False,False,True,False,...,False,False,True,False,False,True,False,False,False,False


In [81]:
# List the final feature columns produced by the encoding step.
data.columns

Index(['label', 'cap-color-rate', 'cap-shape_c', 'cap-shape_f', 'cap-shape_k',
       'cap-shape_x', 'cap-surface_g', 'cap-surface_s', 'cap-surface_y',
       'bruises_t', 'odor_c', 'odor_f', 'odor_l', 'odor_m', 'odor_n', 'odor_p',
       'odor_s', 'odor_y', 'stalk-shape_t', 'ring-number_o', 'ring-number_t',
       'ring-type_f', 'ring-type_l', 'ring-type_n', 'ring-type_p',
       'spore-print-color_h', 'spore-print-color_k', 'spore-print-color_n',
       'spore-print-color_o', 'spore-print-color_r', 'spore-print-color_w',
       'spore-print-color_y', 'population_c', 'population_n', 'population_s',
       'population_v', 'population_y', 'habitat_g', 'habitat_l', 'habitat_m',
       'habitat_p', 'habitat_u', 'habitat_w'],
      dtype='object')

# 8. Split train/test with 20% test, stratify, and seed = 2020.

In [82]:
from sklearn.model_selection import train_test_split

# NOTE: pop() removes 'label' from `data` in place and X is the very same object
# as `data`, so this cell cannot be run twice without re-running the cells above.
y = data.pop('label')
X = data

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2020,
)

In [83]:
# Number of rows in the training split.
len(X_train)

4611

In [84]:
# Number of rows in the test split.
len(X_test)

1153

# 9. Create a Random Forest with GridSearchCV on the training data using 5-fold cross-validation.

In [85]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters. random_state is a single-value entry, so the same
# seed is used for every candidate and it is reported as part of best_params_.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 6],
    min_samples_leaf=[2, 5, 10],
    n_estimators=[100, 200],
    random_state=[2020],
)

random_forest = RandomForestClassifier()
grid_search = GridSearchCV(random_forest, param_grid, cv=5)

# Search the whole grid with 5-fold cross-validation on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print(best_params)

{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 2, 'n_estimators': 100, 'random_state': 2020}


# 10. Predict on the test set and evaluate with confusion_matrix and classification_report.

In [86]:
from sklearn.metrics import classification_report,confusion_matrix

# GridSearchCV refits the best parameter combination on the full training split
# by default (refit=True) and exposes it as best_estimator_. Reuse that fitted
# forest instead of constructing and training an identical one a second time.
best_rf = grid_search.best_estimator_

# Score the held-out test split and print per-class precision/recall/F1.
y_pred = best_rf.predict(X_test)
print(classification_report(y_test,y_pred,digits=4))

              precision    recall  f1-score   support

           0     0.9932    0.9986    0.9959       732
           1     0.9976    0.9881    0.9928       421

    accuracy                         0.9948      1153
   macro avg     0.9954    0.9934    0.9944      1153
weighted avg     0.9948    0.9948    0.9948      1153



In [87]:
# Confusion matrix on the test split; labels=[0,1] pins the row/column order
# to class 0 first, then class 1.
print(confusion_matrix(y_test,y_pred,labels=[0,1]))

[[731   1]
 [  5 416]]
