## 1. Load "ModifiedEdibleMushroom.csv" data from the link below (note: this data set has been preliminarily prepared.).

In [1]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
df = pd.read_csv('mushroom2020_dataset.csv')

In [3]:
df.head()

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0


In [4]:
df.isnull().sum()

id                               0
label                           60
cap-shape                        0
cap-surface                     27
bruises                         99
odor                            99
gill-attachment                 99
gill-spacing                   130
gill-size                      121
stalk-shape                    121
stalk-root                      31
stalk-surface-above-ring        31
stalk-surface-below-ring        31
veil-type                       62
ring-number                     62
ring-type                       62
spore-print-color               56
population                      56
habitat                         31
cap-color-rate                  27
gill-color-rate                121
veil-color-rate                 62
stalk-color-above-ring-rate     31
stalk-color-below-ring-rate     62
dtype: int64

## 2. Drop rows where the target (label) variable is missing.

In [5]:
df.dropna(subset=["label"], inplace=True)

In [6]:
df.isnull().sum()

id                               0
label                            0
cap-shape                        0
cap-surface                     27
bruises                         99
odor                            99
gill-attachment                 99
gill-spacing                   130
gill-size                      121
stalk-shape                    121
stalk-root                      31
stalk-surface-above-ring        31
stalk-surface-below-ring        31
veil-type                       62
ring-number                     62
ring-type                       62
spore-print-color               56
population                      56
habitat                         31
cap-color-rate                  27
gill-color-rate                121
veil-color-rate                 62
stalk-color-above-ring-rate     31
stalk-color-below-ring-rate     62
dtype: int64

## 3. Drop the following variables:

In [7]:
drop_cols = ['id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate','stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type']
df.drop(columns=drop_cols, axis=1, inplace=True)

## 4. Examine the number of rows, the number of digits, and whether any are missing.

In [8]:
df.isnull().sum()

label                  0
cap-shape              0
cap-surface           27
bruises               99
odor                  99
stalk-shape          121
ring-number           62
ring-type             62
spore-print-color     56
population            56
habitat               31
cap-color-rate        27
dtype: int64

In [9]:
df.shape

(5764, 12)

## 5. Fill missing values by adding the mean for numeric variables and the mode for nominal variables.



In [10]:
df['cap-color-rate'] = df['cap-color-rate'].fillna(df['cap-color-rate'].mean())

In [11]:
df = df.fillna(df.mode().iloc[0])

In [12]:
df.isnull().sum()

label                0
cap-shape            0
cap-surface          0
bruises              0
odor                 0
stalk-shape          0
ring-number          0
ring-type            0
spore-print-color    0
population           0
habitat              0
cap-color-rate       0
dtype: int64

## 6. Convert the label variable e (edible) to 1 and p (poisonous) to 0 and check the quantity. class0: class1

In [13]:
df.loc[df['label'] == 'e', 'label'] = 1
df.loc[df['label'] == 'p', 'label'] = 0

In [14]:
df.label.value_counts()

0    3660
1    2104
Name: label, dtype: int64

## 7. Convert the nominal variable to numeric using a dummy code with drop_first = True.

In [15]:
selected_columns = df.iloc[:, ~df.columns.isin(['label', 'cap-color-rate'])]

In [16]:
dummy_df = pd.get_dummies(df[selected_columns.columns], drop_first=True)
df = df.drop(columns=selected_columns.columns, axis=1)
df = pd.concat([df,dummy_df], axis=1)
df

Unnamed: 0,label,cap-color-rate,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,bruises_t,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1.0,0,0,0,1,0,1,0,1,...,0,1,0,0,0,0,0,0,1,0
1,1,2.0,0,0,0,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,0
2,1,3.0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,1,0,0,0
3,0,3.0,0,0,0,1,0,0,1,1,...,0,1,0,0,0,0,0,0,1,0
4,1,4.0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819,1,1.0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5820,1,1.0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
5821,1,1.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5822,0,1.0,0,0,1,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


## 8. Split train/test with 20% test, stratify, and seed = 2020.

In [17]:
from sklearn.model_selection import train_test_split
y = df.pop('label')
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=2020)

## 9. Create a Random Forest with GridSearch on training data with 5 CV.

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

random_forest = RandomForestClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 6],
    'min_samples_leaf': [2, 5, 10],
    'n_estimators': [100, 200],
    'random_state': [2020]
}

grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5)

# fit the model to the training data
grid_search.fit(X_train, y_train)

# get the parameters
best_params = grid_search.best_params_
print(best_params)

180 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 367, in fit
    y, expanded_class_weight = self._validate_y_class_weight(y)
  File "/opt/homebrew/Caskroom/miniconda/base/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 734, in _validate_y_class_weight
    check_classification_targets(y)
  File "/opt/homebrew/Caskr

ValueError: Unknown label type: 'unknown'

## 10. Predict the testing data set with confusion_matrix and classification_report.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

best_rf = RandomForestClassifier(**best_params)

best_rf.fit(X_train, y_train)

y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

NameError: name 'best_params' is not defined