In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from ucimlrepo import fetch_ucirepo 

In [4]:
# Fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# Save data as X and y variables
X = mushroom.data.features 
y = np.ravel(mushroom.data.targets)

# Expand dataframe columns and look at view dataframe
pd.set_option('display.max_columns', None)
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


## Encoding Data

In [7]:
# Factorize all columns
for col in X.columns:
    X.loc[:,col] = pd.factorize(X[col], sort = True)[0]# locate all the rows 

# View first few rows of encoded data
x.iloc[0:5, 0:5]

Time to Impute!

In [8]:
# Check for NAs
X.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [10]:
# Create copy of X variables
X_Na = X.copy()

To see which imputation methods perform best by comparing their results to actual dataset

In [11]:
# Randomly assigns 10% of new dataframe with NA values
for col in X_Na.columns:
    # for rows that you select putting na values in 
    X_Na.loc[X_Na.sample(frac = 0.1).index, col] = np.nan


In [14]:
X_Na.isna().sum()

cap-shape                   812
cap-surface                 812
cap-color                   812
bruises                     812
odor                        812
gill-attachment             812
gill-spacing                812
gill-size                   812
gill-color                  812
stalk-shape                 812
stalk-root                  812
stalk-surface-above-ring    812
stalk-surface-below-ring    812
stalk-color-above-ring      812
stalk-color-below-ring      812
veil-type                   812
veil-color                  812
ring-number                 812
ring-type                   812
spore-print-color           812
population                  812
habitat                     812
dtype: int64

Now that we have NAs we want to impute

### Impution method #1: Filling NA values with the mode of each column

In [16]:
# Impute with mode
X_mode_impute = X_Na.fillna(X_Na.mode().iloc[0])

# Check to make sure there are no NAs
X_mode_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

### Impution method #2: Filling NA values with the median of each column

In [19]:
# ---- Impute with the median ----

# Initialize the imputer
median_impute = SimpleImputer(strategy = 'median')
X_median_impute = median_impute.fit_transform(X_Na)
X_median_impute = pd.DataFrame(X_median_impute, columns = X.columns)

# Check to make sure there are no NAs
X_median_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [22]:
# ---- Impute with KNN, majority class of neighbors ----

# Initialize the imputer
knn_impute = KNNImputer(n_neighbors = 5)
X_knn_impute = knn_impute.fit_transform(X_Na)
X_knn_impute = pd.DataFrame(X_knn_impute, columns = X.columns)

# Check to make sure that there are no NAs
X_knn_impute.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

### Lets see which one of these performed best using MSE

In [23]:
# --- Use MSE for model performance ----
mse_mode = mean_squared_error(X, X_mode_impute)
mse_median = mean_squared_error(X, X_median_impute)
mse_knn = mean_squared_error(X, X_knn_impute)

print("Mode Performance: ", mse_mode)
print("Median Performance: ", mse_median)
print("KNN Performance: ", mse_knn)

Mode Performance:  0.44865941542455573
Median Performance:  0.2537710935052146
KNN Performance:  0.12202609551944855


### Bagging classifier with og data

In [24]:
# Split actual data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [27]:
# Number of features to include for tuning
num_features = [1,4,7,10,13,16,19,22]
accuracy = []

for feature in num_features:
    rf_classifier = RandomForestClassifier(
        n_estimators = 50,
        max_depth = 3,
        random_state = 42,
        max_features = feature
    )

    rf_classifier.fit(X_train, y_train)
    
    # Predict and evaluate results
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)
    
    print("Number of features: ", feature, "Random Forest Accuracy: ", rf_accuracy)

Number of features:  1 Random Forest Accuracy:  0.916735028712059
Number of features:  4 Random Forest Accuracy:  0.9848236259228876
Number of features:  7 Random Forest Accuracy:  0.9868744872846595
Number of features:  10 Random Forest Accuracy:  0.9835931091058244
Number of features:  13 Random Forest Accuracy:  0.9823625922887613
Number of features:  16 Random Forest Accuracy:  0.9860541427399507
Number of features:  19 Random Forest Accuracy:  0.9819524200164069
Number of features:  22 Random Forest Accuracy:  0.9577522559474979


In [32]:
# Split imputed data
X_train, X_Test, y_train, y_test = train_test_split(X_mode_impute, y, test_size = 0.3, random_state = 42)

# Number of features to include for tuning
num_features = [1,4,7,10,13,16,19,22]
accuracy = []

for feature in num_features:
    rf_classifier = RandomForestClassifier(
        n_estimators = 50,
        max_depth = 3,
        random_state = 42,
        max_features = feature
    )

    rf_classifier.fit(X_train, y_train)
    
    # Predict and evaluate results
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)
    
    print("Number of features: ", feature, "Random Forest Accuracy: ", rf_accuracy)

Number of features:  1 Random Forest Accuracy:  0.916735028712059
Number of features:  4 Random Forest Accuracy:  0.9848236259228876
Number of features:  7 Random Forest Accuracy:  0.9827727645611156
Number of features:  10 Random Forest Accuracy:  0.9811320754716981
Number of features:  13 Random Forest Accuracy:  0.9762100082034455
Number of features:  16 Random Forest Accuracy:  0.9770303527481542
Number of features:  19 Random Forest Accuracy:  0.9622641509433962
Number of features:  22 Random Forest Accuracy:  0.9425758818703855
