# "Mushroom Classification"

- toc: true
- badges: true
- comments: true

## Introduction

## Setup

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, precision_score, recall_score, accuracy_score, f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Path of the mushroom dataset CSV inside the Kaggle input directory
# (despite the name, this is a file path, not a directory).
file_dir = "/kaggle/input/mushroom-classification/mushrooms.csv"

In [None]:
# Check the file's size on disk before loading it.
!ls -lh {file_dir}

In [None]:
# Peek at the first raw lines of the file.
!head {file_dir}

**Observations**

* The dataset file size is 366 KB.
* It will be safe to import the whole dataset.
* The prediction class is the first column.
* There appears to be no index column.

In [None]:
# Show every column whenever a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Read the whole CSV into memory and preview the first rows.
df = pd.read_csv(file_dir)
df.head()

## EDA and Data Preparation

In [None]:
# Separate the target column ("class") from the features. A copy is used so
# that `df` itself keeps the "class" column for the EDA cells below.
X = df.copy()
y = X.pop("class")

# Hold out 20% of the rows for the final evaluation. The seed is fixed (42, the
# value used for the estimators in this notebook) so that Restart & Run All
# reproduces the same split and therefore the same scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Count how many rows belong to each prediction class.

class_vc = df["class"].value_counts()
class_vc

In [None]:
# Bar chart of the class counts, one bar per class label.
sns.barplot(x = class_vc.index, y = class_vc)

The class counts are not heavily imbalanced.

Distribution of features

In [None]:
# For every column of the dataset (the target "class" included), print the
# value counts and draw them as a bar chart.
for col in df.columns:
    feature_vc = df[col].value_counts()
    print(feature_vc, "\n_________\n")

    # Explicit figure/axes with a title and axis labels, so each chart can be
    # read on its own when the notebook is skimmed.
    fig, ax = plt.subplots()
    sns.barplot(x = feature_vc.index, y = feature_vc, ax = ax)
    ax.set(title = f"Value counts: {col}", xlabel = col, ylabel = "count")

    # Render the chart directly below its printed counts, then close it so
    # open figures do not accumulate (one per column) while the loop runs.
    plt.show()
    plt.close(fig)

In [None]:
# List the distinct values of "veil-type" to see how much variation the column carries.
df["veil-type"].unique()

In [None]:
# Number of missing (null) values in each column.
df.isnull().sum()

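We also did not need to stratify the train/test split.

**TODO:** a stray, incomplete model entry was left in this cell: `"HistGradientBoosterClassifier": HistGradientBoostingClassifier(learning_rate = 0.05, random_state = 42, scoring=)`. The `scoring=` argument has no value. Either complete it and move it into the `models` dictionary in the *Model Selection* section, or remove it.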

## Train Baseline Model and Evaluate Results

In [None]:
# One-hot encode the categorical training features.
#   drop = "first"            -> one indicator column is dropped per feature
#   handle_unknown = "ignore" -> categories unseen during fit do not raise
# The `sparse` keyword is deliberately not passed: it was deprecated in
# scikit-learn 1.2 (renamed `sparse_output`) and removed in 1.4, where passing
# it raises a TypeError. The encoder's default sparse result is converted to a
# dense array instead, which works on every version and is required by
# estimators such as GaussianNB that do not accept sparse input.
ohe = OneHotEncoder(drop = "first", handle_unknown = "ignore")
ohe_train_data = ohe.fit_transform(X_train).toarray()

In [None]:
# Encode the class labels as integers. The encoder is fitted on the training
# labels only and then reused for the test labels, so both splits share the
# same label -> integer mapping.
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)

# `y_test_le` is needed by the recall, F1 and ROC cells at the end of the
# notebook; it was never defined, so those cells failed with a NameError.
y_test_le = le.transform(y_test)

In [None]:
# Baseline: a classifier that always predicts the most frequent class,
# cross-validated on the encoded training data.
base_model = DummyClassifier(strategy = "most_frequent", random_state = 42)
base_cross_val = cross_validate(
    base_model,
    X = ohe_train_data,
    y = y_train_le,
    scoring = ["accuracy", "precision", "recall", "f1", "roc_auc"],
)

In [None]:
# Display the baseline's cross-validation results.
base_cross_val

## Model Selection

In [None]:
# Candidate classifiers, keyed by the name shown in the comparison table.
# Every estimator that takes a seed uses random_state = 42.
models = {
    "LogisticRegression": LogisticRegression(random_state = 42),
    "RidgeClassification": RidgeClassifier(random_state = 42),
    "GaussianNB": GaussianNB(),
    "RandomForestClassifier": RandomForestClassifier(n_estimators = 70, random_state = 42),
    # `scoring` is not an XGBClassifier parameter (XGBoost only warns that it
    # is unused); the evaluation metric is configured through `eval_metric`.
    "XGBClassifier": XGBClassifier(
        n_estimators = 70,
        objective = "binary:logistic",
        learning_rate = 0.05,
        n_jobs = -1,
        eval_metric = "auc",
        random_state = 42,
    ),
}

In [None]:
# Cross-validate every candidate model and collect its per-fold metric arrays.
model_scores = {}

for model_name, model in models.items():
    cross_val = cross_validate(
        model,
        ohe_train_data,
        y_train_le,
        scoring = ["accuracy", "precision", "recall", "f1", "roc_auc"],
        n_jobs = -1,
    )

    # Keep everything except the two timing entries.
    model_scores[model_name] = {
        key: values
        for key, values in cross_val.items()
        if key not in ("fit_time", "score_time")
    }

In [None]:
# Build the comparison table: one row per metric, one column per model, each
# cell holding the mean of that metric over the cross-validation folds.
#
# The mean is taken per score array *before* the frame is built. Building the
# frame first leaves a whole array of fold scores in every cell, and calling
# Series.mean() on such a column adds those arrays element-wise - i.e. it
# averages across the metrics for each fold rather than across the folds for
# each metric, and the result only fits back into the column when the number
# of folds happens to equal the number of metrics.
model_scores_df = pd.DataFrame({
    model_name: {metric: np.mean(fold_scores) for metric, fold_scores in scores.items()}
    for model_name, scores in model_scores.items()
})

In [None]:
# Display the model comparison table.
model_scores_df

## Train Final Model and Make Predictions

In [None]:
# Final model: a random forest wrapped in a pipeline together with its own
# one-hot encoder, so the raw feature frames can be passed to fit/predict.
forest_clas = RandomForestClassifier(random_state = 42)

# `sparse = False` is not passed: the keyword was deprecated in scikit-learn
# 1.2 and removed in 1.4, where it raises a TypeError. RandomForestClassifier
# accepts the encoder's default sparse output directly.
ohe = OneHotEncoder(drop = "first", handle_unknown = "ignore")
forest_pipe = make_pipeline(ohe, forest_clas)

# Train on the raw training features and the integer-encoded labels.
forest_pipe.fit(X_train, y_train_le)

In [None]:
# Predict integer-encoded labels for the hold-out features, then map them
# back to the original class labels with the fitted LabelEncoder.
preds = forest_pipe.predict(X_test)
preds_in = le.inverse_transform(preds)
preds_in

In [None]:
# Confusion matrix of true vs. predicted labels on the hold-out set (original, un-encoded labels).
ConfusionMatrixDisplay.from_predictions(y_test, preds_in)

In [None]:
# Accuracy on the hold-out set, computed on the original (un-encoded) labels.
accuracy_score(y_test, preds_in)

In [None]:
# Recall on the hold-out set; `y_test_le` is expected to hold the label-encoded `y_test`.
recall_score(y_test_le, preds)

In [None]:
# F1 score on the hold-out set; `y_test_le` is expected to hold the label-encoded `y_test`.
f1_score(y_test_le, preds)

In [None]:
# ROC curve of the fitted pipeline on the hold-out set; `y_test_le` is expected to hold the label-encoded `y_test`.
RocCurveDisplay.from_estimator(forest_pipe, X_test, y_test_le)

## Summary and Conclusion