## Notebook

Data credit: https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset

Secondary Mushroom Dataset by Dennis Wagner, D. Heider, and Georges Hattab. 2021
Courtesy of UC Irvine

In [1]:
# temporary code block for static csv
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
# the raw file is semicolon-separated, not comma-separated
df = pd.read_csv("secondary_data.csv", sep=";")

In [None]:
# import dependencies
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sqlalchemy import create_engine
import pandas as pd

In [None]:
# run sql engine and query data
# NOTE: the file path and the table name below are both placeholders to fill in.
# `table` is a reserved SQL keyword, so the query only works once it has been
# replaced with the real table name.
dataset = 'sql dataset file path goes here'
engine = create_engine(f"sqlite:///{dataset}")

# the context manager closes the connection as soon as the query result has been
# read, instead of leaving it open for the rest of the kernel session
with engine.connect() as conn:
    df = pd.read_sql("SELECT * FROM table", conn)

In [2]:
df # view the dataset; large frames are rendered truncated (first/last rows, middle columns elided)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.60,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.80,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,s,y,f,f,f,f,3.93,...,,,y,,,f,f,,d,a
61065,p,1.27,f,s,y,f,f,f,f,3.18,...,,,y,,,f,f,,d,a
61066,p,1.27,s,s,y,f,f,f,f,3.86,...,,,y,,,f,f,,d,u
61067,p,1.24,f,s,y,f,f,f,f,3.56,...,,,y,,,f,f,,d,u


In [3]:
# Overview of every column: name, dtype, number of distinct values (a missing
# value counts as one distinct value), and number of missing entries.
summary_data = [
    (name, column.dtype, column.unique().size, int(column.isna().sum()))
    for name, column in df.items()
]

summary_table = pd.DataFrame(
    summary_data,
    columns=['Column', 'Data Type', 'Unique Values', 'Missing Values'],
)

# share of rows that are missing, stored as a fraction (0-1) rounded to two decimals
n_rows = len(df)
summary_table['Percentage'] = (summary_table['Missing Values'] / n_rows).round(2)

# most incomplete columns first
summary_table.sort_values('Missing Values', ascending=False)

Unnamed: 0,Column,Data Type,Unique Values,Missing Values,Percentage
14,veil-type,object,2,57892,0.95
18,spore-print-color,object,8,54715,0.9
15,veil-color,object,7,53656,0.88
11,stem-root,object,6,51538,0.84
12,stem-surface,object,9,38124,0.62
7,gill-spacing,object,4,25063,0.41
3,cap-surface,object,12,14120,0.23
6,gill-attachment,object,8,9884,0.16
17,ring-type,object,9,2471,0.04
0,class,object,2,0,0.0


In [4]:
# dropping columns with excessive missing values (30,000 or more)
# The columns are selected by their missing-value count instead of a hard-coded
# list so that this cell can be re-run safely: once the sparse columns are gone
# nothing matches the threshold and the drop is a no-op rather than a KeyError.
# On the loaded data this selects stem-root, stem-surface, veil-type, veil-color
# and spore-print-color.
MISSING_THRESHOLD = 30_000
sparse_columns = df.columns[df.isna().sum() >= MISSING_THRESHOLD]
df = df.drop(columns=sparse_columns)

# dropping remaining rows with na values
df = df.dropna()

In [5]:
# feature matrix = every column except the target; rows are renumbered 0..n-1 so
# that the frames built below share one index and line up in the final concat
X = df.drop(columns='class').reset_index(drop=True)

# object-typed columns are treated as categorical, everything else as numeric
X_numeric = X.select_dtypes(exclude='O')
X_categorical = X.select_dtypes(include='O')

# standardise the numeric features and wrap the result back into a labelled frame
X_numeric_scaled = pd.DataFrame(
    StandardScaler().fit_transform(X_numeric),
    columns=X_numeric.columns,
)

# one-hot encode the categorical features as 0/1 integers
dummies = pd.get_dummies(X_categorical, dtype=int)

# final design matrix: encoded categoricals followed by the scaled numerics
X_clean = pd.concat([dummies, X_numeric_scaled], axis=1)

In [29]:
# Instantiate PCA and fit the model
# random_state is fixed so the projection is repeatable across runs whenever
# scikit-learn selects a randomized SVD solver for this data size.
pca = PCA(n_components=5, random_state=42)
X_pca = pca.fit_transform(X_clean)

# preview the first five projected rows (one column per principal component)
X_pca[:5]

array([[ 1.52215955,  2.67621844, -0.20309788, -0.06865332,  0.67264843],
       [ 1.82588878,  2.90185199, -0.25965988, -0.16814456,  0.54183091],
       [ 2.20352222,  2.88626295, -0.10225533, -0.44953597,  0.45155455],
       [ 1.78727167,  2.58686349, -0.09509045, -0.36553046,  0.45029519],
       [ 1.90265996,  2.78831312, -0.11170066, -0.37999532,  0.4837939 ]])

In [30]:
# Show the fraction of total variance explained by each fitted principal component
# (the attribute is populated when the PCA is fitted; nothing is recomputed here)
pca.explained_variance_ratio_

array([0.20582689, 0.09903994, 0.05742793, 0.05001076, 0.04627044])

In [31]:
# splitting training and testing for the regular dataset and the pca dataset
# Both feature sets and the target are split in ONE call so that they share the
# same row partition. Two separate unseeded calls would shuffle differently, and
# the second call would overwrite y_train / y_test, leaving X_train / X_test
# paired with labels that belong to other rows.
# random_state is fixed so the partition is reproducible across runs.
X_train, X_test, X_pca_train, X_pca_test, y_train, y_test = train_test_split(
    X_clean, X_pca, df['class'], random_state=42
)