#  The Mushroom Classification Project

 The goal of this dataset is to classify between edible (e) to poisonous (p) mushrooms.

Let's first import the libraries we need, and import the data

In [1]:
from pandas import read_csv, get_dummies, Series
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot

"""Importing the Classification Algorithms"""

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

filename = 'mushrooms.csv'
Dataset = read_csv(filename)

The list and types of the features is:

In [2]:
Types = Dataset.dtypes
Values_Table = Dataset.count()

In [3]:
print(Types)

class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object


In [4]:
print(Values_Table)

class                       8124
cap-shape                   8124
cap-surface                 8124
cap-color                   8124
bruises                     8124
odor                        8124
gill-attachment             8124
gill-spacing                8124
gill-size                   8124
gill-color                  8124
stalk-shape                 8124
stalk-root                  8124
stalk-surface-above-ring    8124
stalk-surface-below-ring    8124
stalk-color-above-ring      8124
stalk-color-below-ring      8124
veil-type                   8124
veil-color                  8124
ring-number                 8124
ring-type                   8124
spore-print-color           8124
population                  8124
habitat                     8124
dtype: int64


Checking if there are any missing values

In [5]:
Missing_Values_Table = Dataset.isnull().sum() 
print(Missing_Values_Table)

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


Looks like nothing is missing

## The Encoding Part

All the data is categorical, so encoding is required

In [6]:
encoder = LabelEncoder()

I'm going to put the class column in a temporary variable and drop it

In [7]:
temp = Dataset['class']
Dataset = Dataset.drop(['class'], axis=1)

And then I'm going to apply the hot encoder

In [8]:
Dataset = get_dummies(Dataset)

and then reinsert he class colum because this one requires a LabelEncoder

In [9]:
Dataset['class']=temp
Dataset['class']=encoder.fit_transform(Dataset['class'])

## The machine learning Part

In [10]:
array = Dataset.values
X = array[:,0:-1]
Y = array[:,-1]

validation_size = 0.2
seed = 7

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y,
        test_size=validation_size, random_state=seed)

In [11]:
# prepare models
models = []
models.append(('LR', LogisticRegression(solver = 'liblinear')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma = 'auto')))

# Evaluate each model in turn

results = []
names = []
n_folds = 10
scoring = 'accuracy'
for name,model in models:
    kfold = KFold(n_splits = n_folds, random_state = seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {cv_results.mean()}, {cv_results.std()}')

LR: 1.0, 0.0
KNN: 1.0, 0.0
CART: 1.0, 0.0
NB: 0.9556840109043498, 0.008868151093461476
SVM: 0.9978459167950694, 0.0010203572261229083


In [12]:
# prepare the model
model = LogisticRegression(solver = 'liblinear')
model.fit(X_train, Y_train)
# estimate accuracy on validation dataset
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

print((model.coef_).shape)

1.0
[[840   0]
 [  0 785]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       840
           1       1.00      1.00      1.00       785

   micro avg       1.00      1.00      1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

(1, 117)
