### Mushroom Classification using SVM

#### Based on the features on this dataset, we are predicting whether a mushroom type is edible or poisonous

#### No. of features: 22
#### No. of rows: 8124

#### Class Label: e (edible) or p (poisonous)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [7]:
data = pd.read_csv('mushrooms.csv')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class                       8124 non-null object
cap-shape                   8124 non-null object
cap-surface                 8124 non-null object
cap-color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill-attachment             8124 non-null object
gill-spacing                8124 non-null object
gill-size                   8124 non-null object
gill-color                  8124 non-null object
stalk-shape                 8124 non-null object
stalk-root                  8124 non-null object
stalk-surface-above-ring    8124 non-null object
stalk-surface-below-ring    8124 non-null object
stalk-color-above-ring      8124 non-null object
stalk-color-below-ring      8124 non-null object
veil-type                   8124 non-null object
veil-color                  8124 non-null object
ring-number

In [9]:
data.isnull().any()

class                       False
cap-shape                   False
cap-surface                 False
cap-color                   False
bruises                     False
odor                        False
gill-attachment             False
gill-spacing                False
gill-size                   False
gill-color                  False
stalk-shape                 False
stalk-root                  False
stalk-surface-above-ring    False
stalk-surface-below-ring    False
stalk-color-above-ring      False
stalk-color-below-ring      False
veil-type                   False
veil-color                  False
ring-number                 False
ring-type                   False
spore-print-color           False
population                  False
habitat                     False
dtype: bool

#### We will convert our categorical features into numerical features using LabelEncoder

In [10]:
from sklearn.svm import SVC 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split

In [11]:
le = LabelEncoder()
data_tf = data.copy()
for i in data.columns:
    data_tf[i]=le.fit_transform(data[i])

In [12]:
features = data_tf.drop(['class'], axis=1)
label = data_tf['class']

#### Initially we try classification using linear kernel

In [13]:
clf = SVC(kernel='linear') 

In [14]:
label_new = label.values

feature_train,feature_test,label_train,label_test = train_test_split(features,label_new,test_size=0.7,random_state=20)

clf.fit(feature_train,label_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
prediction = clf.predict(feature_test)

In [16]:
accuracyscore = accuracy_score(prediction,label_test)

In [17]:
accuracyscore

0.9590293652189203

In [18]:
cm = confusion_matrix(prediction,label_test)

In [19]:
cm

array([[2805,  122],
       [ 111, 2649]], dtype=int64)

In [20]:
cr = classification_report(prediction,label_test)

In [22]:
print(cr)

             precision    recall  f1-score   support

          0       0.96      0.96      0.96      2927
          1       0.96      0.96      0.96      2760

avg / total       0.96      0.96      0.96      5687



#### Accuracy using linear kernel = 96%

#### Now, Let's try classification using rbf kernel

In [23]:
clf = SVC(kernel='rbf') 

In [24]:
label_new = label.values

feature_train,feature_test,label_train,label_test = train_test_split(features,label_new,test_size=0.7,random_state=20)

clf.fit(feature_train,label_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
prediction = clf.predict(feature_test)

In [26]:
accuracyscore = accuracy_score(prediction,label_test)

In [27]:
accuracyscore

0.9985932829259715

#### Accuracy using linear kernel = 99%

In [28]:
cm = confusion_matrix(prediction,label_test)

In [29]:
cm

array([[2913,    5],
       [   3, 2766]], dtype=int64)

In [30]:
cr = classification_report(prediction,label_test)

In [31]:
print(cr)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2918
          1       1.00      1.00      1.00      2769

avg / total       1.00      1.00      1.00      5687

