In [42]:
# Attach Google Drive to the Colab runtime so files stored there can be read
from google.colab import drive

MOUNT_POINT = '/content/mydrive'
drive.mount(MOUNT_POINT)


Drive already mounted at /content/mydrive; to attempt to forcibly remount, call drive.mount("/content/mydrive", force_remount=True).


In [67]:
# Importing all of the necessary libraries
import pandas as pd
import numpy as np


# Preprocessing and model evaluation libraries
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Modeling libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


In [44]:
# Load the mushroom CSV from the mounted Drive folder into a DataFrame
DATASET_PATH = '/content/mydrive/MyDrive/Data - Science Projects/Mushroom - Poison/mushrooms-full-dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [45]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [46]:
# Structural summary of the frame: column names, non-null counts and dtypes.
# Every column listed is of dtype object, i.e. the features are categorical strings.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   poisonous                 8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-surface-above-ring  8124 non-null   object
 12  stalk-surface-below-ring  8124 non-null   object
 13  stalk-color-above-ring    8124 non-null   object
 14  stalk-color-below-ring  

In [47]:
# Count missing values per column (isnull is an alias of isna)
df.isnull().sum()

poisonous                   0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [48]:
# Report the dimensions of the dataset.
# df.shape is a (rows, columns) tuple; index 0 is the row count, index 1 the column count.
x = df.shape
print("Data Frame contains ", x[0], " rows")
print("Data Frame contains ", x[1], " columns")  # fixed typo: "conatins" -> "contains"

Data Frame contains  8124  rows
Data Frame conatins  22  columns


In [49]:
# Per-column summary (count, unique, top, freq), transposed so each feature is a row
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
poisonous,8124,2,e,4208
cap-shape,8124,6,x,3656
cap-surface,8124,4,y,3244
cap-color,8124,10,n,2284
bruises,8124,2,f,4748
odor,8124,9,n,3528
gill-attachment,8124,2,f,7914
gill-spacing,8124,2,c,6812
gill-size,8124,2,b,5612
gill-color,8124,12,b,1728


In [50]:
# Class balance of the target column
class_counts = df['poisonous'].value_counts()
class_counts

e    4208
p    3916
Name: poisonous, dtype: int64

In [51]:
# Separate the input features (every column except the label) from the target label
X = df.drop(columns='poisonous')
y = df['poisonous']

In [52]:
# Show the dimensions of the feature matrix and of the target vector
for label, part in (("Shape of input Data", X), ("Shape of target Variable", y)):
    print(label, part.shape)

Shape of input Data (8124, 21)
Shape of target Variable (8124,)


In [54]:
# List the names of all columns, including the target column 'poisonous'
df.columns

Index(['poisonous', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'],
      dtype='object')

In [53]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [58]:
# Create the encoders: LE for the target labels, OE for the categorical input columns
LE, OE = LabelEncoder(), OrdinalEncoder()

In [59]:
# Encode the categorical inputs and the target as integers.
# Both encoders are fitted on the training split only; the test split is then
# transformed with the mappings learned there, so train and test share a single
# category -> integer mapping and nothing is learned from the held-out data.
X_train_encoded = OE.fit_transform(X_train)
X_test_encoded = OE.transform(X_test)

y_train_encoded = LE.fit_transform(y_train)
# transform (not fit_transform): re-fitting on the test labels could silently
# assign different integers if the test split held a different set of classes.
y_test_encoded = LE.transform(y_test)


In [63]:
# Min-max scaler that rescales each encoded feature to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))


In [65]:

# Learn the per-feature min/max from the training data, then apply that same
# scaling to both the training and the test features
X_train_scaled = scaler.fit(X_train_encoded).transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


# Modeling

1. Logistic Regression

In [69]:
# Fit logistic regression on the scaled training features and score it on the test split
LR = LogisticRegression().fit(X_train_scaled, y_train_encoded)
y_pred = LR.predict(X_test_scaled)

lr_accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy for Logistic Regression : ", lr_accuracy)



Accuracy for Logistic Regression :  0.9511894995898277


2. Decision Tree Classifier

In [70]:
# Fit a decision tree on the scaled training features and score it on the test split.
# A fixed random_state makes the fitted tree (and therefore the reported accuracy)
# reproducible across runs; without it scikit-learn permutes the candidate
# features randomly at every split.
DTC = DecisionTreeClassifier(random_state=42)
DTC.fit(X_train_scaled, y_train_encoded)
y_pred = DTC.predict(X_test_scaled)

print("Accuracy for Decision Tree Classifier : ", accuracy_score(y_test_encoded, y_pred))

Accuracy for Decision Tree Classifier :  1.0


3. Support Vector Classifier

In [72]:
# Fit a linear-kernel support vector classifier (C=1) and score it on the test split
SV = SVC(C=1, kernel='linear').fit(X_train_scaled, y_train_encoded)
y_pred = SV.predict(X_test_scaled)

svc_accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy for Support Vector Classifier : ", svc_accuracy)



Accuracy for Support Vector Classifier :  0.9524200164068909
