# Introduction to classification

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt 


# Data Exploration

In [4]:
data = pd.read_csv('zoo.csv',index_col=0)
data.head()

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal


In [16]:
data.isnull().sum()

hair        0
feathers    0
eggs        0
milk        0
airborne    0
aquatic     0
predator    0
toothed     0
backbone    0
breathes    0
venomous    0
fins        0
legs        0
tail        0
domestic    0
catsize     0
type        0
typeNum     0
dtype: int64

# After understanding data

        After reading the data we could understand that out target varable is categorical so , we are converting it into ordinal

In [6]:
data.type.unique()

array(['mammal', 'fish', 'bird', 'invertebrate', 'insect', 'amphibian',
       'reptile'], dtype=object)

In [15]:
data.type.value_counts()

mammal          41
bird            20
fish            13
invertebrate    10
insect           8
reptile          5
amphibian        4
Name: type, dtype: int64

# Data Preprocessing

In [77]:
data['typeNum'] = 0
data.typeNum    =   np.where(data['type']=='amphibian', 0,
                    np.where(data['type']=='bird', 1,
                    np.where(data['type']=='fish', 2,
                    np.where(data['type']=='insect', 3,
                    np.where(data['type']=='invertebrate', 4,
                    np.where(data['type']=='mammal', 5, 6))))))

In [72]:
data.typeNum.value_counts()

5    41
1    20
2    13
4    10
3     8
6     5
0     4
Name: typeNum, dtype: int64

In [20]:
independent = data.drop(['type', 'typeNum'],axis=1)

Input = independent.values
Output = data.typeNum.values

In [22]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(Input, Output, 
                                                    test_size=0.2, random_state=2, 
                                                    shuffle=True, stratify=data['typeNum'])

In [25]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion='gini', splitter='best')
model.fit(x_train, y_train)

DecisionTreeClassifier()

In [26]:
TrainPred = model.predict(x_train)
TestPred = model.predict(x_test)


# Model Evaluation

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

        Training Accuracy

In [29]:
print(accuracy_score(y_train, TrainPred))

1.0


In [34]:
print(confusion_matrix(y_train, TrainPred))

[[ 3  0  0  0  0  0  0]
 [ 0 16  0  0  0  0  0]
 [ 0  0 10  0  0  0  0]
 [ 0  0  0  6  0  0  0]
 [ 0  0  0  0  8  0  0]
 [ 0  0  0  0  0 33  0]
 [ 0  0  0  0  0  0  4]]


        Testing Accuracy

In [36]:
print(accuracy_score(y_test, TestPred))

0.9523809523809523


In [37]:
print(confusion_matrix(y_test, TestPred))

[[1 0 0 0 0 0 0]
 [0 4 0 0 0 0 0]
 [0 0 3 0 0 0 0]
 [0 0 0 1 1 0 0]
 [0 0 0 0 2 0 0]
 [0 0 0 0 0 8 0]
 [0 0 0 0 0 0 1]]


# Saving the result in CSV

In [40]:
header = data.drop(['type','typeNum'], axis=1).columns.values
header

array(['hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic',
       'predator', 'toothed', 'backbone', 'breathes', 'venomous', 'fins',
       'legs', 'tail', 'domestic', 'catsize'], dtype=object)

In [50]:
result = pd.concat([pd.DataFrame(x_test, columns=header), pd.Series(y_test, name='type'), pd.Series(TestPred, name='pred_type')], axis=1)
result.to_csv('zoo-predict.csv', index_label='SNo')

In [51]:
!ls

lesson_01.ipynb zoo-predict.csv zoo.csv


In [52]:
res = pd.read_csv('zoo-predict.csv')
res.head()

Unnamed: 0,SNo,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type,pred_type
0,0,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,2,2
1,1,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,0,6,6
2,2,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,1,1
3,3,1,0,0,1,0,1,1,1,1,1,0,1,0,0,0,1,5,5
4,4,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,1,1


# Saving the model and PDF

In [67]:
from sklearn import tree
import graphviz

# data.typeNum    =   np.where(data['type']=='amphibian', 0,
#                     np.where(data['type']=='bird', 1,
#                     np.where(data['type']=='fish', 2,
#                     np.where(data['type']=='insect', 3,
#                     np.where(data['type']=='invertebrate', 4,
#                     np.where(data['type']=='mammal', 5, 6))))))

saved = tree.export_graphviz(model, out_file=None, feature_names=header, 
                                                    class_names=['amphibian', 'bird', 'fish', 'insect', 'invertebrate', 'mammal', 'reptile'], 
                                                    filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(saved)
graph.render("zoo_model")

'zoo_model.pdf'