## Problem Statement :  Predict the survival of a horse based on various observed medical conditions. Load the data from “horses.csv” and observe whether it contains missing values. The dataset contains many categorical features; replace them with label encoding. Replace the missing values by the most frequent value in each column. Fit a decision tree classifier and random forest classifier, and observe the accuracy.

#### Objective :  Learn to fit a decision tree, and compare its accuracy with random forest classifier. 


In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
df=pd.read_csv('horse.csv')
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [4]:
df.shape

(299, 28)

In [5]:
df.columns

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2',
       'lesion_3', 'cp_data'],
      dtype='object')

In [6]:
df.dtypes

surgery                   object
age                       object
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
outcome                   object
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
dtype: object

In [9]:
df.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [10]:
target=df['outcome']
target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [11]:
df=df.drop(['outcome'],axis=1)

In [14]:
# as we categorical dont work with decison tree model so we create dummies
cat_var=['surgery', 'age',  'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
        'rectal_exam_feces', 'abdomen',
        'abdomo_appearance',
        'surgical_lesion', 
        'cp_data']
for i in cat_var:
    df[i]=pd.get_dummies(df[i])
    
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,0,0,530101,38.5,66.0,28.0,1,1,1,1,...,0,45.0,8.4,1,,0,11300,0,0,0
1,1,0,534817,39.2,88.0,20.0,1,1,1,1,...,1,50.0,85.0,1,2.0,0,2208,0,0,0
2,0,0,530334,38.3,40.0,24.0,1,1,1,1,...,1,33.0,6.7,1,,0,0,0,0,1
3,1,1,5290409,39.1,164.0,84.0,0,1,1,1,...,1,48.0,7.2,1,5.3,1,2208,0,0,1
4,0,0,530255,37.3,104.0,35.0,1,1,1,1,...,1,74.0,7.4,1,,0,4300,0,0,0


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
x,y=df.values,target.values
lc=LabelEncoder()
y=lc.fit_transform(y)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)

In [16]:
from sklearn.tree import DecisionTreeClassifier
print(x_train.shape)

(209, 27)


In [17]:
# imputing or filling most frequent values with missing nan values
from sklearn.impute import SimpleImputer
fl=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
x_train=fl.fit_transform(x_train)
x_test=fl.fit_transform(x_test)

In [19]:
classifier=DecisionTreeClassifier()

In [20]:
classifier.fit(x_train,y_train)

DecisionTreeClassifier()

In [21]:
y_pred=classifier.predict(x_test)

In [23]:
from sklearn.metrics import accuracy_score
ac=accuracy_score(y_pred,y_test)
print(ac)

0.6555555555555556


In [24]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_pred,y_test)
print(cm)# we had three outcomes in target varaible

[[16  1 14]
 [ 2  6  2]
 [10  2 37]]


In [25]:
from sklearn.ensemble import RandomForestClassifier
classifier2=RandomForestClassifier()

In [26]:
classifier2.fit(x_train,y_train)

RandomForestClassifier()

In [29]:
y_pred2=classifier2.predict(x_test)

In [30]:
from sklearn.metrics import accuracy_score
ac2=accuracy_score(y_pred2,y_test)
print(ac2)

0.7222222222222222


In [31]:
from sklearn.metrics import confusion_matrix
cm2=confusion_matrix(y_pred2,y_test)
print(cm2)

[[13  0  2]
 [ 1  1  0]
 [14  8 51]]
