In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
df1 = pd.read_csv('./Train.csv')

In [3]:
df1.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
pclass       1009 non-null float64
survived     1009 non-null float64
name         1009 non-null object
sex          1009 non-null object
age          812 non-null float64
sibsp        1009 non-null float64
parch        1009 non-null float64
ticket       1009 non-null object
fare         1008 non-null float64
cabin        229 non-null object
embarked     1008 non-null object
boat         374 non-null object
body         98 non-null float64
home.dest    582 non-null object
dtypes: float64(7), object(7)
memory usage: 110.4+ KB


In [5]:
useful_cols = ['pclass','survived','sex','age','sibsp','parch','fare']

In [6]:
cleaned_doc = df1[useful_cols]

In [7]:
avg_age = cleaned_doc['age'].mean()

In [8]:
cleaned_doc=cleaned_doc.fillna(avg_age)

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
le = LabelEncoder()

In [11]:
cleaned_doc['sex'] = le.fit_transform(cleaned_doc['sex'])

In [12]:
cleaned_doc.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,29.838978,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,29.838978,2.0,0.0,23.25


In [13]:
Y = cleaned_doc['survived']
X  = cleaned_doc[['pclass','sex','age','sibsp','parch','fare']]
print(X.shape,Y.shape)

(1009, 6) (1009,)


In [14]:
###Calculating Entropy
def entropy(cols):
    counts = np.unique(cols,return_counts=True)[1]
    
    N = float(cols.shape[0])
    
    ent = 0.0
    
    for ix in counts:
        pi = ix/N
        ent+=(-1.0*pi*np.log2(pi))
       
    
    return ent
        


In [15]:
###Dividing datasets on the basis of given col
def divide_data(X,fkey,fval):
    x_left = pd.DataFrame([],columns=X.columns)
    x_right = pd.DataFrame([],columns=X.columns)
    
    for i in range(X.shape[0]):
        val = X.iloc[i][fkey] 
        if val<=fval:
            x_left = x_left.append(X.iloc[i],ignore_index=True)
        else:
            x_right = x_right.append(X.iloc[i],ignore_index=True)
    
    return x_left,x_right

In [16]:
### Information Gain
def info_gain(X,fkey,fval):
    x_left,x_right = divide_data(X,fkey,fval)
    fraction_left = float(x_left.shape[0])/X.shape[0]
    fraction_right = float(x_right.shape[0])/X.shape[0]
    
    ###If one side is empty then return the least information gain
    if x_left.shape[0]==0 or x_right.shape[0]==0:
        return -1000000
    return entropy(X.survived) -(fraction_left*entropy(x_left.survived)+fraction_right*entropy(x_right.survived))


In [17]:
for fx in cleaned_doc.columns:
    print(fx)
    print(info_gain(cleaned_doc,fx,cleaned_doc[fx].mean()))

pclass
0.055456910002982474
survived
0.9570419991692567
sex
0.19274737190850932
age
0.0010525742338489685
sibsp
0.006492394392888956
parch
0.01975608012294816
fare
0.04242793401428169


In [110]:
class decisionTree():
    def __init__(self,depth=0,max_depth=4):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.depth = depth
        self.target = None
        self.max_depth = max_depth
    
    def fit(self,X):
        feature = ['pclass','sex','age','sibsp','parch','fare']
        info = [] 
        for features in feature:
            info.append(info_gain(X,features,X[features].mean()))
        best_feature = np.argmax(info)
        self.fkey = feature[best_feature]
        self.fval = X[self.fkey].mean()
        
        print("the root is "+self.fkey)
        
        
        ###Split Data
        x_left,x_right = divide_data(X,self.fkey,self.fval)
        
        ###IF the node is leave node
        if x_left.shape[0]==0 or x_right.shape[0]==0:
            if X.survived.mean()>=0.5:
                self.target = "survived"
            else:
                self.target = "dead"
            return
        
        ###IF max depth has been reached
        
        if self.depth>=self.max_depth:
            if X.survived.mean()>=0.5:
                self.target = "survived"
            else:
                self.target = "dead"
            return
        
        self.left = decisionTree(depth=self.depth+1,max_depth = self.max_depth)
        self.left.fit(x_left)
        
        self.right = decisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.fit(x_right)
        
        if X.survived.mean()>=0.5:
            self.target = 'survived'
        else:
            self.target = 'dead'
        
        
    def predict(self,X_test):
        if self.fval>=X_test[self.fkey]:
            if self.right is None:
                return self.target
            return self.right.predict(X_test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(X_test)
        
        
        
        
    
        
        

In [111]:
dT = decisionTree()
dT.fit(train_data)

the root is sex
the root is pclass
the root is parch
the root is fare
the root is fare
the root is age
the root is fare
the root is pclass
the root is age
the root is sibsp
the root is fare
the root is fare
the root is age
the root is fare
the root is parch
the root is fare
the root is fare
the root is parch
the root is sibsp
the root is age
the root is age
the root is fare
the root is parch
the root is age
the root is fare
the root is age
the root is sibsp
the root is pclass
the root is pclass
the root is fare
the root is pclass


In [112]:
split = int(0.7*cleaned_doc.shape[0])
train_data = cleaned_doc[:split]
test_data = cleaned_doc[split:]
test_data = test_data.reset_index(drop=True)

In [113]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dT.predict(test_data.loc[ix]))

y_actual = test_data['survived']
y_pred

['survived',
 'dead',
 'survived',
 'dead',
 'survived',
 'dead',
 'survived',
 'dead',
 'dead',
 'dead',
 'survived',
 'dead',
 'dead',
 'survived',
 'survived',
 'survived',
 'survived',
 'survived',
 'dead',
 'survived',
 'dead',
 'survived',
 'dead',
 'survived',
 'dead',
 'survived',
 'survived',
 'survived',
 'dead',
 'dead',
 'dead',
 'survived',
 'survived',
 'dead',
 'dead',
 'survived',
 'dead',
 'survived',
 'survived',
 'survived',
 'dead',
 'dead',
 'dead',
 'survived',
 'survived',
 'survived',
 'survived',
 'survived',
 'dead',
 'dead',
 'survived',
 'survived',
 'survived',
 'dead',
 'dead',
 'dead',
 'survived',
 'dead',
 'survived',
 'survived',
 'survived',
 'survived',
 'dead',
 'dead',
 'survived',
 'survived',
 'survived',
 'survived',
 'survived',
 'survived',
 'dead',
 'survived',
 'survived',
 'dead',
 'dead',
 'dead',
 'survived',
 'survived',
 'survived',
 'dead',
 'survived',
 'dead',
 'survived',
 'survived',
 'survived',
 'survived',
 'survived',
 'dead',


In [114]:
le = LabelEncoder()
y_pred = le.fit_transform(y_pred)

In [115]:
acc = np.sum(y_pred==y_actual)/y_pred.shape[0]
acc

0.42244224422442245

In [51]:
acc = np.sum(np.array(y_pred)==np.array(y_actual))/y_pred.shape[0]

In [52]:
acc

0.32673267326732675

# Using Sc-Kit for decision tree 

In [53]:
from sklearn.tree import DecisionTreeClassifier

In [106]:
dt_classifier = DecisionTreeClassifier(criterion='gini',max_depth=5)

In [107]:
dt_classifier.fit(train_data[['pclass','sex','age','sibsp','parch','fare']],train_data['survived'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [108]:
prediction = dt_classifier.predict(test_data[['pclass','sex','age','sibsp','parch','fare']])

In [109]:
dt_classifier.score(test_data[['pclass','sex','age','sibsp','parch','fare']],test_data['survived'])

0.7755775577557755