### Step 1: Load the dataset and analyse it

In [1]:
import numpy as np
import pandas as pd
# Read the training split from disk and preview its first five rows.
train_data = pd.read_csv("Train.csv")
train_data.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [2]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
pclass       1009 non-null float64
survived     1009 non-null float64
name         1009 non-null object
sex          1009 non-null object
age          812 non-null float64
sibsp        1009 non-null float64
parch        1009 non-null float64
ticket       1009 non-null object
fare         1008 non-null float64
cabin        229 non-null object
embarked     1008 non-null object
boat         374 non-null object
body         98 non-null float64
home.dest    582 non-null object
dtypes: float64(7), object(7)
memory usage: 110.4+ KB


### Step 2: Clean the data and fill NaN values

In [3]:
# Columns that are not used as model features are removed up front.
columns_to_drop = [
    "name",
    "ticket",
    "cabin",
    "embarked",
    "boat",
    "body",
    "home.dest",
]
train_data_clean = train_data.drop(columns=columns_to_drop)
train_data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


In [4]:
from sklearn.preprocessing import LabelEncoder

# Learn the string -> integer mapping for "sex" first, then apply it to the
# same column so the feature becomes numeric.
le = LabelEncoder().fit(train_data_clean["sex"])
train_data_clean["sex"] = le.transform(train_data_clean["sex"])

In [5]:
# Preview the encoded frame, then summarise dtypes and non-null counts.
print(train_data_clean.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
train_data_clean.info()

   pclass  survived  sex   age  sibsp  parch    fare
0     3.0       0.0    0   NaN    0.0    0.0   7.750
1     2.0       0.0    1  39.0    0.0    0.0  26.000
2     2.0       1.0    0  40.0    0.0    0.0  13.000
3     3.0       1.0    0  31.0    1.0    1.0  20.525
4     3.0       1.0    0   NaN    2.0    0.0  23.250
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
pclass      1009 non-null float64
survived    1009 non-null float64
sex         1009 non-null int64
age         812 non-null float64
sibsp       1009 non-null float64
parch       1009 non-null float64
fare        1008 non-null float64
dtypes: float64(6), int64(1)
memory usage: 55.3 KB
None


In [6]:
# Fill NaN values column by column.
# DataFrame.fillna(scalar) writes that one scalar into every column, so the
# missing "fare" entry would otherwise be filled with the mean *age*.
# Passing a dict scopes each fill value to its own column.
nan_fill_values = {
    "age": train_data_clean["age"].mean(),
    "fare": train_data_clean["fare"].mean(),
}
train_data_clean = train_data_clean.fillna(value=nan_fill_values)

In [7]:
# Confirm that no column has missing values after the fill step.
train_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
pclass      1009 non-null float64
survived    1009 non-null float64
sex         1009 non-null int64
age         1009 non-null float64
sibsp       1009 non-null float64
parch       1009 non-null float64
fare        1009 non-null float64
dtypes: float64(6), int64(1)
memory usage: 55.3 KB


### Step 3: Implement the entropy function

In [9]:
def entropy(col):
    """Return the Shannon entropy (base 2) of the values in ``col``.

    Parameters
    ----------
    col : array-like exposing ``.shape`` (e.g. a NumPy array or pandas Series)
        The labels whose distribution is measured.

    Returns
    -------
    float
        Sum over distinct values of ``-p * log2(p)``, where ``p`` is the
        fraction of entries equal to that value. An empty input yields 0.0.
    """
    _, frequencies = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    # Accumulate each distinct value's contribution in the order np.unique
    # reports them.
    return sum(
        (-1.0 * (freq / total) * np.log2(freq / total) for freq in frequencies),
        0.0,
    )

### Step 4: Implement information gain

In [10]:
#divide data based on one feature according to its threshold value
def divide_data(x_data,fkey,fval):
    """Split ``x_data`` into two frames around a threshold on one column.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to partition.
    fkey : str
        Name of the column to compare.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``x_data[fkey] > fval``; ``x_left``
        holds every other row (including rows where the value is NaN, since
        ``NaN > fval`` is False). Both keep the original columns, dtypes and
        index labels.
    """
    # A boolean mask replaces the row-by-row DataFrame.append loop:
    # DataFrame.append was removed in pandas 2.0, growing a frame one row at a
    # time is quadratic, and the mask does not require a 0..n-1 index.
    goes_right = x_data[fkey] > fval
    x_left = x_data[~goes_right]
    x_right = x_data[goes_right]

    return x_left, x_right

In [11]:
#Information Gain function
def information_gain(x_data,fkey,fval):
    """Information gain on ``survived`` from splitting ``x_data`` at ``fval``.

    The rows are partitioned with ``divide_data`` on column ``fkey``. The gain
    is the entropy of ``x_data.survived`` minus the size-weighted entropies of
    the two partitions.

    Returns the sentinel ``-10000000`` when either partition is empty, so such
    a split never wins an argmax over candidate features.
    """
    left_part, right_part = divide_data(x_data, fkey, fval)

    n_total = x_data.shape[0]
    n_left = left_part.shape[0]
    n_right = right_part.shape[0]

    # Share of the samples that landed on each side of the threshold.
    weight_left = float(n_left) / n_total
    weight_right = float(n_right) / n_total

    # A split that sends everything to one side carries no information.
    if n_left == 0 or n_right == 0:
        return -10000000 #Minimum information gain

    parent_entropy = entropy(x_data.survived)
    children_entropy = (
        weight_left*entropy(left_part.survived) + weight_right*entropy(right_part.survived)
    )
    return parent_entropy - children_entropy

In [14]:
class DecisionTree:
    """Recursive binary decision tree over a fixed set of feature columns.

    Every node stores the feature (``fkey``) and threshold (``fval``) it splits
    on, the majority ``survived`` label of its training rows (``target``), and
    optional ``left`` / ``right`` child nodes. Rows whose feature value exceeds
    the threshold go right; all others go left.
    """

    def __init__(self,depth=0,max_depth=5):
        """Create an untrained node at ``depth`` with a ``max_depth`` limit."""
        # Depth bookkeeping used by train() to decide when to stop growing.
        self.depth = depth
        self.max_depth = max_depth
        # Split description and label; filled in by train().
        self.fkey = None
        self.fval = None
        self.target = None
        # Child nodes; they stay None on a leaf.
        self.left = None
        self.right = None

    def train(self,X_train):
        """Fit this node (and, recursively, its children) on ``X_train``.

        ``X_train`` must contain the feature columns listed below plus a
        ``survived`` column. The candidate threshold for each feature is that
        feature's mean in ``X_train``; the feature with the highest information
        gain is chosen.
        """
        features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
        info_gains = [
            information_gain(X_train, name, X_train[name].mean())
            for name in features
        ]

        best_index = np.argmax(info_gains)
        self.fkey = features[best_index]
        self.fval = X_train[self.fkey].mean()
        print("Making Tree with Feature:",self.fkey)

        # Partition the rows on the chosen split and renumber each side.
        data_left, data_right = divide_data(X_train, self.fkey, self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        # Every node, leaf or internal, records the majority label of its rows.
        self.target = 1.0 if X_train.survived.mean() >= 0.5 else 0.0

        # Leaf: the split left one side empty, or the depth limit was reached.
        one_sided = data_left.shape[0] == 0 or data_right.shape[0] == 0
        if one_sided or self.depth >= self.max_depth:
            return

        # Internal node: grow both subtrees one level deeper.
        child_depth = self.depth + 1
        self.left = DecisionTree(depth=child_depth, max_depth=self.max_depth)
        self.left.train(data_left)
        self.right = DecisionTree(depth=child_depth, max_depth=self.max_depth)
        self.right.train(data_right)
        return

    def predict(self,test):
        """Return the predicted label for one sample.

        ``test`` is indexed by feature name (e.g. a DataFrame row). The sample
        is routed right when its value for ``fkey`` exceeds ``fval`` and left
        otherwise; when the chosen child does not exist, this node's ``target``
        is returned.
        """
        branch = self.right if test[self.fkey] > self.fval else self.left
        if branch is None:
            return self.target
        return branch.predict(test)

In [15]:
# Build the tree with the class defaults spelled out, then fit it on the
# cleaned training frame.
dt = DecisionTree(depth=0, max_depth=5)
dt.train(train_data_clean)

Making Tree with Feature: sex
Making Tree with Feature: pclass
Making Tree with Feature: pclass
Making Tree with Feature: fare
Making Tree with Feature: sibsp
Making Tree with Feature: age
Making Tree with Feature: age
Making Tree with Feature: sibsp
Making Tree with Feature: age
Making Tree with Feature: sibsp
Making Tree with Feature: parch
Making Tree with Feature: fare
Making Tree with Feature: sibsp
Making Tree with Feature: fare
Making Tree with Feature: age
Making Tree with Feature: age
Making Tree with Feature: parch
Making Tree with Feature: parch
Making Tree with Feature: sibsp
Making Tree with Feature: fare
Making Tree with Feature: fare
Making Tree with Feature: fare
Making Tree with Feature: age
Making Tree with Feature: sibsp
Making Tree with Feature: sibsp
Making Tree with Feature: fare
Making Tree with Feature: fare
Making Tree with Feature: age
Making Tree with Feature: fare
Making Tree with Feature: fare
Making Tree with Feature: fare
Making Tree with Feature: age
Mak

In [16]:
# Show the root split (feature, then threshold) and the features chosen by
# its two children.
print(dt.fkey, dt.fval, sep="\n")
print(dt.left.fkey, dt.right.fkey)

sex
0.6422200198216056
pclass fare


In [17]:
# Read the test split from disk and preview its first five rows.
test_data = pd.read_csv("Test.csv")
test_data.head(5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Flynn, Mr. John Irwin (""Irving"")",male,36.0,0.0,0.0,PC 17474,26.3875,E25,S,5.0,,"Brooklyn, NY"
1,3.0,"Sage, Miss. Constance Gladys",female,,8.0,2.0,CA. 2343,69.55,,S,,,
2,1.0,"Rood, Mr. Hugh Roscoe",male,,0.0,0.0,113767,50.0,A32,S,,,"Seattle, WA"
3,2.0,"Gillespie, Mr. William Henry",male,34.0,0.0,0.0,12233,13.0,,S,,,"Vancouver, BC"
4,2.0,"Collander, Mr. Erik Gustaf",male,28.0,0.0,0.0,248740,13.0,,S,,,"Helsinki, Finland Ashtabula, Ohio"


In [18]:
# Summarise column dtypes and non-null counts of the test split.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 13 columns):
pclass       300 non-null float64
name         300 non-null object
sex          300 non-null object
age          234 non-null float64
sibsp        300 non-null float64
parch        300 non-null float64
ticket       300 non-null object
fare         300 non-null float64
cabin        66 non-null object
embarked     299 non-null object
boat         112 non-null object
body         23 non-null float64
home.dest    163 non-null object
dtypes: float64(6), object(7)
memory usage: 30.5+ KB


In [19]:
# Remove the same non-feature columns that were dropped from the training data.
columns_to_drop = [
    "name",
    "ticket",
    "cabin",
    "embarked",
    "boat",
    "body",
    "home.dest",
]
test_data_clean = test_data.drop(columns=columns_to_drop)
test_data_clean.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
0,1.0,male,36.0,0.0,0.0,26.3875
1,3.0,female,,8.0,2.0,69.55
2,1.0,male,,0.0,0.0,50.0
3,2.0,male,34.0,0.0,0.0,13.0
4,2.0,male,28.0,0.0,0.0,13.0


In [20]:
# Reuse the LabelEncoder that was fitted on the training data rather than
# fitting a fresh one on the test split: a refit derives its mapping from the
# test values alone, so the codes could differ from the ones the tree was
# trained on. transform() raises on a label unseen in training instead of
# silently remapping it.
# Reading from the raw test_data frame keeps this cell safe to re-run.
test_data_clean["sex"] = le.transform(test_data["sex"])

In [21]:
# Preview the encoded test frame, then summarise dtypes and non-null counts.
print(test_data_clean.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
test_data_clean.info()

   pclass  sex   age  sibsp  parch     fare
0     1.0    1  36.0    0.0    0.0  26.3875
1     3.0    0   NaN    8.0    2.0  69.5500
2     1.0    1   NaN    0.0    0.0  50.0000
3     2.0    1  34.0    0.0    0.0  13.0000
4     2.0    1  28.0    0.0    0.0  13.0000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
pclass    300 non-null float64
sex       300 non-null int64
age       234 non-null float64
sibsp     300 non-null float64
parch     300 non-null float64
fare      300 non-null float64
dtypes: float64(5), int64(1)
memory usage: 14.1 KB
None


In [22]:
# Fill NaN values column by column.
# DataFrame.fillna(scalar) would write one scalar (the mean age) into every
# column that has gaps; a dict scopes each fill value to its own column.
# The fill values come from the raw training data so that test rows are
# imputed with the same statistics the model was trained around.
nan_fill_values = {
    "age": train_data["age"].mean(),
    "fare": train_data["fare"].mean(),
}
test_data_clean = test_data_clean.fillna(value=nan_fill_values)
test_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
pclass    300 non-null float64
sex       300 non-null int64
age       300 non-null float64
sibsp     300 non-null float64
parch     300 non-null float64
fare      300 non-null float64
dtypes: float64(5), int64(1)
memory usage: 14.1 KB


In [23]:
# Route every test row through the trained tree, one row at a time.
n_test_rows = test_data_clean.shape[0]
y_pred = [dt.predict(test_data_clean.loc[row_id]) for row_id in range(n_test_rows)]

In [24]:
# Store the predictions in a single "survived" column, label the index "Id",
# and write the result (index included) to submit.csv.
df = pd.DataFrame({"survived": y_pred}).rename_axis("Id")
df.to_csv("submit.csv")

## Decision Tree Sklearn Implementation

In [25]:
from sklearn.tree import DecisionTreeClassifier

# input_cols / output_cols are not defined by any earlier cell shown in this
# notebook, so a fresh "Restart & Run All" would stop here with a NameError.
# Define them in this cell; the features match the list used by DecisionTree.
input_cols = ["pclass", "sex", "age", "sibsp", "parch", "fare"]
# A single column name selects a 1-D Series, which is the label shape
# scikit-learn expects for a single-output classifier.
output_cols = "survived"

# A fixed random_state makes the fitted tree reproducible between runs.
sk_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
sk_tree.fit(train_data_clean[input_cols], train_data_clean[output_cols])
y_pred = sk_tree.predict(test_data_clean[input_cols])

In [26]:
# Store the predictions in a single "survived" column, label the index "Id",
# and write the result (index included) to submit.csv.
df = pd.DataFrame({"survived": y_pred}).rename_axis("Id")
df.to_csv("submit.csv")

## Random Forest Implementation

In [27]:
#Random Forest
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# input_cols / output_cols are not defined by any earlier cell shown in this
# notebook, so a fresh "Restart & Run All" would stop here with a NameError.
# Define them in this cell; the features match the list used by DecisionTree.
input_cols = ["pclass", "sex", "age", "sibsp", "parch", "fare"]
# A single column name selects a 1-D Series, which is the label shape
# scikit-learn expects for a single-output classifier.
output_cols = "survived"

# A random forest is stochastic (bootstrap samples, random feature subsets);
# a fixed random_state makes the predictions reproducible between runs.
rf = RandomForestClassifier(n_estimators=30, criterion="entropy", random_state=42)
rf.fit(train_data_clean[input_cols], train_data_clean[output_cols])
y_pred = rf.predict(test_data_clean[input_cols])

  """


In [28]:
# Store the predictions in a single "survived" column, label the index "Id",
# and write the result (index included) to submit.csv.
df = pd.DataFrame({"survived": y_pred}).rename_axis("Id")
df.to_csv("submit.csv")