In [52]:
import pandas as pd

## Reading the dataset

In [53]:
df = pd.read_csv('titanic.csv')

In [54]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest
0,1,1,Allen Miss. Elisabeth Walton,female,29.0,0,0,24160,211.3375,B5,S,2.0,,St Louis MO
1,1,1,Allison Master. Hudson Trevor,male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,Montreal PQ / Chesterville ON
2,1,0,Allison Miss. Helen Loraine,female,2.0,1,2,113781,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON
3,1,0,Allison Mr. Hudson Joshua Creighton,male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,Montreal PQ / Chesterville ON
4,1,0,Allison Mrs. Hudson J C (Bessie Waldo Daniels),female,25.0,1,2,113781,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON


## Dropping Unwanted Columns

In [55]:
colsToDrop = ['name','cabin','boat','body','home_dest','ticket']

In [56]:
df.drop(colsToDrop, inplace=True,axis=1)

In [58]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


## Changing the datatypes

In [59]:
df.dtypes

pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object

In [60]:
df['pclass'] = df['pclass'].astype('category')
df['sex'] = df['sex'].astype('category')
df['embarked'] = df['embarked'].astype('category')
df['survived'] = df['survived'].astype('category')

In [61]:
df.dtypes

pclass      category
survived    category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

## Handling Missing Information

In [64]:
df.isnull().sum()

pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [14]:
df.sibsp.isnull().sum()

0

In [65]:
for col in df:
    if df[col].isnull().sum() > 0:
        if str(df[col].dtype) == 'category':
            df[col] = df[col].fillna(value = df[col].mode()[0])
        else:
            #print(col)
            df[col] = df[col].fillna(value = df[col].mean())

In [66]:
df.isnull().sum().sum()

0

## Seperating Target Column

In [67]:
y = df['survived']

In [68]:
X = df.drop(['survived'],axis = 1)

In [69]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,1,female,29.0,0,0,211.3375,S
1,1,male,0.9167,1,2,151.55,S
2,1,female,2.0,1,2,151.55,S
3,1,male,30.0,1,2,151.55,S
4,1,female,25.0,1,2,151.55,S


In [70]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, int64): [0, 1]

In [22]:
X.dtypes

pclass      category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

## Converting Categorical to Numeric

In [71]:
X.dtypes

pclass      category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

In [72]:
X = pd.get_dummies(X,drop_first=True)

In [73]:
X.head()

Unnamed: 0,age,sibsp,parch,fare,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S
0,29.0,0,0,211.3375,0,0,0,0,1
1,0.9167,1,2,151.55,0,0,1,0,1
2,2.0,1,2,151.55,0,0,0,0,1
3,30.0,1,2,151.55,0,0,1,0,1
4,25.0,1,2,151.55,0,0,0,0,1


## Kfold

In [74]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score,recall_score

In [92]:
clf = DecisionTreeClassifier(max_depth=4,min_samples_leaf=5)

In [100]:
kf = KFold(n_splits = 4,shuffle=True)

In [101]:
prec = []
rec = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train,y_train)
    preds = clf.predict(X_test)
    prec_Score = precision_score(y_test,preds)
    rec_Score = recall_score(y_test,preds)
    prec.append(prec_Score)
    rec.append(rec_Score)
    

In [102]:
print(np.mean(prec), np.std(prec))
print(np.mean(rec), np.std(rec))

0.787586411026 0.0650497866955
0.66792365307 0.0692843507318


## Stratified Kfold

In [96]:
kf = StratifiedKFold(n_splits = 4,shuffle=True)

In [98]:
prec = []
rec = []
for train_index, test_index in kf.split(X,y):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train,y_train)
    preds = clf.predict(X_test)
    prec_Score = precision_score(y_test,preds)
    rec_Score = recall_score(y_test,preds)
    prec.append(prec_Score)
    rec.append(rec_Score)
    

In [99]:
print(np.mean(prec), np.std(prec))
print(np.mean(rec), np.std(rec))

0.811841057493 0.0321566839662
0.624 0.0944880944881


## CrossVal Score

In [103]:
from sklearn.model_selection import cross_val_score

In [105]:
cross_val_score(clf,X,y,cv=4,n_jobs=-1,scoring='f1')

array([ 0.65789474,  0.7826087 ,  0.66949153,  0.22368421])

## Train Test Split

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test  = train_test_split(X,y,test_size=0.3, random_state=123)

In [35]:
X.shape

(1309, 9)

In [36]:
X_train.shape

(916, 9)

In [40]:
X_test.shape

(393, 15)

## Model Building

In [31]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [32]:
clf= DecisionTreeClassifier(criterion='gini',max_depth=4)

In [51]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [52]:
preds = clf.predict(X_test)

In [53]:
preds

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0,

In [33]:
from sklearn.metrics import classification_report,confusion_matrix

In [49]:
confusion_matrix(y_test,preds)

array([[201,  43],
       [ 39, 110]], dtype=int64)

In [54]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.83      0.89      0.86       244
          1       0.80      0.70      0.75       149

avg / total       0.82      0.82      0.82       393



In [37]:
clf = RandomForestClassifier()#n_estimators=10,max_depth=

In [57]:
clf = AdaBoostClassifier(n_estimators=50)

In [72]:
clf = GradientBoostingClassifier(n_estimators=1000,verbose=True,max_depth=4,min_samples_leaf=5)

In [58]:
clf.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

## Visualize the tree

In [55]:
with open("decisiontree.dot", 'w') as f:
    f = export_graphviz(clf, out_file=f,feature_names=X.columns.values,filled=True, rounded=True,special_characters=True,class_names=['0','1'], proportion=True)

type the following code in the command line

dot -Tpng decisiontree.dot -o outfile.png