In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn import tree

In [2]:
df = pd.read_csv('data/titanic.csv')

In [3]:
df.columns

Index(['row.names', 'pclass', 'survived', 'name', 'age', 'embarked',
       'home.dest', 'room', 'ticket', 'boat', 'sex'],
      dtype='object')

In [4]:
X_cols = ['pclass', 'name', 'age', 'embarked', 'home.dest', 'room', 'ticket', 'boat', 'sex']
y_cols = ['survived']

In [5]:
#working on some pre-preprocessing first
X_cols_selected = ['pclass', 'age', 'sex']

In [6]:
titanic_X = df[X_cols_selected]
titanic_y = df[y_cols]

Checking null of age column

In [7]:
print(titanic_X.shape)
titanic_X[titanic_X['age'].isnull() == True].count()

(1313, 3)


pclass    680
age         0
sex       680
dtype: int64

In [8]:
age = titanic_X['age'].copy()
age_mean = age.mean()
titanic_X['age'].fillna(age_mean, inplace=True)
titanic_X[titanic_X['age'].isnull() == True].count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


pclass    0
age       0
sex       0
dtype: int64

## Encoding 

### Labeling

In [9]:
enc = LabelEncoder()

label_encoder = enc.fit(titanic_X['sex'])

titanic_X['gender_labeled'] = label_encoder.transform(titanic_X['sex'])

titanic_X[['sex', 'gender_labeled']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,sex,gender_labeled
0,female,0
1,female,0
2,male,1
3,female,0
4,male,1


### One Hot Encoding

In [10]:
titanic_X['pclass'].unique()

array(['1st', '2nd', '3rd'], dtype=object)

In [11]:
one_hot_columns = pd.get_dummies(titanic_X['pclass'], prefix='pclass')
titanic_X = pd.concat([titanic_X,one_hot_columns], axis=1)
titanic_X.head()

Unnamed: 0,pclass,age,sex,gender_labeled,pclass_1st,pclass_2nd,pclass_3rd
0,1st,29.0,female,0,1,0,0
1,1st,2.0,female,0,1,0,0
2,1st,30.0,male,1,1,0,0
3,1st,25.0,female,0,1,0,0
4,1st,0.9167,male,1,1,0,0


## Training a Decision Tree

In [12]:
titanic_X.drop(['sex', 'pclass'], axis=1, inplace=True)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(titanic_X, titanic_y,  test_size=0.25, random_state=33)

In [14]:
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3,min_samples_leaf=5)
clf = clf.fit(X_train,y_train)

In [18]:
import pydot,StringIO
dot_data = StringIO.StringIO() 
tree.export_graphviz(clf, out_file=dot_data, 
    feature_names=['age','gender_label','pclass_1st', 'pclass_2nd', 'pclass_3rd']) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_png('titanic.png') 
from IPython.core.display import Image 
Image(filename='titanic.png')

ModuleNotFoundError: No module named 'pydot'

In [24]:
from sklearn import metrics

def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    
    y_pred=clf.predict(X)   
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")

    if (show_classification_report):
        print ("Classification report")
        print (metrics.classification_report(y,y_pred),"\n")
        
    if show_confusion_matrix:
        print ("Confussion matrix")
        print (metrics.confusion_matrix(y,y_pred),"\n")
        
        
measure_performance(X_train,y_train,clf, show_classification_report=False, show_confusion_matrix=False)

Accuracy:0.838 

