In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [9]:
# Public Titanic passenger CSV hosted in the datasciencedojo/datasets repo.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
# Download and parse the CSV straight into a DataFrame (requires network access).
titanicData = pd.read_csv(filepath_or_buffer=url)

In [10]:
# Preview the first five rows to sanity-check the loaded columns.
titanicData.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
# Columns used as model inputs, plus the two groups that are preprocessed
# differently (numeric vs. categorical).
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex']
features = ['Pclass', 'Sex', 'Age', 'Fare']

# Inputs are the selected feature columns; the target is kept as a
# one-column DataFrame holding 'Survived'.
x = titanicData.loc[:, features]
y = titanicData.loc[:, ['Survived']]

print(x.head())
print(y.head())

   Pclass     Sex   Age     Fare
0       3    male  22.0   7.2500
1       1  female  38.0  71.2833
2       3  female  26.0   7.9250
3       1  female  35.0  53.1000
4       3    male  35.0   8.0500
   Survived
0         0
1         1
2         1
3         1
4         0


In [23]:
# Count distinct values per categorical column before one-hot encoding them.
x.loc[:, categorical_features].nunique()

Pclass    3
Sex       2
dtype: int64

In [25]:
# Numeric columns: replace missing entries with the column mean.
mean_imputer = SimpleImputer(strategy='mean')
# Categorical columns: expand each category into its own indicator column.
one_hot_encoder = OneHotEncoder()

# Route each column group to its transformer; the output places the 'num'
# columns first, followed by the 'cat' indicator columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', mean_imputer, numeric_features),
        ('cat', one_hot_encoder, categorical_features),
    ]
)

In [29]:
# Fit the preprocessing on the selected features and apply it in one step.
# NOTE: this fits on every row of `x`, including rows that a later
# train/test split will hold out for evaluation.
X = preprocessor.fit_transform(X=x)
# Display the transformed feature matrix.
X


array([[22.        ,  7.25      ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [38.        , 71.2833    ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [26.        ,  7.925     ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [29.69911765, 23.45      ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [26.        , 30.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [32.        ,  7.75      ,  0.        , ...,  1.        ,
         0.        ,  1.        ]])

In [30]:
# Split the *raw* features into training and testing sets first, then fit the
# preprocessing on the training portion only. Fitting the imputer/encoder on
# the full dataset before splitting lets statistics from the test rows (e.g.
# the mean used to fill missing 'Age' values) leak into training and makes the
# test score optimistic.
#
# test_size=0.2 holds out 20% of the rows; random_state=42 fixes the shuffle,
# so the row partition is the same as when splitting the transformed matrix.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn imputation means and one-hot categories from the training rows only...
x_train = preprocessor.fit_transform(x_train_raw)
# ...and reuse those fitted parameters, unchanged, on the held-out rows.
# NOTE(review): OneHotEncoder() raises on categories unseen during fit; this
# assumes every 'Pclass'/'Sex' value also occurs in the training split.
x_test = preprocessor.transform(x_test_raw)

In [32]:
# Fit a decision tree on the training split; the fixed seed makes the
# learned tree reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X=x_train, y=y_train)

In [40]:
# Predict on the same rows the tree was fitted on (the training split).
y_pred = clf.predict(x_train)

# Share of training labels the model reproduces, reported as a percentage.
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred)
print('Accuracy: {:.2f}%'.format(100 * accuracy))

Accuracy: 97.89%


In [41]:
# Predict on the held-out test split, which the tree did not see during fitting.
y_pred = clf.predict(x_test)

# Share of test labels predicted correctly, reported as a percentage.
# Compare with the training-set figure: a large gap suggests overfitting.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: {:.2f}%'.format(100 * accuracy))

Accuracy: 74.86%
