In [56]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [57]:
# Read the raw training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [58]:
# Clean the training frame: impute gaps, encode the categorical columns as
# integers, and drop the identifier / free-text columns not used as features.
train['Age'] = train['Age'].fillna(train['Age'].median())
train['Sex'] = train['Sex'].map({'male': 1, 'female': 0})

# `.map` leaves a missing port as NaN, which would then flow into the one-hot
# encoder and the tree. Fill any gap with the most frequent port first; this is
# a no-op when the column is already complete.
embarked_mode = train['Embarked'].mode()
if not embarked_mode.empty:
    train['Embarked'] = train['Embarked'].fillna(embarked_mode.iloc[0])
train['Embarked'] = train['Embarked'].map({'S': 3, 'C': 2, 'Q': 1})

# Rebind rather than mutate with inplace=True so the result is an explicit new frame.
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [59]:
# Clean the test frame in one chained expression: drop the identifier /
# free-text columns, fill missing Age and Fare with this frame's own medians,
# and encode Sex and Embarked with the same integer codes used for training.
test = (
    test
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        Fare=lambda frame: frame['Fare'].fillna(frame['Fare'].median()),
        Sex=lambda frame: frame['Sex'].map({'male': 1, 'female': 0}),
        Embarked=lambda frame: frame['Embarked'].map({'S': 3, 'C': 2, 'Q': 1}),
    )
)

In [19]:
# Display the training frame for a visual sanity check.
# NOTE(review): this cell's execution count is lower than that of the
# preprocessing cell above, so the rendered output may come from an older
# run -- re-run to confirm it reflects the current code.
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,3
1,1,1,0,38.0,1,0,71.2833,2
2,1,3,0,26.0,0,0,7.9250,3
3,1,1,0,35.0,1,0,53.1000,3
4,0,3,1,35.0,0,0,8.0500,3
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,3
887,1,1,0,19.0,0,0,30.0000,3
888,0,3,0,28.0,1,2,23.4500,3
889,1,1,1,26.0,0,0,30.0000,2


In [20]:
# Display the test frame for a visual sanity check.
# NOTE(review): this cell's execution count is lower than that of the
# preprocessing cell above, so the rendered output may come from an older
# run -- re-run to confirm it reflects the current code.
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0000,3
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,3
4,3,0,22.0,1,1,12.2875,3
...,...,...,...,...,...,...,...
413,3,1,27.0,0,0,8.0500,3
414,1,0,39.0,0,0,108.9000,2
415,3,1,38.5,0,0,7.2500,3
416,3,1,27.0,0,0,8.0500,3


In [60]:
# Separate model inputs from the target by column name rather than by position,
# so a change in column order raises a KeyError instead of silently training on
# the wrong target.
X_train = train.drop(columns=['Survived'])
Y_train = train['Survived']
# Every remaining test column is a feature; copy so later steps never alias `test`.
X_test = test.copy()

In [22]:
# Display the training feature frame (target column removed).
# NOTE(review): the execution count of this cell is lower than that of the cell
# that builds X_train, so the rendered output may be stale; the encoding cell
# further down also rebinds X_train to a NumPy array.
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.2500,3
1,1,0,38.0,1,0,71.2833,2
2,3,0,26.0,0,0,7.9250,3
3,1,0,35.0,1,0,53.1000,3
4,3,1,35.0,0,0,8.0500,3
...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,3
887,1,0,19.0,0,0,30.0000,3
888,3,0,28.0,1,2,23.4500,3
889,1,1,26.0,0,0,30.0000,2


In [23]:
# Display the target series.
# NOTE(review): the execution count of this cell is lower than that of the cell
# that builds Y_train, so the rendered output may be stale.
Y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [24]:
# Display the test feature frame.
# NOTE(review): the execution count of this cell is lower than that of the cell
# that builds X_test, so the rendered output may be stale; the encoding cell
# further down also rebinds X_test to a NumPy array.
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0000,3
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,3
4,3,0,22.0,1,1,12.2875,3
...,...,...,...,...,...,...,...
413,3,1,27.0,0,0,8.0500,3
414,1,0,39.0,0,0,108.9000,2
415,3,1,38.5,0,0,7.2500,3
416,3,1,27.0,0,0,8.0500,3


In [61]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns at positions 0 and 6 (Pclass and
# Embarked in the feature frame) and pass the remaining columns through.
#
# * The encoder is fitted on the training features only and then *applied* to
#   the test features, so both matrices have the same columns in the same
#   order. Refitting a second transformer on the test set would let the test
#   categories define their own layout, which can disagree with training.
# * handle_unknown='ignore' encodes a category never seen in training as all
#   zeros instead of raising at transform time.
# * sparse_threshold=0 forces a dense result; np.array() applied to a SciPy
#   sparse matrix yields a 0-d object array, not a 2-D feature matrix.
#
# NOTE: this cell rebinds X_train / X_test from DataFrames to arrays, so it
# must be run exactly once after the split cell.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0, 6])],
    remainder='passthrough',
    sparse_threshold=0,
)
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

In [51]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition identical between runs.
(data_tr, data_te,
 result_tr, result_te) = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=0,
)

In [52]:
# Shallow Gini-impurity tree, fitted on the training split and scored on the
# held-out split. `fit` returns the estimator itself, so the calls can be chained.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=0,
)
clf_gini_pred = clf_gini.fit(data_tr, result_tr).predict(data_te)

In [53]:
# Validation summary for the Gini tree, one item per line:
# confusion matrix, accuracy in percent, per-class precision/recall/F1.
print(
    confusion_matrix(result_te, clf_gini_pred),
    accuracy_score(result_te, clf_gini_pred) * 100,
    classification_report(result_te, clf_gini_pred),
    sep="\n",
)

[[96 14]
 [17 52]]
82.68156424581005
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       110
           1       0.79      0.75      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179



In [54]:
# Same shallow tree as the Gini variant but splitting on information gain,
# fitted on the training split and scored on the held-out split.
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=0,
)
clf_entropy_pred = clf_entropy.fit(data_tr, result_tr).predict(data_te)

In [55]:
# Validation summary for the entropy tree, one item per line:
# confusion matrix, accuracy in percent, per-class precision/recall/F1.
print(
    confusion_matrix(result_te, clf_entropy_pred),
    accuracy_score(result_te, clf_entropy_pred) * 100,
    classification_report(result_te, clf_entropy_pred),
    sep="\n",
)

[[96 14]
 [17 52]]
82.68156424581005
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       110
           1       0.79      0.75      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179



In [62]:
# Refit the Gini tree on every labelled row, then predict the unlabelled test
# features. `fit` returns the estimator itself, so the calls can be chained.
clf_gini_test = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=0,
)
clf_gini_test_pred = clf_gini_test.fit(X_train, Y_train).predict(X_test)

In [63]:
# Write the Gini-tree predictions for the test set to disk.
#
# The predictions alone carry no passenger identifier, and a bare
# DataFrame(...).to_csv() would emit only a positional index plus a column
# literally named "0". PassengerId was dropped from `test` during preprocessing,
# so it is re-read from the raw file; preprocessing only fills, maps and drops
# columns, so the rows are still in file order and align one-to-one with the
# predictions (a length mismatch raises ValueError here).
gini_submission = pd.DataFrame({
    'PassengerId': pd.read_csv("test.csv", usecols=['PassengerId'])['PassengerId'],
    'Survived': clf_gini_test_pred,
})
# index=False: the positional index is not data and should not become a column.
gini_submission.to_csv('ginisol.csv', index=False)

In [64]:
# Refit the entropy tree on every labelled row, then predict the unlabelled
# test features. `fit` returns the estimator itself, so the calls can be chained.
clf_entropy_test = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=0,
)
clf_entropy_test_pred = clf_entropy_test.fit(X_train, Y_train).predict(X_test)

In [65]:
# Write the entropy-tree predictions for the test set to disk.
#
# The predictions alone carry no passenger identifier, and a bare
# DataFrame(...).to_csv() would emit only a positional index plus a column
# literally named "0". PassengerId was dropped from `test` during preprocessing,
# so it is re-read from the raw file; preprocessing only fills, maps and drops
# columns, so the rows are still in file order and align one-to-one with the
# predictions (a length mismatch raises ValueError here).
entropy_submission = pd.DataFrame({
    'PassengerId': pd.read_csv("test.csv", usecols=['PassengerId'])['PassengerId'],
    'Survived': clf_entropy_test_pred,
})
# index=False: the positional index is not data and should not become a column.
entropy_submission.to_csv('entropysol.csv', index=False)