In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.metrics import confusion_matrix,precision_recall_curve,accuracy_score,mean_absolute_error
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

In [4]:
# Google Drive id of the Titanic CSV. Both URLs below are built from this one
# value so the share link and the download link cannot drift apart.
drive_file_id = "1jnyDD_KVesjmuVPEFKthkcgNZ6QOcqCt"
# Share-page URL, kept for provenance only; nothing below reads it.
link = f"https://drive.google.com/file/d/{drive_file_id}/view?usp=drive_link"
# URL form that pd.read_csv fetches the file from.
path = f"https://drive.google.com/uc?id={drive_file_id}"
titanic = pd.read_csv(path)

In [5]:
# Class balance of the target: number of rows per `survived` value.
titanic['survived'].value_counts()

0    384
1    243
Name: survived, dtype: int64

## Data Preprocessing

In [6]:
# Feature table: everything except `Unnamed: 0` (presumably a saved row index
# from the CSV export; confirm), `deck`, and the target column itself.
dropped_titanic = titanic.drop(columns=['Unnamed: 0', 'deck', 'survived'])
numeric_cols = ['age', 'fare']
# Every column other than age/fare is treated as categorical and one-hot encoded.
cat_titanic = dropped_titanic.drop(columns=numeric_cols)

# NOTE(review): the encoder and the scaler are fitted on the full dataset, i.e.
# before the train/test split, so test rows contribute to the fitted statistics.
# Fitting them on the training partition only would avoid that.
encoder = OneHotEncoder()
one_hot_encoded = encoder.fit_transform(cat_titanic)
# Reuse titanic's index on both frames so the column-wise concat below, and the
# pairing of X with y, align on the source rows rather than on two fresh
# RangeIndexes that only match while titanic keeps its default index.
one_hot_df = pd.DataFrame.sparse.from_spmatrix(
    one_hot_encoded,
    index=titanic.index,
    columns=encoder.get_feature_names_out(list(cat_titanic)),
)

scaler = StandardScaler()
scaled_titanic = scaler.fit_transform(dropped_titanic[numeric_cols])
scaled_df = pd.DataFrame(scaled_titanic, columns=numeric_cols, index=titanic.index)

# X: standardized age/fare followed by the sparse one-hot columns; y: the target.
X = pd.concat([scaled_df, one_hot_df], axis=1)
y = titanic['survived']

In [7]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [8]:
# Baseline: a decision tree with default (unconstrained) hyperparameters.
# The seed is fixed because the tree randomly permutes candidate features at
# each split, so an unseeded fit can give different trees on different runs.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)



### Accuracy on training set

In [9]:
# Accuracy of the baseline tree on the data it was fitted on.
y_train_pred = tree_clf.predict(X_train)
# accuracy_score takes (y_true, y_pred); the value is the same either way, but
# this order matches sklearn's signature and the grid-search cells.
accuracy_score(y_train, y_train_pred)



0.9840182648401826

### Accuracy on testing set

In [10]:
# Accuracy of the baseline tree on the held-out partition.
y_test_pred = tree_clf.predict(X_test)
# accuracy_score takes (y_true, y_pred); the value is the same either way, but
# this order matches sklearn's signature and the grid-search cells.
accuracy_score(y_test, y_test_pred)



0.7142857142857143

In [11]:
# Seeded, unfitted Gini tree; passed to GridSearchCV as the base estimator.
tree2 = DecisionTreeClassifier(
    criterion='gini',
    random_state=42,
)

### Grid search

In [12]:
# First search: wide ranges for depth, leaf size and features tried per split.
# Train and test features are both converted to CSR so the estimator receives
# the same input type at fit and predict time. Fitting on CSR but predicting on
# the sparse-column DataFrame makes sklearn warn about feature names and
# densify the frame implicitly.
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)
param_grid = [{
    'max_depth': [5, 10, 15, 20],
    'min_samples_leaf': [3, 5, 7, 9],
    'max_features': [2, 3, 4, 5],
}]
grid_search = GridSearchCV(tree2, param_grid, cv=5)
grid_search.fit(X_train_sparse, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_sparse)

# NOTE(review): X_test is scored in every search cell. If these scores guide
# the choice of the next grid, the reported accuracy is optimistic;
# grid_search.best_score_ is the cross-validated figure for model selection.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Accuracy: 0.7724867724867724
Best Hyperparameters: {'max_depth': 5, 'max_features': 4, 'min_samples_leaf': 7}




In [13]:
# Second search: depth 8-12, leaf size 1-5, and 5-11 features per split.
# Train and test features are both converted to CSR so the estimator receives
# the same input type at fit and predict time. Fitting on CSR but predicting on
# the sparse-column DataFrame makes sklearn warn about feature names and
# densify the frame implicitly.
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)
param_grid = [{
    'max_depth': [8, 9, 10, 12],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': [5, 7, 9, 11],
}]
grid_search = GridSearchCV(tree2, param_grid, cv=5)
grid_search.fit(X_train_sparse, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_sparse)

# NOTE(review): X_test is scored in every search cell. If these scores guide
# the choice of the next grid, the reported accuracy is optimistic;
# grid_search.best_score_ is the cross-validated figure for model selection.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


Accuracy: 0.8095238095238095
Best Hyperparameters: {'max_depth': 12, 'max_features': 11, 'min_samples_leaf': 4}




In [14]:
# Third search: depth 11-16, leaf size 2-5, and 11-17 features per split.
# Train and test features are both converted to CSR so the estimator receives
# the same input type at fit and predict time. Fitting on CSR but predicting on
# the sparse-column DataFrame makes sklearn warn about feature names and
# densify the frame implicitly.
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)
param_grid = [{
    'max_depth': [11, 12, 14, 16],
    'min_samples_leaf': [2, 3, 4, 5],
    'max_features': [11, 13, 15, 17],
}]
grid_search = GridSearchCV(tree2, param_grid, cv=5)
grid_search.fit(X_train_sparse, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_sparse)

# NOTE(review): X_test is scored in every search cell. If these scores guide
# the choice of the next grid, the reported accuracy is optimistic;
# grid_search.best_score_ is the cross-validated figure for model selection.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Accuracy: 0.8095238095238095
Best Hyperparameters: {'max_depth': 11, 'max_features': 11, 'min_samples_leaf': 4}




In [15]:
# Training accuracy of the tuned tree, for comparison with its test accuracy.
# best_model was fitted on a CSR matrix, so it is given CSR input here too, and
# the arguments follow accuracy_score's (y_true, y_pred) order.
accuracy_score(y_train, best_model.predict(csr_matrix(X_train)))



0.8584474885844748