Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

  import pandas.util.testing as tm


Loading data

In [0]:
# Read the passenger table from 'titanic.csv' (path is relative to the
# working directory) and preview the first rows to confirm the load.
titanic_data = pd.read_csv(filepath_or_buffer='titanic.csv')
titanic_data.head(n=5)

In [0]:
# Show the (rows, columns) dimensions of the loaded frame.
titanic_data.shape

Data Preprocessing

In [0]:
# Count the missing entries in every column (isnull is the pandas alias
# of isna) so the columns needing imputation are visible.
print(titanic_data.isnull().sum())

In [0]:
# Remove the 'Name', 'Cabin' and 'Ticket' columns; none of them is used
# as a feature in the cells that follow.
titanic_data = titanic_data.drop(columns=['Name', 'Cabin', 'Ticket'])

Replacing missing values (column mean for numeric columns, most frequent value for Embarked)

In [0]:
# Fill missing values in the numeric columns with that column's mean.
# numeric_only=True is required: the frame still contains columns that are
# presumably non-numeric here ('Sex', 'Embarked' - they are encoded later),
# and since pandas 2.0 DataFrame.mean() raises a TypeError on such columns
# instead of silently skipping them.
# The result is assigned back rather than using inplace=True, so the cell's
# effect on `titanic_data` is explicit.
titanic_data = titanic_data.fillna(titanic_data.mean(numeric_only=True))

In [0]:
# Fill missing 'Embarked' entries with the column's most frequent value
# (mode() returns the modes sorted, so [0] picks the first of them).
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on `titanic_data['Embarked']` is chained assignment,
# which recent pandas warns about and which, under copy-on-write, only
# modifies a temporary object and leaves `titanic_data` unchanged.
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(
    titanic_data['Embarked'].mode()[0]
)

Making DataFrame

In [0]:
# Bind the cleaned table to the working name `df`. `titanic_data` is
# already a DataFrame (it comes from pd.read_csv), so this only wraps it
# in a new DataFrame object; no explicit copy is requested.
df = pd.DataFrame(data=titanic_data)
df.head(n=5)

Handling Categorical Values

In [0]:
# One-hot encode 'Sex': the column is replaced by one indicator column
# per distinct value, named 'Sex_<value>'.
df = pd.get_dummies(data=df, columns=['Sex'])

In [0]:
# One-hot encode 'Pclass': the column is replaced by one indicator column
# per distinct value, named 'Pclass_<value>'.
df = pd.get_dummies(data=df, columns=['Pclass'])

In [0]:
# One-hot encode 'Embarked': the column is replaced by one indicator
# column per distinct value, named 'Embarked_<value>'.
df = pd.get_dummies(data=df, columns=['Embarked'])

Divide data into independent (X) and dependent (y) variables

In [0]:
# Feature matrix: every column of `df` except the target ('Survived')
# and the 'PassengerId' column. drop() returns a new frame, so `df`
# itself is left untouched.
X = df.drop(columns=['Survived', 'PassengerId'])

In [0]:
# Target vector: the 'Survived' column of the encoded frame.
y = df.loc[:, 'Survived']

In [0]:
# Preview the first five feature rows.
X.head(n=5)

In [0]:
# Preview the first five target values.
y.head(n=5)

Split data into train and test sets

In [0]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes
# the shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)


In [0]:
# Build the gradient-boosted tree classifier and train it on the
# training split. The configuration is many shallow trees with a small
# step size; each tree sees a random subset of rows and columns.
xgb_clf = xgb.XGBClassifier(
    learning_rate=0.02,    # shrinkage applied to each tree's contribution
    n_estimators=1000,     # number of boosting rounds
    max_depth=2,           # depth limit of every tree
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.5,  # fraction of columns sampled per tree
    gamma=0,               # minimum loss reduction required to split
)
xgb_clf.fit(X_train, y_train)

In [0]:
# Predict class labels for the held-out test rows.
predictions = xgb_clf.predict(X_test)
# Fraction of test rows labelled correctly, reported as a percentage
# rounded to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(np.round(100 * accuracy, 2), '%')

In [0]:
# Confusion matrix (rows: true class, columns: predicted class), a
# separator line, then per-class precision / recall / F1.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(60 * "=")
print(classification_report(y_true=y_test, y_pred=predictions))


In [0]:
# ROC curve of the fitted classifier on the held-out test set.
# `roc_curve` comes from the notebook's import cell (with the other
# sklearn.metrics names) instead of being imported mid-analysis.

# predict_proba returns one column per class; column index 1 is the
# second class in xgb_clf.classes_ (the positive label for a 0/1 target).
# Slicing in the same statement keeps `probs` a 1-D score vector throughout.
probs = xgb_clf.predict_proba(X_test)[:, 1]
fper, tper, thresholds = roc_curve(y_test, probs)

# Draw on an explicit Axes rather than the implicit pyplot "current figure".
fig, ax = plt.subplots()
ax.plot(fper, tper)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('XGBoost ROC curve')
# show the plot
plt.show()