<a href="https://www.kaggle.com/code/mariushinsberger/titanic-first-approaches?scriptVersionId=160478490" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Imports

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Load Data

In [3]:
path = str("/kaggle/input/titanic")
train_data = pd.read_csv(f"{path}/train.csv")
test_data = pd.read_csv(f"{path}/test.csv")
submission_data = pd.read_csv(f"{path}/gender_submission.csv")

# Process data

In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_data.shape

(891, 12)

In [6]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [10]:
def clean_data(df):
    # Rename columns.
    df.rename(columns={"PassengerId": "passenger_id", "Survived": "survived", "Pclass": "p_class", "Name": "name", "Sex": "sex", "Age": "age", "SibSp": "sib_sp",
           "Parch": "parch", "Ticket": "ticket", "Fare": "fare", "Cabin": "cabin", "Embarked": "embarked"}, inplace=True)
    # Drop unneeded columns.
    df.drop(columns=["name", "ticket", "cabin", "passenger_id"], inplace=True)
    # Add feature median to missing categorical values.
    df["embarked"] = df["embarked"].astype(str)
    df["age"].fillna(df["age"].median(), inplace=True)
    df["embarked"].fillna(df["embarked"].mode(), inplace=True)
    # Transform categorical values.
    df["p_class"] = le.fit_transform(df["p_class"].values)
    df["sex"] = le.fit_transform(df["sex"].values)
    df["sib_sp"] = le.fit_transform(df["sib_sp"].values)
    df["parch"] = le.fit_transform(df["parch"].values)
    df["embarked"] = le.fit_transform(df["embarked"].values)
    df["fare"] = le.fit_transform(df["fare"].values)
    # Change datatype of numerical values to integer.
    df["age"] = df["age"].astype(int)
    df["fare"] = df["fare"].astype(int)
    # return clean dataframe.
    return df

In [11]:
le = LabelEncoder()
train_df = clean_data(train_data)
test_df = clean_data(test_data)

In [12]:
train_data.isna().sum()

survived    0
p_class     0
sex         0
age         0
sib_sp      0
parch       0
fare        0
embarked    0
dtype: int64

In [13]:
test_data.isna().sum()

p_class     0
sex         0
age         0
sib_sp      0
parch       0
fare        0
embarked    0
dtype: int64

In [14]:
train_data.head()

Unnamed: 0,survived,p_class,sex,age,sib_sp,parch,fare,embarked
0,0,2,1,22,1,0,18,2
1,1,0,0,38,1,0,207,0
2,1,2,0,26,0,0,41,2
3,1,0,0,35,1,0,189,2
4,0,2,1,35,0,0,43,2


In [15]:
train_df.corrwith(train_df["survived"]).abs().sort_values(ascending=False)

survived    1.000000
sex         0.543351
p_class     0.338481
fare        0.333943
embarked    0.163517
parch       0.081629
age         0.064909
sib_sp      0.026385
dtype: float64

# Build model

In [16]:
X_train = train_df.drop(columns=["survived"])
y_train = train_df["survived"]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [17]:
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=6,
    min_child_weight=2,
    gamma=0.5,
    subsample=0.65,
    colsample_bytree=1.0,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    random_state=42
)
xgb1.fit(X_train, y_train, eval_set=[(X_val, y_val)])
acc_train = accuracy_score(xgb1.predict(X_train),y_train)
acc_val = accuracy_score(xgb1.predict(X_val),y_val)
print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")

[0]	validation_0-logloss:0.64016
[1]	validation_0-logloss:0.60217
[2]	validation_0-logloss:0.57582
[3]	validation_0-logloss:0.55368
[4]	validation_0-logloss:0.53220
[5]	validation_0-logloss:0.51516
[6]	validation_0-logloss:0.50148
[7]	validation_0-logloss:0.48897
[8]	validation_0-logloss:0.48093
[9]	validation_0-logloss:0.47246
[10]	validation_0-logloss:0.46495
[11]	validation_0-logloss:0.45897
[12]	validation_0-logloss:0.45399
[13]	validation_0-logloss:0.44730
[14]	validation_0-logloss:0.44614
[15]	validation_0-logloss:0.44210
[16]	validation_0-logloss:0.44075
[17]	validation_0-logloss:0.43711
[18]	validation_0-logloss:0.43493
[19]	validation_0-logloss:0.43322
[20]	validation_0-logloss:0.43173
[21]	validation_0-logloss:0.42950
[22]	validation_0-logloss:0.42604
[23]	validation_0-logloss:0.42546
[24]	validation_0-logloss:0.42461
[25]	validation_0-logloss:0.42260
[26]	validation_0-logloss:0.42156
[27]	validation_0-logloss:0.42289
[28]	validation_0-logloss:0.42412
[29]	validation_0-loglos

In [18]:
param_grid = {
    #"max_depth": [4,5,6],
    #"min_child_weight": [0,1,2],
    #"gamma": [0.4, 0.5, 0.6],
    #"subsample": [0.55, 0.6, 0.65],
    #"colsample_bytree": [0.95, 1.0, 1.05],
    #"reg_alpha":[1e-5, 1e-4, 1e-6]
}
gs = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate =0.1,
        n_estimators=105,
        early_stopping_rounds=50,
        max_depth=6,
        min_child_weight=2,
        gamma=0.5,
        subsample=0.65,
        colsample_bytree=1.0,
        objective= 'binary:logistic',
        nthread=-1,
        scale_pos_weight=1,
        random_state=42
    ),
    param_grid = param_grid, 
    scoring='roc_auc',
    n_jobs=4, 
    cv=5
)
gs.fit(X_train, y_train, eval_set=[(X_val, y_val)])
acc_train = accuracy_score(gs.predict(X_train),y_train)
acc_val = accuracy_score(gs.predict(X_val),y_val)
print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")
print(f"Best params: {gs.best_params_}\n Best score: {gs.best_score_}")

[0]	validation_0-logloss:0.63842
[1]	validation_0-logloss:0.60519
[2]	validation_0-logloss:0.57550
[3]	validation_0-logloss:0.55008
[4]	validation_0-logloss:0.52975
[5]	validation_0-logloss:0.51640
[6]	validation_0-logloss:0.50378
[7]	validation_0-logloss:0.49218
[8]	validation_0-logloss:0.48289
[9]	validation_0-logloss:0.47338
[10]	validation_0-logloss:0.46583
[11]	validation_0-logloss:0.46085
[12]	validation_0-logloss:0.45677
[13]	validation_0-logloss:0.45304
[14]	validation_0-logloss:0.45149
[15]	validation_0-logloss:0.44545
[16]	validation_0-logloss:0.44372
[17]	validation_0-logloss:0.44118
[18]	validation_0-logloss:0.43972
[19]	validation_0-logloss:0.43718
[20]	validation_0-logloss:0.43398
[21]	validation_0-logloss:0.43201
[22]	validation_0-logloss:0.43257
[23]	validation_0-logloss:0.43024
[24]	validation_0-logloss:0.43039
[25]	validation_0-logloss:0.42705
[26]	validation_0-logloss:0.42809
[27]	validation_0-logloss:0.42718
[28]	validation_0-logloss:0.42662
[29]	validation_0-loglos

In [19]:
xgb2 =XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=6,
    min_child_weight=2,
    gamma=0.5,
    subsample=0.65,
    colsample_bytree=1.0,
    reg_alpha=1e-5,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    random_state=42
)
xgb2.fit(X_train, y_train, eval_set=[(X_val, y_val)])
acc_train = accuracy_score(xgb2.predict(X_train),y_train)
acc_val = accuracy_score(xgb2.predict(X_val),y_val)
print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")

[0]	validation_0-logloss:0.64016
[1]	validation_0-logloss:0.60217
[2]	validation_0-logloss:0.57582
[3]	validation_0-logloss:0.55368
[4]	validation_0-logloss:0.53220
[5]	validation_0-logloss:0.51516
[6]	validation_0-logloss:0.50148
[7]	validation_0-logloss:0.48897
[8]	validation_0-logloss:0.48093
[9]	validation_0-logloss:0.47246
[10]	validation_0-logloss:0.46495
[11]	validation_0-logloss:0.45897
[12]	validation_0-logloss:0.45399
[13]	validation_0-logloss:0.44730
[14]	validation_0-logloss:0.44614
[15]	validation_0-logloss:0.44210
[16]	validation_0-logloss:0.44075
[17]	validation_0-logloss:0.43711
[18]	validation_0-logloss:0.43493
[19]	validation_0-logloss:0.43322
[20]	validation_0-logloss:0.43173
[21]	validation_0-logloss:0.42950
[22]	validation_0-logloss:0.42604
[23]	validation_0-logloss:0.42546
[24]	validation_0-logloss:0.42461
[25]	validation_0-logloss:0.42260
[26]	validation_0-logloss:0.42156
[27]	validation_0-logloss:0.42289
[28]	validation_0-logloss:0.42412
[29]	validation_0-loglos

In [20]:
xgb3 =XGBClassifier(
    learning_rate =0.01,
    n_estimators=539,
    early_stopping_rounds=50,
    max_depth=6,
    min_child_weight=2,
    gamma=0.5,
    subsample=0.65,
    colsample_bytree=1.0,
    reg_alpha=1e-5,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    random_state=42
)
xgb3.fit(X_train, y_train, eval_set=[(X_val, y_val)])
acc_train = accuracy_score(xgb3.predict(X_train),y_train)
acc_val = accuracy_score(xgb3.predict(X_val),y_val)
print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")

[0]	validation_0-logloss:0.67632
[1]	validation_0-logloss:0.67156
[2]	validation_0-logloss:0.66742
[3]	validation_0-logloss:0.66315
[4]	validation_0-logloss:0.65875
[5]	validation_0-logloss:0.65453
[6]	validation_0-logloss:0.65078
[7]	validation_0-logloss:0.64680
[8]	validation_0-logloss:0.64314
[9]	validation_0-logloss:0.63935
[10]	validation_0-logloss:0.63535
[11]	validation_0-logloss:0.63191
[12]	validation_0-logloss:0.62847
[13]	validation_0-logloss:0.62521
[14]	validation_0-logloss:0.62196
[15]	validation_0-logloss:0.61844
[16]	validation_0-logloss:0.61510
[17]	validation_0-logloss:0.61189
[18]	validation_0-logloss:0.60920
[19]	validation_0-logloss:0.60611
[20]	validation_0-logloss:0.60306
[21]	validation_0-logloss:0.60029
[22]	validation_0-logloss:0.59766
[23]	validation_0-logloss:0.59453
[24]	validation_0-logloss:0.59184
[25]	validation_0-logloss:0.58939
[26]	validation_0-logloss:0.58653
[27]	validation_0-logloss:0.58382
[28]	validation_0-logloss:0.58122
[29]	validation_0-loglos

# Make prediction

In [21]:
best=xgb3
predictions = best.predict(test_df)
submission_data["Survived"] = predictions
#submission_data["Survived"] = submission_data["Survived"].map({0: "False", 1: "True"})
submission_data.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [22]:
submission_data.to_csv('submission.csv', index=False)