<a href="https://www.kaggle.com/code/mariushinsberger/titanic-first-approaches?scriptVersionId=160495362" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Imports

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Load Data

In [3]:
path = str("/kaggle/input/titanic")
train_data = pd.read_csv(f"{path}/train.csv")
test_data = pd.read_csv(f"{path}/test.csv")
submission_data = pd.read_csv(f"{path}/gender_submission.csv")

# Process data

In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_data.shape

(891, 12)

In [6]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [10]:
def clean_data(train, test):
    for df in [train, test]:    
        # Rename columns.
        df.rename(columns={"PassengerId": "passenger_id", "Survived": "survived", "Pclass": "p_class", "Name": "name", "Sex": "sex", "Age": "age", "SibSp": "sib_sp",
           "Parch": "parch", "Ticket": "ticket", "Fare": "fare", "Cabin": "cabin", "Embarked": "embarked"}, inplace=True)
        # Drop unneeded columns.
        df.drop(columns=["name", "ticket", "cabin", "passenger_id"], inplace=True)
        # Add feature median to missing categorical values.
        df["embarked"] = df["embarked"].astype(str)
    
    embarked_mode = train["embarked"].mode()
    age_median = train["age"].median()
    fare_median = train["fare"].median()
    
    for df in [train, test]:
        df["embarked"].fillna(embarked_mode, inplace=True)
        df["age"].fillna(age_median, inplace=True)
        df["fare"].fillna(fare_median, inplace=True)
        # Change datatype of numerical values to integer.
        df["age"] = df["age"].astype(int)
        df["parch"] = df["parch"].astype(int)
        df["fare"] = df["fare"].astype(int)
    
    # Transform categorical values.
    train["p_class"] = le.fit_transform(train["p_class"].values)
    test["p_class"] = le.transform(test["p_class"].values)
    
    train["sex"] = le.fit_transform(train["sex"].values)
    test["sex"] = le.transform(test["sex"].values)
    
    train["sib_sp"] = le.fit_transform(train["sib_sp"].values)
    test["sib_sp"] = le.transform(test["sib_sp"].values)
    
    #train["parch"] = le.fit_transform(train["parch"].values)
    #test["parch"] = le.transform(test["parch"].values)
    
    train["embarked"] = le.fit_transform(train["embarked"].values)
    test["embarked"] = le.transform(test["embarked"].values)
    
    #train["fare"] = le.fit_transform(train["fare"].values)
    #test["fare"] = le.transform(test["fare"].values)
    

    # return clean dataframe.
    return train, test

In [11]:
le = LabelEncoder()
train_df, test_df = clean_data(train_data, test_data)

In [12]:
train_data.isna().sum()

survived    0
p_class     0
sex         0
age         0
sib_sp      0
parch       0
fare        0
embarked    0
dtype: int64

In [13]:
test_data.isna().sum()

p_class     0
sex         0
age         0
sib_sp      0
parch       0
fare        0
embarked    0
dtype: int64

In [14]:
train_data.head()

Unnamed: 0,survived,p_class,sex,age,sib_sp,parch,fare,embarked
0,0,2,1,22,1,0,7,2
1,1,0,0,38,1,0,71,0
2,1,2,0,26,0,0,7,2
3,1,0,0,35,1,0,53,2
4,0,2,1,35,0,0,8,2


In [15]:
train_df.corrwith(train_df["survived"]).abs().sort_values(ascending=False)

survived    1.000000
sex         0.543351
p_class     0.338481
fare        0.257482
embarked    0.163517
parch       0.081629
age         0.064909
sib_sp      0.026385
dtype: float64

# Build model

In [16]:
X_train = train_df.drop(columns=["survived"])
y_train = train_df["survived"]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [17]:
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=6,
    min_child_weight=0,
    gamma=0.45,
    subsample=1.0,
    colsample_bytree=0.75,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    random_state=42
)
xgb1.fit(X_train, y_train, eval_set=[(X_val, y_val)])
acc_train = accuracy_score(xgb1.predict(X_train),y_train)
acc_val = accuracy_score(xgb1.predict(X_val),y_val)
print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")

[0]	validation_0-logloss:0.63546
[1]	validation_0-logloss:0.60291
[2]	validation_0-logloss:0.57471
[3]	validation_0-logloss:0.56168
[4]	validation_0-logloss:0.54778
[5]	validation_0-logloss:0.53945
[6]	validation_0-logloss:0.52324
[7]	validation_0-logloss:0.51639
[8]	validation_0-logloss:0.50837
[9]	validation_0-logloss:0.50170
[10]	validation_0-logloss:0.49576
[11]	validation_0-logloss:0.48527
[12]	validation_0-logloss:0.47432
[13]	validation_0-logloss:0.46391
[14]	validation_0-logloss:0.46045
[15]	validation_0-logloss:0.45786
[16]	validation_0-logloss:0.44506
[17]	validation_0-logloss:0.43957
[18]	validation_0-logloss:0.43780
[19]	validation_0-logloss:0.43389
[20]	validation_0-logloss:0.43026
[21]	validation_0-logloss:0.42633
[22]	validation_0-logloss:0.42346
[23]	validation_0-logloss:0.42073
[24]	validation_0-logloss:0.41784
[25]	validation_0-logloss:0.41639
[26]	validation_0-logloss:0.41459
[27]	validation_0-logloss:0.41182
[28]	validation_0-logloss:0.41042
[29]	validation_0-loglos

In [18]:
param_grid = {
    #"max_depth": [3,4,5,6,7,8,9],
    #"min_child_weight": [0,1,2],
    #"gamma": [0.35, 0.4, 0.45],
    #"subsample": [0.95, 1.0],
    #"colsample_bytree": [0.75, 0.8, 0.85],
    #"reg_alpha":[1e-5,1e-4,1e-6]
}
gs = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate =0.1,
        n_estimators=97,
        early_stopping_rounds=50,
        max_depth=6,
        min_child_weight=0,
        gamma=0.45,
        subsample=1.0,
        colsample_bytree=0.75,
        objective= 'binary:logistic',
        nthread=-1,
        scale_pos_weight=1,
        random_state=42
    ),
    param_grid = param_grid, 
    scoring='roc_auc',
    n_jobs=4, 
    cv=5
)
#gs.fit(X_train, y_train, eval_set=[(X_val, y_val)])
#acc_train = accuracy_score(gs.predict(X_train),y_train)
#acc_val = accuracy_score(gs.predict(X_val),y_val)
#print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")
#print(f"Best params: {gs.best_params_}\n Best score: {gs.best_score_}")

In [19]:
xgb2 =XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=6,
    min_child_weight=0,
    gamma=0.45,
    subsample=1.0,
    colsample_bytree=0.75,
    reg_alpha=1e-5,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    random_state=42
)
xgb2.fit(X_train, y_train, eval_set=[(X_val, y_val)])
acc_train = accuracy_score(xgb2.predict(X_train),y_train)
acc_val = accuracy_score(xgb2.predict(X_val),y_val)
print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")

[0]	validation_0-logloss:0.63546
[1]	validation_0-logloss:0.60291
[2]	validation_0-logloss:0.57471
[3]	validation_0-logloss:0.56168
[4]	validation_0-logloss:0.54778
[5]	validation_0-logloss:0.53945
[6]	validation_0-logloss:0.52324
[7]	validation_0-logloss:0.51639
[8]	validation_0-logloss:0.50837
[9]	validation_0-logloss:0.50170
[10]	validation_0-logloss:0.49576
[11]	validation_0-logloss:0.48527
[12]	validation_0-logloss:0.47432
[13]	validation_0-logloss:0.46391
[14]	validation_0-logloss:0.46045
[15]	validation_0-logloss:0.45786
[16]	validation_0-logloss:0.44506
[17]	validation_0-logloss:0.43957
[18]	validation_0-logloss:0.43780
[19]	validation_0-logloss:0.43389
[20]	validation_0-logloss:0.43026
[21]	validation_0-logloss:0.42633
[22]	validation_0-logloss:0.42346
[23]	validation_0-logloss:0.42073
[24]	validation_0-logloss:0.41784
[25]	validation_0-logloss:0.41639
[26]	validation_0-logloss:0.41459
[27]	validation_0-logloss:0.41182
[28]	validation_0-logloss:0.41042
[29]	validation_0-loglos

In [20]:
xgb3 =XGBClassifier(
    learning_rate =0.01,
    n_estimators=418,
    early_stopping_rounds=50,
    max_depth=6,
    min_child_weight=0,
    gamma=0.45,
    subsample=1.0,
    colsample_bytree=0.75,
    reg_alpha=1e-5,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    random_state=42
)
xgb3.fit(X_train, y_train, eval_set=[(X_val, y_val)])
acc_train = accuracy_score(xgb3.predict(X_train),y_train)
acc_val = accuracy_score(xgb3.predict(X_val),y_val)
print(f"model: xgb1\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")

[0]	validation_0-logloss:0.67585
[1]	validation_0-logloss:0.67170
[2]	validation_0-logloss:0.66740
[3]	validation_0-logloss:0.66525
[4]	validation_0-logloss:0.66280
[5]	validation_0-logloss:0.66089
[6]	validation_0-logloss:0.65709
[7]	validation_0-logloss:0.65478
[8]	validation_0-logloss:0.65247
[9]	validation_0-logloss:0.65020
[10]	validation_0-logloss:0.64796
[11]	validation_0-logloss:0.64489
[12]	validation_0-logloss:0.64108
[13]	validation_0-logloss:0.63710
[14]	validation_0-logloss:0.63499
[15]	validation_0-logloss:0.63322
[16]	validation_0-logloss:0.62921
[17]	validation_0-logloss:0.62641
[18]	validation_0-logloss:0.62468
[19]	validation_0-logloss:0.62153
[20]	validation_0-logloss:0.61845
[21]	validation_0-logloss:0.61543
[22]	validation_0-logloss:0.61248
[23]	validation_0-logloss:0.60914
[24]	validation_0-logloss:0.60736
[25]	validation_0-logloss:0.60401
[26]	validation_0-logloss:0.60087
[27]	validation_0-logloss:0.59766
[28]	validation_0-logloss:0.59436
[29]	validation_0-loglos

# Make prediction

In [21]:
best=xgb3
predictions = best.predict(test_df)
submission_data["Survived"] = predictions
#submission_data["Survived"] = submission_data["Survived"].map({0: "False", 1: "True"})
submission_data.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [22]:
submission_data.to_csv('submission.csv', index=False)