In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [2]:
from sklearn.model_selection import train_test_split
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin', 'Embarked']
X = train_data[features]
y = train_data.Survived
#X = X_new.drop(['Survived'], axis=1)
# Splitting training data into training set and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [5]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
140,3,female,,0,2,,C
439,2,male,31.0,0,0,,S
817,2,male,31.0,1,1,,C
378,3,male,20.0,0,0,,C
491,3,male,21.0,0,0,,S


In [6]:
X_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch
count,712.0,571.0,712.0,712.0
mean,2.317416,29.745184,0.529494,0.391854
std,0.833767,14.619046,1.140842,0.821368
min,1.0,0.67,0.0,0.0
25%,2.0,20.75,0.0,0.0
50%,3.0,29.0,0.0,0.0
75%,3.0,38.0,1.0,0.0
max,3.0,80.0,8.0,6.0


# ***Data Preprocessing***

Separate numeric columns and categorical columns

In [7]:
numerical_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
categorical_cols = [col for col in X_train.columns if X_train[col].nunique()<10 and X_train[col].dtype == 'object']
print('Numerical columns: ', numerical_cols)
print('Categorical columns: ', categorical_cols)

Numerical columns:  ['Pclass', 'Age', 'SibSp', 'Parch']
Categorical columns:  ['Sex', 'Embarked']


In [8]:
cols_with_missing_values = [col for col in X_train.columns if X_train[col].isnull().any()]
cols_with_missing_values

['Age', 'Cabin', 'Embarked']

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

Preprocessing for numerical data

In [10]:
numerical_transformer = SimpleImputer(strategy='mean')

Preprocessing for categorical data

In [11]:
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

Bundling processing for numerical and categorical data

In [12]:
preprocessor = ColumnTransformer(
                    transformers = [
                        ('num', numerical_transformer, numerical_cols),
                        ('cat', categorical_transformer, categorical_cols)
                    ] 
)

# Modeling and Pipeline

Defining model

In [13]:
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5, eval_set=[X_valid, y_valid], verbose=False)

Evaluation metrics

In [14]:
from sklearn.metrics import mean_absolute_error as mae

Bundling preprocessing and modeling code in a pipeline

In [15]:
my_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', model)
])

In [16]:
my_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch']),
                                                 ('cat',
                          

In [17]:
preds = my_pipeline.predict(X_valid)

In [18]:
from sklearn.model_selection import cross_val_score
scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
scores.mean()

0.25197209188339664

In [19]:
test_preds = my_pipeline.predict(test_data[features])

In [20]:
test_preds

array([-6.15766048e-02,  4.73776460e-03, -2.23642588e-03,  7.30867505e-01,
        3.67291570e-01,  5.07597148e-01,  7.35512912e-01, -1.00630999e-01,
        5.36870956e-02, -2.81515121e-02,  9.75015163e-02, -7.11792707e-03,
        9.32644784e-01, -1.44295275e-01,  8.21661234e-01,  9.42576885e-01,
       -2.74561644e-02,  7.93380022e-01, -2.08899796e-01,  3.53770345e-01,
        3.82038802e-01,  9.52920318e-01,  9.59196746e-01,  1.29529595e-01,
        7.26814210e-01, -1.90679848e-01,  1.03031075e+00,  2.01347947e-01,
        2.59494543e-01, -1.67577267e-02, -1.81621969e-01, -6.39387965e-02,
        5.37336826e-01,  5.36730289e-02,  1.71718150e-01,  1.22478783e-01,
        1.94009185e-01,  8.44857335e-01,  2.23964959e-01,  9.75015163e-02,
        1.44137830e-01,  9.44772661e-02,  1.88505948e-02,  9.37508166e-01,
        8.99956822e-01,  2.23964959e-01,  3.94805014e-01,  5.81977665e-02,
        9.82964158e-01,  6.57753468e-01,  5.50950468e-02,  5.47980011e-01,
        1.02170932e+00,  

In [21]:
output = []
for pred in test_preds:
    if pred < 0.5:
        output.append(0)
    else:
        output.append(1)

In [22]:
submissions = pd.DataFrame({'PassengerId': test_data['PassengerId'],
                          'Survived': output})

In [23]:
submissions

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [24]:
submissions.to_csv('titanic.csv', index=False)