In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Read data and show nan column

In [None]:
train = pd.read_csv('titanic/data/train.csv', index_col='PassengerId')
x_test = pd.read_csv('titanic/data/test.csv', index_col='PassengerId')

print(train.describe())
print(train.shape)
print(train.head())

nan_column = train.columns[train.isna().sum() > len(train)/2]
print(nan_column)

         Survived      Pclass         Age       SibSp       Parch        Fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
(891, 11)
             Survived  Pclass  \
PassengerId                     
1                   0       3   
2                   1       1   
3                   1       3   
4                   1       1   
5                   0       3   

                                                        

# Split x_train and y_train

In [134]:
y_train = train['Survived']
x_train = train.drop('Survived' , axis=1)

# clean unrelated data, 'Name'
x_train.drop('Name', axis=1, inplace=True)
x_test.drop('Name', axis=1, inplace=True)

print(y_train.shape)
print(y_train.head())
print(x_train.shape)
print(x_train['Ticket'].head(100))

(891,)
PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64
(891, 9)
PassengerId
1             A/5 21171
2              PC 17599
3      STON/O2. 3101282
4                113803
5                373450
             ...       
96               374910
97             PC 17754
98             PC 17759
99               231919
100              244367
Name: Ticket, Length: 100, dtype: object


# Nan handling
* The column **cabin** has more than half of Nans, so replaced with 'Unknown'
* The column **embarked** has two Nans, so replaced with 'U'
* The column **Age** has 177, so 

In [135]:
x_train['Cabin'] = x_train['Cabin'].fillna('Unknown')
x_train['Embarked'] = x_train['Embarked'].fillna('U')
mean_age = x_train['Age'].mean()
x_train['Age'] = x_train['Age'].fillna(mean_age)
x_train['Ticket'] = x_train['Ticket'].fillna('110287')
x_train['Sex'] = x_train['Sex'].map({'male': 0, 'female': 1})

print(x_train.head(10))

# Do the same for test
# print(test.isna().sum())
x_test['Cabin'] = x_test['Cabin'].fillna('Unknown')
x_test['Embarked'] = x_test['Embarked'].fillna('U')
mean_age_test = x_test['Age'].mean()
x_test['Age'] = x_test['Age'].fillna(mean_age)
x_test['Fare'] = x_test['Fare'].bfill()
x_test['Sex'] = x_test['Sex'].map({'male': 0, 'female': 1})

print(x_test.head(10))



             Pclass  Sex        Age  SibSp  Parch            Ticket     Fare  \
PassengerId                                                                    
1                 3    0  22.000000      1      0         A/5 21171   7.2500   
2                 1    1  38.000000      1      0          PC 17599  71.2833   
3                 3    1  26.000000      0      0  STON/O2. 3101282   7.9250   
4                 1    1  35.000000      1      0            113803  53.1000   
5                 3    0  35.000000      0      0            373450   8.0500   
6                 3    0  29.699118      0      0            330877   8.4583   
7                 1    0  54.000000      0      0             17463  51.8625   
8                 3    0   2.000000      3      1            349909  21.0750   
9                 3    1  27.000000      0      2            347742  11.1333   
10                2    1  14.000000      1      0            237736  30.0708   

               Cabin Embarked  
Passeng

# Feature Encoding


In [136]:
def split_ticket(ticket):
    parts = ticket.split(' ')
    if len(parts) == 1:
        return pd.Series(['None', parts[0]])
    else:
        return pd.Series([' '.join(parts[:-1]), parts[-1]])

le_ticket = LabelEncoder()
le_cabin = LabelEncoder()
le_embarked = LabelEncoder()

x_train[['TicketPrefix', 'TicketNumber']] = x_train['Ticket'].apply(split_ticket)
# Step 2: Convert TicketNumber to numeric (optional, helps models)
x_train['TicketNumber'] = pd.to_numeric(x_train['TicketNumber'], errors='coerce')
x_train['TicketPrefixEncoded'] = le_ticket.fit_transform(x_train['TicketPrefix'])
x_train['CabinEncoded'] = le_cabin.fit_transform(x_train['Cabin'])
x_train['EmbarkedEncoded'] = le_embarked.fit_transform(x_train['Embarked'])

x_train.drop(['TicketPrefix'], axis=1, inplace=True)
x_train.drop(['Ticket'], axis=1, inplace=True)
x_train.drop(['Cabin'], axis=1, inplace=True)
x_train.drop(['Embarked'], axis=1, inplace=True)
print(x_test.head(10))

x_test[['TicketPrefix', 'TicketNumber']] = x_test['Ticket'].apply(split_ticket)
x_test['TicketNumber'] = pd.to_numeric(x_test['TicketNumber'], errors='coerce')
x_test['TicketPrefixEncoded'] = le_ticket.fit_transform(x_test['TicketPrefix'])
x_test['CabinEncoded'] = le_cabin.fit_transform(x_test['Cabin'])
x_test['EmbarkedEncoded'] = le_embarked.fit_transform(x_test['Embarked'])
x_test.drop(['TicketPrefix'], axis=1, inplace=True)
x_test.drop(['Ticket'], axis=1, inplace=True)
x_test.drop(['Cabin'], axis=1, inplace=True)
x_test.drop(['Embarked'], axis=1, inplace=True)

print(x_test.head(10))

             Pclass  Sex   Age  SibSp  Parch     Ticket     Fare    Cabin  \
PassengerId                                                                 
892               3    0  34.5      0      0     330911   7.8292  Unknown   
893               3    1  47.0      1      0     363272   7.0000  Unknown   
894               2    0  62.0      0      0     240276   9.6875  Unknown   
895               3    0  27.0      0      0     315154   8.6625  Unknown   
896               3    1  22.0      1      1    3101298  12.2875  Unknown   
897               3    0  14.0      0      0       7538   9.2250  Unknown   
898               3    1  30.0      0      0     330972   7.6292  Unknown   
899               2    0  26.0      1      1     248738  29.0000  Unknown   
900               3    1  18.0      0      0       2657   7.2292  Unknown   
901               3    0  21.0      2      0  A/4 48871  24.1500  Unknown   

            Embarked  
PassengerId           
892                Q  
893   

# Model Training

In [137]:
model = XGBClassifier(use_label_encoder=False)
model.fit(x_train, y_train)



# Model Predict

In [None]:
y_test = model.predict(x_test)

print("y_train shape: ", y_train.shape)
print(x_test.index)
print(y_test)

file = pd.DataFrame({"PassengerId":x_test.index, "Survived":y_test})
file.to_csv("titanic/data/gender_submission.csv", index=False)

y_train shape:  (891,)
Index([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,
       ...
       1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309],
      dtype='int64', name='PassengerId', length=418)
[0 0 0 1 1 1 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 0 0 1 0 1 1 0
 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 0 1 1 0 1 1
 1 0 0 1 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 1 0 1 1 1 0 1 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1 1 1 1 1 0 1 1 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0 0
 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1 1 0 1 0 0 1 0 1 1 0 1 0 0 0 1 1 0
 1 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1
 0 1 0 0 1 1 0 0 0 1 0 0 0 1