In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the input directory tree and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# load the training data from the competition input directory
train_csv_path = '/kaggle/input/titanic/train.csv'
titanic = pd.read_csv(train_csv_path)

# preview the first rows
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# summary statistics for the numerical columns.
# every numerical column has 891 values except Age, which has only 714,
# so Age contains missing values (the numerical imputer defined further
# down fills them with the median)
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# describe() only covered the numerical columns, so the object-typed columns
# (Name, Sex, Ticket, Cabin, Embarked) did not appear there.
# info() shows that Cabin is mostly null (204 of 891 non-null) and that
# Age (714 non-null) and Embarked (889 non-null) are incomplete as well
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# the Cabin column has 147 distinct values (nulls are not counted),
# far too many categories for one hot encoding to be practical
titanic['Cabin'].nunique()

147

In [6]:
# some Ticket values carry a text prefix (e.g. "STON/O2." or "PC") in front
# of the ticket number, while others are purely numeric.
# NOTE(review): the encoder selection further down uses ordinal encoding only
# for columns with 4 to 9 distinct values, so whether Ticket really ends up
# ordinal-encoded depends on its cardinality — confirm
titanic['Ticket']

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [7]:
# the data description of the competition lists 3 ports of embarkation:
# C = Cherbourg
# Q = Queenstown
# S = Southampton
# with only three categories this column is a good fit for one hot encoding
# (info() above reported 889 non-null values, so 2 entries are missing)
titanic['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [8]:
# PassengerId looks like a running row identifier (1, 2, 3, ...) rather than
# a property of the passenger, so it is dropped from the features later on
titanic['PassengerId']

0        1
1        2
2        3
3        4
4        5
      ... 
886    887
887    888
888    889
889    890
890    891
Name: PassengerId, Length: 891, dtype: int64

In [9]:
# new feature: number of family members travelling with the passenger
# SibSp = siblings + spouses, Parch = parents + children
siblings_spouses = titanic['SibSp']
parents_children = titanic['Parch']
titanic['FamMembers'] = siblings_spouses + parents_children

In [10]:
# separate the features from the target column
# (train_test_split is imported here, but this cell does not carve out a
# validation set: every row stays in the training data)
from sklearn.model_selection import train_test_split

X_train = titanic.drop(columns='Survived')
y_train = titanic['Survived']

In [11]:
# Build the feature frame without the target and without PassengerId.
# The frame is rebuilt from `titanic` instead of being modified in place, so
# this cell can be re-run safely: an in-place drop raises KeyError the second
# time because the PassengerId column is already gone.
X_train = titanic.drop(columns=['Survived', 'PassengerId'])

In [12]:
# split the feature columns by dtype: int64/float64 columns are treated as
# numerical and object columns as categorical. the dataset is small, so
# every column is kept
numeric_part = X_train.select_dtypes(include=['int64', 'float64'])
object_part = X_train.select_dtypes(include=['object'])

numerical_cols = numeric_part.columns
categorical_cols = object_part.columns

In [13]:
# assign each categorical column to an encoder based on how many distinct
# values it has:
#   up to 3  -> One Hot Encoding
#   4 to 9   -> Ordinal Encoding
#   10+      -> Target Encoding
# NOTE(review): near-unique text columns such as Name would land in the
# target-encoding group, which risks leaking the label — confirm this is intended
cardinality = {col: X_train[col].nunique() for col in categorical_cols}

ohe_cols = [col for col, n_unique in cardinality.items() if n_unique <= 3]
ordinal_cols = [col for col, n_unique in cardinality.items() if 3 < n_unique < 10]
high_cardinality_cols = [col for col, n_unique in cardinality.items() if n_unique >= 10]

In [14]:
# numerical columns: replace missing values with the column median
# (the imputer is only configured here; it is fitted later as part of the
# ColumnTransformer)
from sklearn.impute import SimpleImputer

num_cols_transformer = SimpleImputer(strategy='median')

In [15]:
# pipelines for categorical columns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline


def _impute_then_encode(step_name, encoder):
    """Return a pipeline that fills missing values with the most frequent
    category and then applies ``encoder`` under the step name ``step_name``."""
    return Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        (step_name, encoder),
    ])


# low-cardinality columns -> dense one hot columns; unseen categories are ignored
ohe_cols_transformer = _impute_then_encode(
    'ohe',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# medium-cardinality columns -> integer codes; unseen categories are encoded as -1
ordinal_cols_transformer = _impute_then_encode(
    'ordinal',
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# high-cardinality columns -> target encoding
high_cardinality_cols_transformer = _impute_then_encode('target', TargetEncoder())

In [16]:
# bundle all the pipelines into a ColumnTransformer
from sklearn.compose import ColumnTransformer

# (name, transformer, columns) triples, one per column group
column_groups = [
    ('num', num_cols_transformer, numerical_cols),
    ('ohe', ohe_cols_transformer, ohe_cols),
    ('ord', ordinal_cols_transformer, ordinal_cols),
    ('hcc', high_cardinality_cols_transformer, high_cardinality_cols),
]

preprocessor = ColumnTransformer(transformers=column_groups)

In [17]:
# prepare the data for pytorch
import torch
from sklearn.preprocessing import StandardScaler

# Fit the preprocessing on the training features. The result is stored under
# a new name instead of overwriting X_train: the ColumnTransformer selects its
# columns by name and therefore needs the original DataFrame, so overwriting
# X_train with the transformed array made this cell fail when run a second time.
X_train_processed = preprocessor.fit_transform(X_train, y_train)

# standardise every feature so that no single column dominates the distance
# computation used by the nearest-neighbour step
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train_processed)

# features as float32, labels as int64 (long)
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

In [18]:
# load the test data and add the same family-size feature as for training
titanic_test = pd.read_csv('/kaggle/input/titanic/test.csv')
titanic_test['FamMembers'] = titanic_test['SibSp'] + titanic_test['Parch']

# reuse the preprocessor and scaler that were fitted on the training data
# (transform only, nothing is re-fitted on the test set)
X_test = preprocessor.transform(titanic_test.drop(columns='PassengerId'))
X_test_scaled = scaler_X.transform(X_test)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

In [19]:
# apply K Nearest Neighbors (k = 3)

# pairwise distances: one row per test point, one column per training point
distances = torch.cdist(X_test_tensor, X_train_tensor)

# indices of the 3 training points with the smallest distance to each test point
k_indices = distances.topk(k=3, largest=False).indices

# look up the labels of those neighbours
k_nearest_labels = y_train_tensor[k_indices]

# majority vote: the most common label among the neighbours of each test point
predictions = k_nearest_labels.mode(dim=1).values

In [20]:
# build the submission table (one row per test passenger) and write it to disk
passenger_ids = titanic_test['PassengerId']
survived = predictions.numpy()

submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': survived})
submission.to_csv('submission.csv', index=False)