In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training file into a DataFrame.
titanic = pd.read_csv('/kaggle/input/titanic/train.csv')

# Display the first rows to get a feel for the columns and their values.
titanic.head()

In [None]:
# Summary statistics for the numerical columns.
# Reviewing the output, I find that all the numerical columns are complete
# except for Age, which has missing values. My guess (not verified against the
# data) is that this is normal: some ages were simply never recorded, e.g. for
# people who may have sneaked onto the ship to emigrate to America.
titanic.describe()

In [None]:
# Cabin has dtype object, so it did not appear in the describe() summary above.
# info() lists the non-null count of every column, and from it we can see
# that Cabin has a lot of null values.
titanic.info()

In [None]:
# The Cabin column has 147 distinct values (nunique() does not count nulls),
# which makes one-hot encoding inefficient: it would add one column per value.
titanic['Cabin'].nunique()

In [None]:
# The Ticket column seems to carry the type of ticket for some passengers,
# as a prefix such as "STON/02." may show.
# Ordinal encoding may be useful here.
# NOTE(review): the encoder is actually chosen further down from each column's
# number of unique values, so confirm which bucket Ticket really falls into.
titanic['Ticket']

In [None]:
# According to the competition's data description, Embarked is the port of
# embarkation and takes one of three values:
# C = Cherbourg
# Q = Queenstown
# S = Southampton
# With so few categories, it is a good fit for one-hot encoding.
titanic['Embarked']

In [None]:
# PassengerId will be dropped from the features, since it is useless for
# prediction. (It is still read from the test file later on to build the
# submission.)
titanic['PassengerId']

In [None]:
# Engineer a family-size feature: the number of relatives travelling with the
# passenger, i.e. siblings + spouses (SibSp) plus parents + children (Parch).
titanic['FamMembers'] = titanic['SibSp'].add(titanic['Parch'])

In [None]:
# Separate the features from the target column.
# NOTE: no train/validation split is performed in this cell, even though
# train_test_split is imported here.
from sklearn.model_selection import train_test_split

X_train = titanic.drop(columns='Survived')
y_train = titanic['Survived']

In [None]:
# Drop the PassengerId column from the features.
# X_train is rebuilt from `titanic` rather than mutated in place, so this cell
# is idempotent: the previous `drop(..., inplace=True)` raised a KeyError when
# the cell was run a second time (the column was already gone), and failed
# outright once X_train had been replaced by the preprocessed array.
X_train = titanic.drop(columns=['Survived', 'PassengerId'])

In [None]:
# Separate the feature columns into numerical and categorical groups by dtype.
# Since the dataset doesn't have too many columns, all of them are kept.
# NOTE(review): only int64/float64 and object dtypes are selected; a column of
# any other dtype would end up in neither group — confirm none exist.
numerical_cols = X_train.select_dtypes(include=['int64','float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

In [None]:
# Assign every categorical column to an encoding strategy based on how many
# distinct values it has:
#   <= 3 values  -> one-hot encoding
#   4 to 9       -> ordinal encoding
#   >= 10        -> target encoding
cardinality = {col: X_train[col].nunique() for col in categorical_cols}

ohe_cols = [col for col, n_unique in cardinality.items() if n_unique <= 3]
ordinal_cols = [col for col, n_unique in cardinality.items() if 3 < n_unique < 10]
high_cardinality_cols = [col for col, n_unique in cardinality.items() if n_unique >= 10]

In [None]:
# Numerical columns: fill null values with the column median.
# The imputer is only configured here; it is fitted later as part of the
# ColumnTransformer.
from sklearn.impute import SimpleImputer

num_cols_transformer = SimpleImputer(strategy='median')

In [None]:
# pipelines for categorical columns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline

def _impute_then(step_name, encoder):
    """Return a two-step pipeline: most-frequent imputation, then `encoder`.

    `step_name` is the name given to the encoder step inside the pipeline.
    """
    return Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        (step_name, encoder),
    ])


# Low-cardinality columns: one dense indicator column per category;
# categories unseen during fit are ignored at transform time.
ohe_cols_transformer = _impute_then(
    'ohe',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Medium-cardinality columns: integer codes, with -1 for unseen categories.
ordinal_cols_transformer = _impute_then(
    'ordinal',
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# High-cardinality columns: target encoding with the encoder's defaults.
high_cardinality_cols_transformer = _impute_then('target', TargetEncoder())

In [None]:
# bundle all the pipelines into a ColumnTransformer
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own transformer.
# Each entry is (name, transformer, columns it is applied to).
column_routes = [
    ('num', num_cols_transformer, numerical_cols),
    ('ohe', ohe_cols_transformer, ohe_cols),
    ('ord', ordinal_cols_transformer, ordinal_cols),
    ('hcc', high_cardinality_cols_transformer, high_cardinality_cols),
]

preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# prepare the data for pytorch
import torch
from sklearn.preprocessing import StandardScaler

# Fit the preprocessing on the training features (the target is passed as well
# because the target encoder needs it).
# The result is stored under a new name instead of overwriting X_train:
# the preprocessor selects columns by name and therefore needs the DataFrame,
# so overwriting X_train with the returned array made this cell (and every
# earlier cell that reads X_train) fail when run a second time.
X_train_processed = preprocessor.fit_transform(X_train, y_train)

# Standardise the encoded features; the fitted scaler is reused on the test set.
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train_processed)

# Features as float32, labels as int64 (long) so they can be used as class ids.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

In [None]:
# Load the competition's test file and give it the same engineered feature
# as the training data.
titanic_test = pd.read_csv('/kaggle/input/titanic/test.csv')
titanic_test['FamMembers'] = titanic_test['SibSp'].add(titanic_test['Parch'])

# Apply the already-fitted preprocessor and scaler (transform only, nothing is
# re-fitted on the test data), then convert to a float32 tensor.
X_test = preprocessor.transform(titanic_test.drop(columns='PassengerId'))
X_test_scaled = scaler_X.transform(X_test)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

In [None]:
# K Nearest Neighbors (k = 3), implemented directly with tensor operations.

# Pairwise distances between every test point and every training point.
distances = X_test_tensor.new_empty(0)  # placeholder replaced on the next line
distances = torch.cdist(X_test_tensor, X_train_tensor)

# For each test point, the indices of its 3 closest training points
# (largest=False selects the smallest distances).
_, k_indices = distances.topk(k=3, largest=False)

# Look up the training labels of those neighbours.
k_nearest_labels = y_train_tensor[k_indices]

# Majority vote: the most frequent label among each test point's neighbours.
predictions = k_nearest_labels.mode(dim=1).values

In [None]:
# Build the submission table: one row per test passenger, holding its
# PassengerId and the predicted Survived label, then write it without the index.
submission = titanic_test[['PassengerId']].assign(Survived=predictions.numpy())

submission.to_csv('submission.csv', index=False)