# Titanic

<table align="left">
  <td>
    <a href="#" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
  <td>
    <a target="_blank" href="#"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" /></a>
  </td>
</table>

In [45]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, \
                          recall_score, f1_score

## Downloading data

In [4]:
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [7]:
data_path = 'data'
data_folder = os.path.join(data_path, 'titanic.zip')

if not(os.path.exists(data_path)):
    ! mkdir data
    ! cd data && kaggle competitions download -c titanic
    file = zipfile.ZipFile(data_folder)
    file.extractall(data_path)
    ! rm {data_folder}

## Looking for data

In [8]:
def load_data(path):
    file_path = os.path.join('data', path)
    return pd.read_csv(file_path)

start_train = load_data('train.csv')
start_test = load_data('test.csv')
submission_data = load_data('gender_submission.csv')

In [9]:
start_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
start_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
submission_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [12]:
start_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
start_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [14]:
len(start_train['Ticket'].values)

891

In [15]:
len(set(start_train['Cabin'].values))

148

In [16]:
set(start_train['Embarked'].values)

{'C', 'Q', 'S', nan}

## Preparing data

In [17]:
X_train = start_train.drop(['Name', 'Ticket', 'Survived', 'PassengerId', 'Cabin'], axis=1)
y_train = start_train['Survived']

In [18]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [23]:
obj_columns = ['Sex', 'Embarked']
num_columns = list(X_train.drop(obj_columns, axis=1).columns)

In [24]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

obj_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

In [25]:
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_columns),
    ('obj', obj_pipeline, obj_columns)
])

X_train_prepared = full_pipeline.fit_transform(X_train)

In [26]:
X_train_prepared.shape

(891, 10)

## Training data

In [27]:
some_label = y_train.iloc[:5]
some_data_prepared = X_train_prepared[:5]

In [28]:
kng_clf = KNeighborsClassifier()

kng_clf.fit(X_train_prepared, y_train)

KNeighborsClassifier()

In [29]:
print('Predictions: ', kng_clf.predict(some_data_prepared))
print('Labels: ', list(some_label))

Predictions:  [0 1 1 1 0]
Labels:  [0, 1, 1, 1, 0]


In [39]:
y_train_predictions = cross_val_predict(kng_clf, X_train_prepared, y_train, cv=3)

In [40]:
conf_matrix = confusion_matrix(y_train, y_train_predictions)
conf_matrix

array([[464,  85],
       [104, 238]])

In [42]:
precision_score(y_train, y_train_predictions, average='weighted')

0.7861715749039692

In [43]:
recall_score(y_train, y_train_predictions, average='weighted')

0.7878787878787878

In [44]:
f1_score(y_train, y_train_predictions, average='weighted')

0.7866525596158541

In [46]:
grid_params = [
    {'n_neighbors': [4, 10, 20, 100], 'weights': ['uniform', 'distance']}
]

grid_search = GridSearchCV(kng_clf, grid_params, cv=3, 
                          scoring='f1_weighted')
grid_search.fit(X_train_prepared, y_train)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid=[{'n_neighbors': [4, 10, 20, 100],
                          'weights': ['uniform', 'distance']}],
             scoring='f1_weighted')

In [47]:
grid_search.best_estimator_

KNeighborsClassifier(n_neighbors=20)

In [48]:
grid_search.best_score_

0.8107903214752342

## Evaluate model

In [51]:
test_data = start_test.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis=1)

In [53]:
test_data_prepared = full_pipeline.fit_transform(test_data)
result_predictions = kng_clf.predict(test_data_prepared)

In [81]:
result = pd.DataFrame(np.array([start_test['PassengerId'].values.tolist(), result_predictions]).T, columns=["PassengerId", "Survived"])
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,1
414,1306,1
415,1307,0
416,1308,1


In [83]:
result.to_csv('result.csv', index=False)