## Titanic classifier

In [0]:
import numpy as np
import pandas as pd
import os

In [0]:
from google.colab import drive
from joblib import dump, load

drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


### Load Data

In [0]:
!mkdir -p ~/.kaggle
!cp gdrive/My\ Drive/kaggle/kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
!pip install kaggle



In [0]:
path = os.path.join('gdrive/"My Drive"/titanic-classifier', 'data', 'raw')
train_file, test_file = "train.csv", "test.csv"
path_train, path_test = os.path.join(path, train_file), os.path.join(path, test_file)

!cp {path_train} ~/../content/{train_file}
!cp {path_test} ~/../content/{test_file}

# dataset has been downloaded beforehand
# kaggle competitions download -c titanic -f {train_file} -p {path}
# kaggle competitions download -c titanic -f {test_file} -p {path}

### Analyze Data

In [0]:
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [0]:
x_train, y_train, x_test = train.drop('Survived', axis=1), train['Survived'], test

In [0]:
x_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Which features have missing values?

In [0]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [0]:
x_train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


### Categorical/numerical features

In [0]:
x_train['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

*Pclass* is categorical

In [0]:
x_train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

*SibSp* is numerical and in $\mathbb{N}$

In [0]:
x_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

| Categorical Features | Numerical Features| Other |
| ------------- | ------------- | ------------- |
| Pclass | PassengerId | Name (Nominal) |
| Sex | Age | 
| Embarked | SibSp |
|  | Parch |
|  | Fare |





### Feature Engineering

In [0]:
def fe(train, test):
  ''' Adds Columns RelativesOnBoard, NameLen and AgeBucket '''
  train['RelativesOnBoard'] = train['Parch'] + train['SibSp']
  test['RelativesOnBoard'] = test['Parch'] + test['SibSp']

  train['NameLen'] = train['Name'].astype(str).map(len)
  test['NameLen'] = test['Name'].astype(str).map(len)

  #train["AgeBucket"] = train["Age"] // 15 * 15
  #test["AgeBucket"] = test["Age"] // 15 * 15
  
  return train, test

In [0]:
x_train_fe, x_test_fe = fe(x_train, x_test)

### Preprocessing

In [0]:
x_train_fe.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'RelativesOnBoard', 'NameLen'],
      dtype='object')

In [0]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# dropped: SibSp
numeric_features = ['Fare', 'Age', 'RelativesOnBoard', 'NameLen']
cat_features = ['Embarked', 'Sex', 'Pclass']

numeric_imp = Pipeline([('imputer', SimpleImputer(strategy='median')),
                              ('scaler', StandardScaler())])

cat_imp = Pipeline([('categorical', SimpleImputer(strategy='most_frequent')), 
                           ('one_hot', OneHotEncoder())])

preprocessor = ColumnTransformer(remainder='drop',
                                 transformers=[('numeric',
                                                numeric_imp,
                                                numeric_features),
                                               ('categorical',
                                                cat_imp,
                                                cat_features)])

x_train_tf = pd.DataFrame(preprocessor.fit_transform(x_train_fe))
x_test_tf = pd.DataFrame(preprocessor.transform(x_test_fe))

### Training

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train_tf, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [0]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(clf, x_train_tf, y_train, cv=10)
forest_scores.mean()

0.7431086142322098

### Predicting

In [0]:
def tosubmission():
  y_test = pd.DataFrame(clf.predict(x_test_tf))
  y_test['PassengerId'] = test[['PassengerId']]
  y_test.columns = ['Survived', 'PassengerId']
  y_test = y_test.reindex(columns=['PassengerId', 'Survived'])
  y_test = y_test.astype(int)
  y_test.to_csv('submission.csv', encoding='utf-8', index=False)

In [0]:
tosubmission()