In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..') # Adds higher directory to python modules path.

from preprocessing.loading import load

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer

# TODO: check for the dummy-variable trap
# TODO: preprocess X_all for all columns

In [2]:
# Loading data

train_df, test_df = load()

# Single list of model inputs so the train and test selections cannot drift apart.
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# .copy() detaches the feature frames from train_df / test_df, so later column
# assignments neither write into the raw frames nor raise SettingWithCopyWarning.
X_train = train_df[FEATURE_COLUMNS].copy()
y_train = train_df[['Survived']]

X_test = test_df[FEATURE_COLUMNS].copy()

# ignore_index=True gives X_all unique row labels. The train and test frames
# both start their labels at 0, so without it X_all carries duplicate labels
# and any label-aligned assignment on it can copy train values into test rows.
X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)

X_train.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [3]:
# The Sex column holds strings and has to be encoded

sex_counts = X_train['Sex'].value_counts(dropna=False)
print(sex_counts)

def encode(encoder:OneHotEncoder, data:pd.DataFrame, column:str):
    """Replace ``column`` of ``data`` with the columns produced by a fitted encoder.

    Parameters
    ----------
    encoder : an already fitted encoder exposing ``transform`` and either
        ``get_feature_names_out`` or the older ``get_feature_names``.
        ``transform`` must return a dense 2-D array.
    data : frame containing ``column``. It is not modified.
    column : name of the column to encode.

    Returns
    -------
    A new DataFrame: ``data`` without ``column``, followed by the encoded
    columns, carrying the same row index as ``data``.
    """
    # Newer scikit-learn renamed get_feature_names to get_feature_names_out;
    # use whichever the installed version provides.
    feature_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names

    # Reuse data's index: concat(axis=1) aligns on row labels, so a default
    # RangeIndex here would add NaN-filled rows whenever data is not labelled 0..n-1.
    encoded = pd.DataFrame(
        encoder.transform(data[[column]]),
        columns=feature_names([column]),
        index=data.index,
    )
    remaining = data.drop(columns=column)
    return pd.concat([remaining, encoded], axis=1)

# Fit on train + test so every category present in either set is known;
# drop='first' keeps a single indicator column for the binary feature.
sex_ohe = OneHotEncoder(categories='auto', sparse=False, dtype=bool, drop='first')
sex_ohe.fit(X_all[['Sex']])
X_train = encode(sex_ohe, X_train, 'Sex')

# Sanity check: the indicator count should match the value counts printed above
sex_indicator_columns = sex_ohe.get_feature_names(['Sex'])
print(X_train[sex_indicator_columns].eq(1).sum(axis=0))
X_train.head(5)

male      577
female    314
Name: Sex, dtype: int64
Sex_male    577
dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,3,22.0,1,0,7.25,S,True
1,1,38.0,1,0,71.2833,C,False
2,3,26.0,0,0,7.925,S,False
3,1,35.0,1,0,53.1,S,False
4,3,35.0,0,0,8.05,S,True


In [4]:
# Inspect the ten most common values (NaN included) of each numerical column

numerical_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
for column_name in numerical_columns:
    print(X_train[column_name].value_counts(dropna=False).head(10))


3    491
1    216
2    184
Name: Pclass, dtype: int64
NaN     177
24.0     30
22.0     27
18.0     26
28.0     25
19.0     25
30.0     25
21.0     24
25.0     23
36.0     22
Name: Age, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
10.5000    24
7.9250     18
7.7750     16
26.5500    15
0.0000     15
Name: Fare, dtype: int64


In [5]:
# We need to get rid of NaNs in Age field

def impute(imputer: SimpleImputer, data: pd.DataFrame, column: str):
    """Return a copy of ``data`` whose ``column`` has been passed through a fitted imputer.

    Parameters
    ----------
    imputer : an already fitted object whose ``transform`` accepts a
        single-column frame and returns an array with one value per row.
    data : frame containing ``column``. It is not modified.
    column : name of the column to impute.

    Returns
    -------
    A new DataFrame with the same index and columns as ``data``.
    """
    imputed = data.copy()
    # Assign the raw array so values are written row by row. Wrapping it in a
    # DataFrame with a default RangeIndex would make pandas align on labels,
    # which scrambles values when data's index is not 0..n-1 or has duplicates.
    imputed[column] = np.ravel(imputer.transform(data[[column]]))
    return imputed

# The median is learned from train + test combined, then applied to both frames
age_imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1)
age_imputer.fit(X_all[['Age']])

X_train = impute(age_imputer, X_train, 'Age')
X_all = impute(age_imputer, X_all, 'Age')

# After imputation NaN should no longer appear among the most common values
age_counts = X_train['Age'].value_counts(dropna=False)
print(age_counts.head(10))

# Age should be bucketed into discrete bins

def bucket(discretizer: KBinsDiscretizer, data: pd.DataFrame, column: str, n_bins: int):
    """Return a copy of ``data`` whose ``column`` holds the discretizer's bin values.

    Parameters
    ----------
    discretizer : an already fitted object whose ``transform`` accepts a
        single-column frame and returns an array with one value per row
        (i.e. an ordinal encoding).
    data : frame containing ``column``. It is not modified.
    column : name of the column to discretize.
    n_bins : unused; kept so existing call sites keep working. The number of
        bins is whatever the discretizer was fitted with.

    Returns
    -------
    A new DataFrame with the same index and columns as ``data``.
    """
    bucketed = data.copy()
    # Assign the raw array so values are written row by row. Wrapping it in a
    # DataFrame with a default RangeIndex would make pandas align on labels,
    # which scrambles values when data's index is not 0..n-1 or has duplicates.
    bucketed[column] = np.ravel(discretizer.transform(data[[column]]))
    return bucketed

age_n_bins = 8

# Quantile bin edges are learned from the imputed Age column of train + test
age_discretizer = KBinsDiscretizer(n_bins=age_n_bins, strategy='quantile', encode='ordinal')
age_discretizer.fit(X_all[['Age']])

# Age now holds the ordinal bin produced by the discretizer instead of the raw value
X_train = bucket(age_discretizer, X_train, 'Age', age_n_bins)

X_train.head(5)

28.0    202
24.0     30
22.0     27
18.0     26
19.0     25
30.0     25
21.0     24
25.0     23
36.0     22
29.0     20
Name: Age, dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,3,2.0,1,0,7.25,S,True
1,1,6.0,1,0,71.2833,C,False
2,3,2.0,0,0,7.925,S,False
3,1,6.0,1,0,53.1,S,False
4,3,6.0,0,0,8.05,S,True


In [6]:
# Embarked: fill the missing values, then one-hot encode

embarked_counts = X_train['Embarked'].value_counts(dropna=False)
print(embarked_counts.head(10))

# Missing entries are replaced by the most frequent value found in X_all
embark_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent', verbose=1)
embark_imputer.fit(X_all[['Embarked']])
X_train = impute(embark_imputer, X_train, 'Embarked')
X_all = impute(embark_imputer, X_all, 'Embarked')

# One integer indicator column per category; no category is dropped here
embark_ohe = OneHotEncoder(categories='auto', sparse=False, dtype=int)
embark_ohe.fit(X_all[['Embarked']])
X_train = encode(embark_ohe, X_train, 'Embarked')

X_train.head(5)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,2.0,1,0,7.25,True,0,0,1
1,1,6.0,1,0,71.2833,False,1,0,0
2,3,2.0,0,0,7.925,False,0,0,1
3,1,6.0,1,0,53.1,False,0,0,1
4,3,6.0,0,0,8.05,True,0,0,1


In [7]:
# Fare: impute with the median, then discretize into ordinal bins

fare_n_bins = 6

fare_imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1)
fare_imputer.fit(X_all[['Fare']])
X_all = impute(fare_imputer, X_all, 'Fare')
X_train = impute(fare_imputer, X_train, 'Fare')

# No strategy is passed, so the discretizer's default binning strategy applies
fare_discretizer = KBinsDiscretizer(n_bins=fare_n_bins, encode='ordinal')
fare_discretizer.fit(X_all[['Fare']])
X_train = bucket(fare_discretizer, X_train, 'Fare', fare_n_bins)

X_train.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,2.0,1,0,0.0,True,0,0,1
1,1,6.0,1,0,5.0,False,1,0,0
2,3,2.0,0,0,1.0,False,0,0,1
3,1,6.0,1,0,5.0,False,0,0,1
4,3,6.0,0,0,1.0,True,0,0,1


In [8]:
# Processing test data

# Replay on X_test, in the same order, every fitted transformer applied to X_train
test_steps = (
    lambda frame: encode(sex_ohe, frame, 'Sex'),
    lambda frame: impute(age_imputer, frame, 'Age'),
    lambda frame: bucket(age_discretizer, frame, 'Age', age_n_bins),
    lambda frame: impute(embark_imputer, frame, 'Embarked'),
    lambda frame: encode(embark_ohe, frame, 'Embarked'),
    lambda frame: impute(fare_imputer, frame, 'Fare'),
    lambda frame: bucket(fare_discretizer, frame, 'Fare', fare_n_bins),
)
for apply_step in test_steps:
    X_test = apply_step(X_test)

X_test.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,5.0,0,0,1.0,True,0,1,0
1,3,7.0,1,0,0.0,False,0,0,1
2,2,7.0,0,0,2.0,True,0,1,0
3,3,3.0,0,0,2.0,True,0,0,1
4,3,2.0,1,1,2.0,False,0,0,1


In [9]:
# Both frames should be free of missing values after preprocessing
for features in (X_train, X_test):
    print(features.isnull().values.any())

False
False


In [10]:
# Final look at the preprocessed training features
X_train.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,2.0,1,0,0.0,True,0,0,1
1,1,6.0,1,0,5.0,False,1,0,0
2,3,2.0,0,0,1.0,False,0,0,1
3,1,6.0,1,0,5.0,False,0,0,1
4,3,6.0,0,0,1.0,True,0,0,1


In [11]:
# interesting results
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

# scikit-learn estimators expect a 1-D target; passing the single-column frame
# directly triggers a DataConversionWarning (column_or_1d).
y_train_1d = np.ravel(y_train)

# NOTE: every score below is measured on the training data itself, so it
# overstates how well the models generalise.
clf = LogisticRegression().fit(X_train, y_train_1d)
print(clf.score(X_train, y_train_1d))
# f1_score's signature is (y_true, y_pred)
print(f1_score(y_train_1d, clf.predict(X_train)))

# Fixed seed so the fitted tree, and therefore the submission, is reproducible
clf_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train_1d)
print(clf_dt.score(X_train, y_train_1d))
print(f1_score(y_train_1d, clf_dt.predict(X_train)))

# best score 0.9034792368125701

y_pred_test = clf_dt.predict(X_test)

0.7991021324354658
0.7267175572519083
0.9102132435465768
0.8722044728434505


  y = column_or_1d(y, warn=True)


In [None]:
# Submission frame: one row per test passenger with the decision-tree prediction
file = test_df[['PassengerId']].assign(Survived=y_pred_test)

# index=False keeps the row labels out of the CSV
file.to_csv('titanic.csv', index=False)
