In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..') # Adds higher directory to python modules path.

from preprocessing.loading import load

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer

# todo check variable trap
# todo preprocess X_all for all columns 

In [36]:
# Loading data

train_df, test_df = load()

X_train = train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
y_train = train_df[['Survived']]

X_test = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]

X_all = pd.concat([X_train, X_test], axis=0)

X_train.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,,S
3,1,female,35.0,1,0,53.1,C123,S
4,3,male,35.0,0,0,8.05,,S


In [37]:
# Sex column has to be encoded

print(X_train['Sex'].value_counts(dropna=False))

def encode(encoder:OneHotEncoder, data:pd.DataFrame, column:str):
    tmp_columns = pd.DataFrame(encoder.transform(data[[column]]), columns=encoder.get_feature_names([column]))
    data = data.drop(columns=column)
    data = pd.concat([data, tmp_columns], axis=1)
    return data

sex_ohe = OneHotEncoder(categories='auto', sparse=False, dtype=bool).fit(X_all[['Sex']])
X_train = encode(sex_ohe, X_train, 'Sex')

print(X_train[sex_ohe.get_feature_names(['Sex'])[:]].eq(1).sum(axis=0))
X_train.head(5)

male      577
female    314
Name: Sex, dtype: int64
Sex_female    314
Sex_male      577
dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Sex_female,Sex_male
0,3,22.0,1,0,7.25,,S,False,True
1,1,38.0,1,0,71.2833,C85,C,True,False
2,3,26.0,0,0,7.925,,S,True,False
3,1,35.0,1,0,53.1,C123,S,True,False
4,3,35.0,0,0,8.05,,S,False,True


In [38]:
# Checking distribution of numerical values and presence of NaNs

print(X_train['Pclass'].value_counts(dropna=False).head(10))
print(X_train['Age'].value_counts(dropna=False).head(10))
print(X_train['SibSp'].value_counts(dropna=False).head(10))
print(X_train['Parch'].value_counts(dropna=False).head(10))
print(X_train['Fare'].value_counts(dropna=False).head(10))

# We need to get rid of NaNs in Age field

def impute(imputer: SimpleImputer, data: pd.DataFrame, column: str):
    return pd.DataFrame(imputer.transform(data), columns=[column])

age_imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1).fit(X_all[['Age']])
X_train['Age'] = impute(age_imputer, X_train[['Age']], 'Age')

print(X_train['Age'].value_counts(dropna=False).head(10))
X_train.head(5)

3    491
1    216
2    184
Name: Pclass, dtype: int64
NaN     177
24.0     30
22.0     27
18.0     26
28.0     25
19.0     25
30.0     25
21.0     24
25.0     23
36.0     22
Name: Age, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
10.5000    24
7.9250     18
7.7750     16
26.5500    15
0.0000     15
Name: Fare, dtype: int64
28.0    202
24.0     30
22.0     27
18.0     26
19.0     25
30.0     25
21.0     24
25.0     23
36.0     22
29.0     20
Name: Age, dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Sex_female,Sex_male
0,3,22.0,1,0,7.25,,S,False,True
1,1,38.0,1,0,71.2833,C85,C,True,False
2,3,26.0,0,0,7.925,,S,True,False
3,1,35.0,1,0,53.1,C123,S,True,False
4,3,35.0,0,0,8.05,,S,False,True


In [39]:
# In Cabin section we have NaNs also

print(X_train['Cabin'].value_counts(dropna=False).head(10))

cabin_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', verbose=1, fill_value='Missing-cabin').fit(X_all[['Cabin']])
X_train['Cabin'] = impute(cabin_imputer, X_train[['Cabin']], 'Cabin')

X_train.head(5)

NaN            687
C23 C25 C27      4
B96 B98          4
G6               4
C22 C26          3
E101             3
F2               3
F33              3
D                3
C124             2
Name: Cabin, dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Sex_female,Sex_male
0,3,22.0,1,0,7.25,Missing-cabin,S,False,True
1,1,38.0,1,0,71.2833,C85,C,True,False
2,3,26.0,0,0,7.925,Missing-cabin,S,True,False
3,1,35.0,1,0,53.1,C123,S,True,False
4,3,35.0,0,0,8.05,Missing-cabin,S,False,True


In [40]:
# Encoding Embarked data, removing NaNs

print(X_train['Embarked'].value_counts(dropna=False).head(10))

embark_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent', verbose=1).fit(X_all[['Embarked']])
X_train['Embarked'] = impute(embark_imputer, X_train[['Embarked']], 'Embarked')
X_all['Embarked'] = impute(embark_imputer, X_all[['Embarked']], 'Embarked')

embark_ohe = OneHotEncoder(categories='auto', sparse=False, dtype=int).fit(X_all[['Embarked']])
X_train = encode(embark_ohe, X_train, 'Embarked')

X_train.head(5)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,Missing-cabin,False,True,0,0,1
1,1,38.0,1,0,71.2833,C85,True,False,1,0,0
2,3,26.0,0,0,7.925,Missing-cabin,True,False,0,0,1
3,1,35.0,1,0,53.1,C123,True,False,0,0,1
4,3,35.0,0,0,8.05,Missing-cabin,False,True,0,0,1


In [41]:
# Processing test data

X_test = encode(sex_ohe, X_test, 'Sex')
X_test['Age'] = impute(age_imputer, X_train[['Age']], 'Age')
X_test['Embarked'] = impute(embark_imputer, X_test[['Embarked']], 'Embarked')
X_test = encode(embark_ohe, X_test, 'Embarked')
X_test['Cabin'] = impute(cabin_imputer, X_train[['Cabin']], 'Cabin')

fare_imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1).fit(X_all[['Fare']])
X_test['Fare'] = impute(age_imputer, X_train[['Fare']], 'Fare')


In [42]:
X_test['Fare'].isnull().values.any()

False

In [48]:
# interesting results
# X_train = X_train.drop(columns='Cabin')
# X_test = X_test.drop(columns='Cabin')

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(X_train, y_train)
print(clf.score(X_train, y_train))

from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier().fit(X_train, y_train)
print(clf_dt.score(X_train, y_train))

y_pred_test = clf_dt.predict(X_test)

0.8024691358024691
0.9797979797979798


  y = column_or_1d(y, warn=True)


In [53]:
file = pd.DataFrame({
   'PassengerId':test_df['PassengerId'],
   'Survived': y_pred_test
})
file.to_csv('titanic.csv', index=False)
