In [28]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..') # Adds higher directory to python modules path.

from preprocessing.loading import load

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer

# todo check variable trap
# todo preprocess X_all for all columns 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
train_df, test_df = load()

train_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [30]:
# Loading data

train_df, test_df = load()

X_train = train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y_train = train_df[['Survived']]

X_test = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

X_all = pd.concat([X_train, X_test], axis=0)

# NaN values
print(train_df.isnull().sum())
print(test_df.isnull().sum())

# general data info
X_all.info()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 7 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1308 non-null float64
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 81.8+ KB


In [31]:
# Checking distribution of numerical values and presence of NaNs

print(X_train['Pclass'].value_counts(dropna=False).head(10))
print(X_train['Age'].value_counts(dropna=False).head(10))
print(X_train['SibSp'].value_counts(dropna=False).head(10))
print(X_train['Parch'].value_counts(dropna=False).head(10))
print(X_train['Fare'].value_counts(dropna=False).head(10))

3    491
1    216
2    184
Name: Pclass, dtype: int64
NaN     177
24.0     30
22.0     27
18.0     26
28.0     25
19.0     25
30.0     25
21.0     24
25.0     23
36.0     22
Name: Age, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
10.5000    24
7.9250     18
7.7750     16
26.5500    15
0.0000     15
Name: Fare, dtype: int64


In [32]:
# Sex column has to be encoded

print(X_train['Sex'].value_counts(dropna=False))

def encode(encoder:OneHotEncoder, data:pd.DataFrame, column:str):
    tmp_columns = pd.DataFrame(encoder.transform(data[[column]]), columns=encoder.get_feature_names([column]))
    data = data.drop(columns=column)
    data = pd.concat([data, tmp_columns], axis=1)
    return data

sex_ohe = OneHotEncoder(categories='auto', sparse=False, dtype=bool, drop='first').fit(X_all[['Sex']])
X_train = encode(sex_ohe, X_train, 'Sex')

print(X_train[sex_ohe.get_feature_names(['Sex'])[:]].eq(1).sum(axis=0))
X_train.head(5)

male      577
female    314
Name: Sex, dtype: int64
Sex_male    577
dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,3,22.0,1,0,7.25,S,True
1,1,38.0,1,0,71.2833,C,False
2,3,26.0,0,0,7.925,S,False
3,1,35.0,1,0,53.1,S,False
4,3,35.0,0,0,8.05,S,True


In [33]:
# We need to get rid of NaNs in Age field

def impute(imputer: SimpleImputer, data: pd.DataFrame, column: str):
    data[[column]] = pd.DataFrame(imputer.transform(data[[column]]), columns=[column])
    return data

age_imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1).fit(X_all[['Age']])
X_train = impute(age_imputer, X_train, 'Age')
X_all = impute(age_imputer, X_all, 'Age')

print(X_train['Age'].value_counts(dropna=False).head(10))

# Age should be a bucket

def bucket(discretizer: KBinsDiscretizer, data: pd.DataFrame, column: str, n_bins: int):    
    data[[column]] = pd.DataFrame(discretizer.transform(data[[column]]), columns=[column])
    return data

age_n_bins = 8
age_discretizer = KBinsDiscretizer(n_bins=age_n_bins, strategy='quantile', encode='ordinal').fit(X_all[['Age']])
X_train = bucket(age_discretizer, X_train, 'Age', age_n_bins)

X_train.head(5)

28.0    202
24.0     30
22.0     27
18.0     26
19.0     25
30.0     25
21.0     24
25.0     23
36.0     22
29.0     20
Name: Age, dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,3,2.0,1,0,7.25,S,True
1,1,6.0,1,0,71.2833,C,False
2,3,2.0,0,0,7.925,S,False
3,1,6.0,1,0,53.1,S,False
4,3,6.0,0,0,8.05,S,True


In [34]:
# Encoding Embarked data, removing NaNs

print(X_train['Embarked'].value_counts(dropna=False).head(10))

def encode_ordinal(encoder:OrdinalEncoder, data:pd.DataFrame, column:str):
    tmp_column = pd.DataFrame(encoder.transform(data[[column]]), columns=[column])    
    data[[column]] = tmp_column 
    return data

embark_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent', verbose=1).fit(X_all[['Embarked']])
X_train = impute(embark_imputer, X_train, 'Embarked')
X_all = impute(embark_imputer, X_all, 'Embarked')

embark_ohe = OrdinalEncoder(categories='auto').fit(X_all[['Embarked']])
X_train = encode_ordinal(embark_ohe, X_train, 'Embarked')

X_train.head(5)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,3,2.0,1,0,7.25,2.0,True
1,1,6.0,1,0,71.2833,0.0,False
2,3,2.0,0,0,7.925,2.0,False
3,1,6.0,1,0,53.1,2.0,False
4,3,6.0,0,0,8.05,2.0,True


In [35]:
# Fare should be a bucket

fare_n_bins = 6
fare_imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1).fit(X_all[['Fare']])
X_all = impute(fare_imputer, X_all, 'Fare')
X_train = impute(fare_imputer, X_train, 'Fare')

fare_discretizer = KBinsDiscretizer(n_bins=fare_n_bins, encode='ordinal').fit(X_all[['Fare']])
X_train = bucket(fare_discretizer, X_train, 'Fare', fare_n_bins)

X_train.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,3,2.0,1,0,0.0,2.0,True
1,1,6.0,1,0,5.0,0.0,False
2,3,2.0,0,0,1.0,2.0,False
3,1,6.0,1,0,5.0,2.0,False
4,3,6.0,0,0,1.0,2.0,True


In [None]:
# X_train['Family'] = X_train['SibSp'] + X_train['Parch']
# X_train = X_train.drop(columns=['SibSp', 'Parch'])
# 
# X_train.head(5)

In [36]:
# Processing test data

X_test = encode(sex_ohe, X_test, 'Sex')
X_test = impute(age_imputer, X_test, 'Age')
X_test = bucket(age_discretizer, X_test, 'Age', age_n_bins)
X_test = impute(embark_imputer, X_test, 'Embarked')
X_test = encode_ordinal(embark_ohe, X_test, 'Embarked')
X_test = impute(fare_imputer, X_test, 'Fare')
X_test = bucket(fare_discretizer, X_test, 'Fare', fare_n_bins)

# X_test['Family'] = X_test['SibSp'] + X_test['Parch']
# X_test = X_test.drop(columns=['SibSp', 'Parch'])

X_test.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_male
0,3,5.0,0,0,1.0,1.0,True
1,3,7.0,1,0,0.0,2.0,False
2,2,7.0,0,0,2.0,1.0,True
3,3,3.0,0,0,2.0,2.0,True
4,3,2.0,1,1,2.0,2.0,False


In [37]:
print(X_train.isnull().values.any())
print(X_test.isnull().values.any())

False
False


In [None]:
# No changes after scaling
# from sklearn.preprocessing import StandardScaler
# ss = StandardScaler()
# ss.fit(X_train)
# X_train = pd.DataFrame(ss.transform(X_train), columns = X_train.columns)
# X_test = pd.DataFrame(ss.transform(X_test), columns = X_test.columns)

X_train.head(5)

In [40]:
# interesting results
from sklearn.metrics import f1_score

from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier().fit(X_train, y_train)
print(clf_dt.score(X_train, y_train))
print(f1_score(clf_dt.predict(X_train), y_train))

correlation = pd.DataFrame()
correlation['feature'] = X_train.columns.values.tolist()
correlation['correlation'] = clf_dt.feature_importances_
correlation.sort_values(by=['correlation'], ascending=False, inplace=True)
print(correlation)

# best submission - 0.78947
from sklearn.svm import SVC
clf_svc = SVC().fit(X_train, y_train)
print(clf_svc.score(X_train, y_train))
print(f1_score(clf_svc.predict(X_train), y_train))

y_pred_test = clf_svc.predict(X_test)

0.9102132435465768
0.8722044728434505
    feature  correlation
6  Sex_male     0.396668
0    Pclass     0.148266
1       Age     0.143317
4      Fare     0.111246
2     SibSp     0.078916
3     Parch     0.078602
5  Embarked     0.042986
0.8383838383838383
0.7763975155279503


  y = column_or_1d(y, warn=True)


In [39]:
from sklearn.model_selection import GridSearchCV

param_grid = [  
  {'C' : [0.1, 1, 10, 100],    
   'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],   
   'gamma': ['scale', 'auto'],   
   }
 ]

clf = GridSearchCV(SVC(), param_grid, cv=2, n_jobs=-1, return_train_score=True)
clf.fit(X_train, y_train)
print(clf.best_estimator_)
print()
print(clf.best_score_)
print()
print(clf.best_params_)

  y = column_or_1d(y, warn=True)


SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

0.7991021324354658

{'C': 100, 'gamma': 'scale', 'kernel': 'poly'}


In [None]:
file = pd.DataFrame({
   'PassengerId':test_df['PassengerId'],
   'Survived': y_pred_test
})
file.to_csv('titanic.csv', index=False)
