In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

import joblib

import preprocessors as pp
import config

In [2]:
# Load the training set from the CSV location configured in `config`,
# then preview the first rows to sanity-check the columns.
data = pd.read_csv(filepath_or_buffer=config.TRAINING_DATA_FILE)
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,title
0,1,1,female,29.0,0,0,211.3375,B5,S,Miss
1,1,1,male,0.9167,1,2,151.55,C22,S,Master
2,1,0,female,2.0,1,2,151.55,C22,S,Miss
3,1,0,male,30.0,1,2,151.55,C22,S,Mr
4,1,0,female,25.0,1,2,151.55,C22,S,Mrs


In [3]:
# Hold out 20% of the rows for evaluation. The fixed `random_state`
# makes the split reproducible across runs.
feature_columns = config.NUMERICAL_VARS + config.CATEGORICAL_VARS
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data[config.TARGET],
    test_size=0.2,
    random_state=0,
)

In [4]:
# Step 1: fit the cabin first-letter extractor on the training rows and apply it.
# NOTE(review): `pp.ExtractFirstLetter` lives in the project's `preprocessors`
# module; presumably it keeps only the leading character of each cabin value
# -- confirm against that module.
# `X_train` is overwritten here, so re-running this cell alone feeds
# already-transformed data back into the transformer.
transformer1 = pp.ExtractFirstLetter(
    variables=config.CABIN,
).fit(X_train)
X_train = transformer1.transform(X_train)

X_train.head()

Unnamed: 0,age,fare,sex,cabin,embarked,title
1118,25.0,7.925,male,,S,Mr
44,41.0,134.5,female,E,C,Miss
1072,,7.7333,male,,Q,Mr
1130,18.0,7.775,female,,S,Miss
574,29.0,21.0,male,,S,Mr


In [5]:
# Step 2: fit the missing-value indicator on the numerical variables and apply it.
# NOTE(review): judging by the displayed result this adds `<var>_NA` flag
# columns (1 where the value was missing) -- confirm in `preprocessors`.
transformer2 = pp.MissingIndicator(
    variables=config.NUMERICAL_VARS,
).fit(X_train)
X_train = transformer2.transform(X_train)

X_train.head()

Unnamed: 0,age,fare,sex,cabin,embarked,title,age_NA,fare_NA
1118,25.0,7.925,male,,S,Mr,0,0
44,41.0,134.5,female,E,C,Miss,0,0
1072,,7.7333,male,,Q,Mr,1,0
1130,18.0,7.775,female,,S,Miss,0,0
574,29.0,21.0,male,,S,Mr,0,0


In [6]:
# Step 3: fit the categorical imputer on the categorical variables and apply it.
# NOTE(review): the displayed result suggests missing categories become the
# label "Missing" -- confirm in `preprocessors`.
transformer3 = pp.CategoricalImputer(
    variables=config.CATEGORICAL_VARS,
).fit(X_train)
X_train = transformer3.transform(X_train)

X_train.head()

Unnamed: 0,age,fare,sex,cabin,embarked,title,age_NA,fare_NA
1118,25.0,7.925,male,Missing,S,Mr,0,0
44,41.0,134.5,female,E,C,Miss,0,0
1072,,7.7333,male,Missing,Q,Mr,1,0
1130,18.0,7.775,female,Missing,S,Miss,0,0
574,29.0,21.0,male,Missing,S,Mr,0,0


In [7]:
# Step 4: fit the numerical imputer on the numerical variables and apply it.
# The fill values are learned from the training rows only (fit on `X_train`).
# NOTE(review): which statistic is used for filling is decided inside
# `preprocessors` -- check there before relying on it.
transformer4 = pp.NumericalImputer(
    variables=config.NUMERICAL_VARS,
).fit(X_train)
X_train = transformer4.transform(X_train)

X_train.head()

Unnamed: 0,age,fare,sex,cabin,embarked,title,age_NA,fare_NA
1118,25.0,7.925,male,Missing,S,Mr,0,0
44,41.0,134.5,female,E,C,Miss,0,0
1072,24.0,7.7333,male,Missing,Q,Mr,1,0
1130,18.0,7.775,female,Missing,S,Miss,0,0
574,29.0,21.0,male,Missing,S,Mr,0,0


In [8]:
# Step 5: fit the rare-label encoder on the categorical variables and apply it.
# NOTE(review): `tol=0.01` is presumably the minimum category frequency below
# which labels are grouped together -- confirm in `preprocessors`.
transformer5 = pp.RareLabelCategoricalEncoder(
    tol=0.01,
    variables=config.CATEGORICAL_VARS,
).fit(X_train)
X_train = transformer5.transform(X_train)

X_train.head()

Unnamed: 0,age,fare,sex,cabin,embarked,title,age_NA,fare_NA
1118,25.0,7.925,male,Missing,S,Mr,0,0
44,41.0,134.5,female,E,C,Miss,0,0
1072,24.0,7.7333,male,Missing,Q,Mr,1,0
1130,18.0,7.775,female,Missing,S,Miss,0,0
574,29.0,21.0,male,Missing,S,Mr,0,0


In [9]:
# Step 6: fit the categorical encoder and apply it. The target is passed to
# `fit`, so the learned encoding may depend on `y_train`.
# NOTE(review): the displayed result shows integer codes replacing the string
# labels; how codes are assigned is defined in `preprocessors` -- confirm there.
transformer6 = pp.CategoricalEncoder(
    variables=config.CATEGORICAL_VARS,
).fit(X_train, y=y_train)
X_train = transformer6.transform(X_train)

X_train.head()

Unnamed: 0,age,fare,sex,cabin,embarked,title,age_NA,fare_NA
1118,25.0,7.925,0,0,0,0,0,0
44,41.0,134.5,1,6,2,3,0,0
1072,24.0,7.7333,0,0,1,0,1,0
1130,18.0,7.775,1,0,0,3,0,0
574,29.0,21.0,0,0,0,0,0,0


In [10]:
# Step 7: standardise every column using statistics learned from the
# training rows. From here on `X_train` is the array returned by the scaler,
# as the displayed output shows.
transformer7 = StandardScaler()
transformer7.fit(X_train)
X_train = transformer7.transform(X_train)

X_train

array([[-0.3026105 , -0.5046289 , -0.76104239, ..., -0.82090925,
        -0.49492069, -0.03091962],
       [ 0.95137259,  1.97157102,  1.31398725, ...,  0.99619854,
        -0.49492069, -0.03091962],
       [-0.38098444, -0.50837914, -0.76104239, ..., -0.82090925,
         2.02052574, -0.03091962],
       ...,
       [-0.38098444, -0.50837914,  1.31398725, ...,  0.99619854,
         2.02052574, -0.03091962],
       [-0.69448022,  0.05927758,  1.31398725, ...,  0.99619854,
        -0.49492069, -0.03091962],
       [ 0.2460071 , -0.35643838,  1.31398725, ...,  1.60190114,
        -0.49492069, -0.03091962]])

In [11]:
# Logistic regression classifier. `C` is scikit-learn's inverse regularisation
# strength, so 1 / 0.0005 (= 2000.0) means very weak regularisation.
# The fixed `random_state` keeps the fit reproducible.
model = LogisticRegression(
    C=1 / 0.0005,
    random_state=0,
)

# Left as the last expression so the fitted estimator's repr is displayed.
model.fit(X_train, y_train)

LogisticRegression(C=2000.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Accuracy on the same rows the model was fitted on (an optimistic estimate).
class_ = model.predict(X_train)
train_accuracy = accuracy_score(y_train, class_)
print('train accuracy: {}'.format(train_accuracy))
print()

train accuracy: 0.7965616045845272



In [13]:
print("--------------------------  TEST  ---------------------------")

# Replay the already-fitted preprocessing steps on the held-out rows, in the
# same order they were fitted on the training data. Nothing is re-fitted here.
# `X_test` is overwritten, so re-running this cell alone would transform
# already-transformed data.
fitted_steps = (
    transformer1,
    transformer2,
    transformer3,
    transformer4,
    transformer5,
    transformer6,
    transformer7,
)
for fitted_step in fitted_steps:
    X_test = fitted_step.transform(X_test)

# Score the model on data it has not seen during fitting.
pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print('test accuracy: {}'.format(test_accuracy))
print()

--------------------------  TEST  ---------------------------
test accuracy: 0.7786259541984732



In [14]:
# Sanity check: features, labels and predictions should share the same row count.
(X_test.shape, y_test.shape, pred.shape)

((262, 8), (262,), (262,))

In [15]:
# Display the test feature matrix; the recorded output shows it as an array.
X_test

array([[ 0.71625076, -0.50520014, -0.76104239, ..., -0.82090925,
        -0.49492069, -0.03091962],
       [-0.61610627, -0.24884131,  1.31398725, ...,  0.99619854,
        -0.49492069, -0.03091962],
       [ 1.02974653, -0.13146268, -0.76104239, ..., -0.82090925,
        -0.49492069, -0.03091962],
       ...,
       [ 0.08925922, -0.40534614, -0.76104239, ..., -0.82090925,
        -0.49492069, -0.03091962],
       [-0.22423656, -0.40534614, -0.76104239, ..., -0.82090925,
        -0.49492069, -0.03091962],
       [-0.38098444, -0.50218351, -0.76104239, ..., -0.82090925,
         2.02052574, -0.03091962]])