## Feature engineering

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [119]:
# Project-relative folders: input CSVs and saved model artefacts.
path_to_data, path_to_models = Path("../data"), Path("../models")

# Read the training and test CSVs into DataFrames.
df_train = pd.read_csv(path_to_data.joinpath("train.csv"))
df_test = pd.read_csv(path_to_data.joinpath("test.csv"))

In [17]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from scipy.stats import uniform
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation / convergence warnings emitted by
# pandas and scikit-learn; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [44]:
# Rows with a missing value in any of these columns are removed from the
# training data before the features / target split.
dropped_row_subset = ["Embarked"]
train = df_train.dropna(subset=dropped_row_subset)

# Target vector and feature frame (everything except the target).
y_train = train["Survived"]
x_train = train.drop("Survived", axis=1)

# The test frame is used as-is (same object, no copy).
x_test = df_test

In [23]:
class OneHotEncoding(BaseEstimator, TransformerMixin):
    """One-hot encode selected columns with ``pd.get_dummies``.

    The output column layout is learned in ``fit`` and enforced in
    ``transform``. Without that, a frame that lacks one of the categories
    seen during fitting (a CV fold, the test set) or contains a new one
    would produce a different set of columns than the downstream estimator
    was trained on.

    Parameters
    ----------
    column_names : list of str
        Columns to replace by their dummy indicator columns.

    Attributes
    ----------
    columns_ : pandas.Index
        Column labels produced by encoding the frame passed to ``fit``.
    """

    # The default list is stored as-is and never mutated, so sharing it
    # between instances is harmless here.
    def __init__(self, column_names=[]):
        self.column_names = column_names

    def transform(self, df, y=None):
        """Return the encoded frame, aligned to the columns learned in ``fit``.

        Columns learned in ``fit`` but absent from the encoded frame are added
        and filled with 0; columns not seen in ``fit`` are dropped. If the
        transformer has not been fitted, the plain ``pd.get_dummies`` result
        is returned unchanged.
        """
        encoded = pd.get_dummies(df, columns=self.column_names)
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is None:
            # Not fitted: fall back to the stateless encoding.
            return encoded
        return encoded.reindex(columns=fitted_columns, fill_value=0)

    def fit(self, df, y=None):
        """Record the column layout obtained by encoding ``df``."""
        self.columns_ = pd.get_dummies(df, columns=self.column_names).columns
        return self
    
class DropColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the configured columns.

    Parameters
    ----------
    column_names : list of str
        Labels of the columns to drop.
    """

    def __init__(self, column_names=[]):
        self.column_names = column_names

    def fit(self, df, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return ``df`` without the configured columns."""
        remaining = df.drop(columns=self.column_names)
        return remaining
    
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns.

    Parameters
    ----------
    column_names : list of str
        Labels of the columns to select, in the order they should appear.
    """

    def __init__(self, column_names=[]):
        self.column_names = column_names

    def fit(self, df, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return the ``column_names`` selection of ``df`` (all rows)."""
        selection = df.loc[:, self.column_names]
        return selection
    
class SexBinarizer(BaseEstimator, TransformerMixin):
    """Map the ``"Sex"`` column from ``"male"`` / ``"female"`` to 0 / 1.

    Any other value in the column becomes NaN (``Series.map`` semantics).

    Parameters
    ----------
    column_names : list of str
        Accepted for signature parity with the sibling transformers; the
        transform always operates on the ``"Sex"`` column.
    """

    def __init__(self, column_names=[]):
        # Every constructor argument must be stored under its own name,
        # otherwise BaseEstimator.get_params() / clone() (used by
        # cross_val_score and the search CV classes) cannot read it back.
        self.column_names = column_names

    def transform(self, df, y=None):
        """Return a new frame with ``"Sex"`` encoded; ``df`` is left untouched."""
        # assign() builds a new frame, so running the transform twice on the
        # same input does not turn already-encoded values into NaN, and the
        # column gets the dtype of the mapped values.
        return df.assign(Sex=df["Sex"].map({"male": 0, "female": 1}))

    def fit(self, df, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self
    
class FeatureNormalizer(BaseEstimator, TransformerMixin):
    """Min-max scale a subset of columns, leaving the others unchanged.

    Parameters
    ----------
    column_names : list of str
        Columns to rescale with a ``MinMaxScaler`` fitted on the training data.

    Attributes
    ----------
    min_max_scalar : MinMaxScaler
        The underlying scaler; fitted by ``fit`` and applied by ``transform``.
    """

    def __init__(self, column_names=[]):
        self.column_names = column_names
        self.min_max_scalar = MinMaxScaler()

    def transform(self, df, y=None):
        """Return a copy of ``df`` whose ``column_names`` columns are scaled."""
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
        # supported way to get the underlying array.
        scaled = self.min_max_scalar.transform(df[self.column_names].to_numpy())
        out = df.copy()
        # Replace the columns instead of writing through .loc so that integer
        # columns can take the float results, and the input is not mutated.
        out[self.column_names] = scaled
        return out

    def fit(self, df, y=None):
        """Fit the scaler on the ``column_names`` columns of ``df``."""
        self.min_max_scalar.fit(df[self.column_names].to_numpy())
        return self
    
class FillNa(BaseEstimator, TransformerMixin):
    """Fill missing values with zeros, column means, or user-supplied values.

    Parameters
    ----------
    method : {"mean", "zeros", "dict"}
        Filling strategy. Forced to ``"dict"`` when ``d`` is given.
    d : dict or None
        Mapping ``column -> fill value`` used by the ``"dict"`` strategy.

    Attributes
    ----------
    means_ : pandas.Series or None
        Numeric column means of the frame passed to ``fit`` (``"mean"``
        strategy only, otherwise ``None``).
    """

    def __init__(self, method="mean", d=None):
        if d is not None:
            method = "dict"
        self.d = d
        self.method = method

    def transform(self, df, y=None):
        """Return a new frame with missing values filled; ``df`` is not modified.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported strategies, or if the
            ``"dict"`` strategy is selected without a mapping.
        """
        if self.method == "zeros":
            return df.fillna(0)
        if self.method == "mean":
            means = getattr(self, "means_", None)
            if means is None:
                # Not fitted: fall back to the means of the frame itself.
                # numeric_only avoids a TypeError on string columns.
                means = df.mean(numeric_only=True)
            return df.fillna(means)
        if self.method == "dict":
            if self.d is None:
                raise ValueError("Method 'dict' requires a mapping in 'd'")
            return df.fillna(value=self.d)
        raise ValueError("Method should be 'mean', 'zeros' or 'dict'")

    def fit(self, df, y=None):
        """Learn the column means when the ``"mean"`` strategy is selected."""
        # Learning the statistics here (rather than in transform) makes
        # validation / test data get filled with the training means.
        self.means_ = df.mean(numeric_only=True) if self.method == "mean" else None
        return self
    
class AddTwoCategoricalVariables(BaseEstimator, TransformerMixin):
    """Add a categorical column that combines two numerically coded columns.

    The new column is named ``"<column_1>_<column_2>"`` and holds
    ``column_1 + n_levels * column_2`` where ``n_levels`` is the number of
    distinct values of ``column_1``.
    NOTE(review): the codes are only collision-free if ``column_1`` takes
    values in ``0 .. n_levels - 1`` — confirm for the columns it is used with.

    Parameters
    ----------
    column_1, column_2 : str
        Names of the columns to combine; both must be castable to float.

    Attributes
    ----------
    n_levels_ : int
        Number of distinct values of ``column_1`` in the frame passed to ``fit``.
    """

    def __init__(self, column_1, column_2):
        self.column_1 = column_1
        self.column_2 = column_2

    def transform(self, df, y=None):
        """Return a copy of ``df`` with the combined column appended."""
        n_levels = getattr(self, "n_levels_", None)
        if n_levels is None:
            # Not fitted: fall back to counting levels in the frame itself.
            n_levels = len(df[self.column_1].unique())
        combined = (df[self.column_1].astype(float)
                    + n_levels * df[self.column_2].astype(float))
        out = df.copy()
        out[self.column_1 + "_" + self.column_2] = combined.astype("category")
        return out

    def fit(self, df, y=None):
        """Learn the number of distinct ``column_1`` values."""
        # Fixing the multiplier at fit time keeps the codes identical for any
        # later frame, even one in which some level of column_1 is absent.
        self.n_levels_ = len(df[self.column_1].unique())
        return self
    
class Numerical2Categorical(BaseEstimator, TransformerMixin):
    """Bin a numerical column with ``pd.cut`` into a new ``"<column>_cat"`` column.

    Parameters
    ----------
    column : str
        Name of the numerical column to bin.
    ranges : sequence of numbers
        Bin edges passed to ``pd.cut``.
    labels : sequence
        Labels for the resulting bins, passed to ``pd.cut``.
    """

    def __init__(self, column, ranges, labels):
        self.column = column
        self.ranges = ranges
        self.labels = labels

    def transform(self, df, y=None):
        """Return a new frame with the binned column added; ``df`` is not modified."""
        binned = pd.cut(df[self.column], self.ranges, labels=self.labels)
        # assign() returns a new frame instead of writing into the caller's one.
        return df.assign(**{self.column + "_" + "cat": binned})

    def fit(self, df, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

### Create a simple pipeline

In [126]:
# Columns selected by the pipelines' column-extractor step.
features = [
    "Fare",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
]
# Columns handed to the feature-normalizer step.
normalize_features = [
    "Fare",
    "SibSp",
    "Parch",
    "Age",
]
# Bin edges for Age and one integer label per resulting bin.
age_range = [0, 15, 35, 50, 80]
age_label = list(range(len(age_range) - 1))

In [35]:
# Baseline: select the raw columns, fill missing values, encode Sex,
# min-max scale the numeric columns, then fit a logistic regression.
pipeline = Pipeline([
    ("column_extractor", ColumnExtractor(features)),
    ("fill_na", FillNa("mean")),
    ("sex_binarizer", SexBinarizer()),
    ("feature_normalizer", FeatureNormalizer(normalize_features)),
    ("clf", LogisticRegression())
])

# 5-fold cross-validated accuracy, reported as mean ± standard deviation.
score = cross_val_score(pipeline, x_train, y_train, cv=5, scoring="accuracy")
print(f"score = {round(np.mean(score), 2)} ± {round(np.std(score), 2)}")

# Fit once on the full training set so that the saved coefficients and
# intercept come from the same fitted classifier.
fitted_clf = pipeline.fit(x_train, y_train)["clf"]
# Make sure the output folder exists before writing into it.
path_to_models.mkdir(parents=True, exist_ok=True)
np.savetxt(path_to_models / 'log_reg_1.txt', fitted_clf.coef_[0])
np.savetxt(path_to_models / 'log_reg_1_intercept.txt', fitted_clf.intercept_)

score = 0.78 ± 0.02


### Create a more complex pipeline

In [131]:
# Baseline preprocessing plus a binned age column and interaction codes
# built from Age_cat, Sex and Pclass.
# NOTE(review): "Age_cat_Sex_Pclass" is created but is not listed in the
# one-hot step, so it appears to reach the classifier as a single coded
# column — confirm this is intended.
interaction_steps = [
    ("column_extractor", ColumnExtractor(features)),
    ("fill_na", FillNa("mean")),
    ("sex_binarizer", SexBinarizer()),
    ("num2cat", Numerical2Categorical("Age", age_range, age_label)),
    ("add_age_sex", AddTwoCategoricalVariables("Age_cat", "Sex")),
    ("add_sex_class", AddTwoCategoricalVariables("Sex", "Pclass")),
    ("add_age_sex_class", AddTwoCategoricalVariables("Age_cat_Sex", "Pclass")),
    ("one_hot_encoding", OneHotEncoding(["Age_cat_Sex", "Sex_Pclass"])),
    ("drop_columns", DropColumns(["Age_cat"])),
    ("feature_normalizer", FeatureNormalizer(normalize_features)),
    ("clf", LogisticRegression()),
]
pipeline = Pipeline(interaction_steps)

# 5-fold cross-validated accuracy, reported as mean ± standard deviation.
score = cross_val_score(pipeline, x_train, y_train, scoring="accuracy", cv=5)
mean_accuracy, std_accuracy = np.mean(score), np.std(score)
print(f"score = {round(mean_accuracy, 2)} ± {round(std_accuracy, 2)}")

score = 0.82 ± 0.02


In [106]:
# Refit the pipeline on the full training set and display the coefficients
# of its "clf" step.
pipeline.fit(x_train, y_train)["clf"].coef_

array([[ 0.31696511, -0.52710365,  1.61328966, -0.01713247, -2.15033703,
        -0.82052986, -0.01839819,  1.40919714, -0.68651677, -0.55501586,
        -0.9245783 ,  0.47890263,  0.57717783, -0.06675035,  0.62395954,
         0.11580678,  1.25349155, -0.65823028,  1.01586486, -0.21449029,
        -0.65606675]])

### With hyperparameter tuning

In [133]:
# Search space for the "clf" pipeline step (the "clf__" prefix routes each
# key to that step's constructor parameter).
# NOTE(review): no solver is set, so LogisticRegression uses its default one.
# The default lbfgs solver presumably rejects 'l1' and 'elasticnet' (and
# 'elasticnet' additionally needs an l1_ratio), so those candidates likely
# fail and are scored as NaN — confirm against the installed scikit-learn
# version and either restrict the penalties or add a compatible solver.
params_clf = {
    'clf__class_weight': ['balanced', None],
    'clf__penalty': [None, 'l1', 'l2', 'elasticnet'],
    # C is drawn uniformly from [0, 4].
    'clf__C': uniform(loc=0, scale=4)
}

# Randomized search: 1000 sampled candidates, 5-fold CV each, all cores,
# fixed seed so the sampled candidates are reproducible.
model_space = RandomizedSearchCV(pipeline, params_clf, random_state=1, n_iter=1000, cv=5, verbose=1, n_jobs=-1)
# fit() returns the search object itself, so best_model and model_space
# refer to the same fitted search.
best_model = model_space.fit(x_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 2904 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 4704 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 4985 out of 5000 | elapsed:   56.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   57.0s finished


In [134]:
# Mean cross-validated score of the best candidate found by the search.
best_model.best_score_

0.8267948962102455

In [135]:
# Coefficient vector of the "clf" step of the best pipeline found by the search.
best_model.best_estimator_["clf"].coef_[0]

array([ 0.36772234, -0.49364316,  1.59740106, -0.01799704, -2.17914808,
       -0.75865159, -0.02730816,  1.44945167, -0.69666582, -0.5565676 ,
       -0.98665036,  0.46047189,  0.57770656, -0.04661061,  0.60583322,
        0.13012027,  1.16703162, -0.71158271,  1.03182892, -0.20896967,
       -0.60145948])

In [136]:
# Write the tuned classifier's coefficient vector to a text file.
# NOTE(review): unlike log_reg_1, no intercept file is written for this model —
# confirm whether the intercept is needed downstream.
np.savetxt(path_to_models / 'log_reg_2.txt', best_model.best_estimator_["clf"].coef_[0])