In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [49]:
path_to_data = Path("../data")
path_to_models = Path("../models")
path_to_codegen = path_to_models / "codegen"
df_train = pd.read_csv(path_to_data / "train.csv")
df_test = pd.read_csv(path_to_data / "test.csv")

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings('ignore')

In [6]:
dropped_row_subset = ["Embarked"]
train = df_train.dropna(subset=dropped_row_subset)
x_train = train.drop(columns=["Survived"])
y_train = train["Survived"]

x_test = df_test

In [7]:
class OneHotEncoding(BaseEstimator, TransformerMixin):
    
    def __init__(self, column_names=[]):
        self.column_names = column_names

    def transform(self, df, y=None):
        return pd.get_dummies(df, columns=self.column_names)

    def fit(self, df, y=None):
        return self
    
class DropColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, column_names=[]):
        self.column_names = column_names

    def transform(self, df, y=None):
        return df.drop(self.column_names, axis=1)

    def fit(self, df, y=None):
        return self
    
class ColumnExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self, column_names=[]):
        self.column_names = column_names

    def transform(self, df, y=None):
        return df.loc[:, self.column_names]

    def fit(self, df, y=None):
        return self
    
class SexBinarizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, column_names=[]):
        pass

    def transform(self, df, y=None):
        df.loc[:, "Sex"] = df.loc[:, "Sex"].map({"male": 0, "female": 1})
        return df

    def fit(self, df, y=None):
        return self
    
class FeatureNormalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, column_names=[]):
        self.column_names = column_names
        self.min_max_scalar = MinMaxScaler()

    def transform(self, df, y=None):
        df.loc[:, self.column_names] = self.min_max_scalar.transform(df[self.column_names].as_matrix())
        return df

    def fit(self, df, y=None):
        self.min_max_scalar.fit(df[self.column_names].as_matrix())
        return self
    
class FillNa(BaseEstimator, TransformerMixin):
    
    def __init__(self, method="mean", d=None):
        if d is not None:
            method = "dict"
        self.d = None
        self.method = method

    def transform(self, df, y=None):
        if self.method == "zeros":
            df.fillna(0)
        elif self.method == "mean":
            df.fillna(df.mean(), inplace=True)
        elif self.method == "dict":
            df.fillna(df.map(self.d))
        else:
            raise ValueError("Method should be 'mean' or 'zeros'")
        return df

    def fit(self, df, y=None):
        return self
    
class AddTwoCategoricalVariables(BaseEstimator, TransformerMixin):
    def __init__(self, column_1, column_2):
        self.column_1 = column_1
        self.column_2 = column_2
    
    def transform(self, df):
        df[self.column_1 + "_" + self.column_2] = (df[self.column_1].astype(float) +
                    (len(df[self.column_1].unique()) * (df[self.column_2].astype(float)))).astype("category")
        return df
    
    def fit(self, df, y=None):
        return self
    
class Numerical2Categorical(BaseEstimator, TransformerMixin):
    def __init__(self, column, ranges, labels):
        self.column = column
        self.ranges = ranges
        self.labels = labels
        
    def transform(self, df):
        df.loc[:, self.column + "_" + "cat"] = (pd.cut(df.loc[:, self.column], self.ranges, labels=self.labels))
        return df
    
    def fit(self, df, y=None):
        return self

In [45]:
from sklearn_porter import Porter

## Try different classifiers after the same pipeline

In [62]:
features = ["Fare", "Pclass", "Sex", "Age", "SibSp", "Parch"]
normalize_features = ["Fare", "SibSp", "Parch", "Age"]
age_range = [0, 15, 35, 50, 80]
age_label = [0, 1, 2, 3]

def pipeline_factory(estimator):
    return Pipeline([
        ("column_extractor", ColumnExtractor(features)),
        ("fill_na", FillNa("mean")),
        ("sex_binarizer", SexBinarizer()),
        ("feature_normalizer", FeatureNormalizer(normalize_features)),
        ("clf", estimator)
    ])

def get_clf(pipeline):
    return pipeline.best_estimator_.named_steps["clf"]

In [74]:
p = Pipeline([
    ("column_extractor", ColumnExtractor(features)),
    ("fill_na", FillNa("mean")),
    ("sex_binarizer", SexBinarizer()),
    ("feature_normalizer", FeatureNormalizer(normalize_features))
])
p.fit(x_train, y_train)
p.transform(x_test)

Unnamed: 0,Fare,Pclass,Sex,Age,SibSp,Parch
0,0.015282,3,0,0.428248,0.000,0.000000
1,0.013663,3,1,0.585323,0.125,0.000000
2,0.018909,2,0,0.773813,0.000,0.000000
3,0.016908,3,0,0.334004,0.000,0.000000
4,0.023984,3,1,0.271174,0.125,0.166667
...,...,...,...,...,...,...
413,0.015713,3,0,0.375127,0.000,0.000000
414,0.212559,1,1,0.484795,0.000,0.000000
415,0.014151,3,0,0.478512,0.000,0.000000
416,0.015713,3,0,0.375127,0.000,0.000000


## RandomForest

In [52]:
rf_params = {
    "clf__n_estimators": randint(100, 500),
    "clf__criterion": ["gini", "entropy"],
    "clf__max_depth": randint(3, 10),
    "clf__max_features": ["auto", "sqrt", "log2", None],
    "clf__bootstrap": [True],
    "clf__random_state": [30],
    "clf__warm_start": [True]
}

best_rf = RandomizedSearchCV(
    pipeline_factory(RandomForestClassifier()), rf_params,
    random_state=30, n_iter=100, cv=5, verbose=1, n_jobs=-1).fit(x_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.2min finished


In [53]:
best_rf.best_score_

0.8335555132355742

In [63]:
get_clf(best_rf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=272,
                       n_jobs=None, oob_score=False, random_state=30, verbose=0,
                       warm_start=True)

In [67]:
with open(path_to_models / "RandomForestClassifier.java", "w") as f:
    porter = Porter(get_clf(best_rf), language='java')
    f.write(porter.export(embed_data=True))

## SVC

In [69]:
svc_params = {
    "clf__C": uniform(0.5, 1.5),
    "clf__degree": randint(3, 7),
    "clf__kernel": ["rbf", "linear", "poly", "sigmoid"],
    "clf__random_state": [30]
}

best_svc = RandomizedSearchCV(
    pipeline_factory(SVC()), svc_params,
    random_state=30, n_iter=1000, cv=5, verbose=1, n_jobs=-1).fit(x_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 852 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 2452 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 4440 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  1.6min finished


In [61]:
best_svc.best_score_

0.8087856281343235

In [65]:
get_clf(best_svc)

SVC(C=4.109723262582725, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=4, gamma='scale',
    kernel='poly', max_iter=-1, probability=False, random_state=30,
    shrinking=True, tol=0.001, verbose=False)

In [68]:
with open(path_to_models / "SupportBectorMachineClassifier.java", "w") as f:
    porter = Porter(get_clf(best_svc), language='java')
    f.write(porter.export(embed_data=True))