# Pipeline implementation

In this notebook, I explore scikit-learn's `Pipeline` to implement the entire workflow of this project. Wrapping every step in a single pipeline makes it easy to cross-validate each model and compare results.

In [None]:
import logging

import pandas as pd

# preprocessing and feature selection
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedShuffleSplit

# sklearn models
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

In [None]:
def get_metafeatures(df):
    """
    Summarise every column of a dataframe.

    For each column the summary records its name ('column'), the number of
    distinct non-null values ('n_unique'), the fraction of rows that are
    null ('missing') and its dtype ('type').

    Returns a metadata DataFrame with one row per column of `df`.
    """
    logging.info('Creating metafeatures dataframe.')
    n_rows = df.shape[0]

    def describe(name):
        # Build the metadata record for a single column.
        series = df[name]
        return {'column': name,
                'n_unique': series.nunique(),
                'missing': series.isnull().sum()*1.0/n_rows,
                'type': series.dtype}

    return pd.DataFrame([describe(name) for name in df.columns])

def le_columns(df, columns):
    """Label-encode the given categorical columns.

    Every listed column is replaced by the integer codes produced by
    sklearn's ``LabelEncoder``, fitted on that column alone.

    Args:
    -----
    df: dataframe
    columns: list of categorical columns

    Returns:
    --------
    A copy of ``df`` with the listed columns label-encoded. The dataframe
    passed in is left unmodified.
    """
    logging.info('LabelEncoder for %s columns', len(columns))
    # Work on a copy so the caller's frame (e.g. the raw test set that is
    # later used to build the submission ids) is not silently overwritten.
    df = df.copy()
    le = LabelEncoder()
    for col in columns:
        # Replace the whole column rather than writing through
        # ``df.loc[:, col]``: recent pandas versions set values in place on
        # that path, which keeps the original object dtype and would leave
        # the feature matrix non-numeric.
        df[col] = le.fit_transform(df[col])
    return df

def get_cols(df):
    """Pick out the categorical columns and the columns that are mostly empty.

    Returns a tuple ``(categorical_columns, cols_to_drop)``: the names of the
    object-dtype columns, and the names of the columns where more than half
    of the rows are missing.
    """
    meta = get_metafeatures(df)
    names = meta['column']
    is_object = meta['type'] == 'object'
    mostly_missing = meta['missing'] > 0.5
    return names[is_object].tolist(), names[mostly_missing].tolist()

def get_X_y(df):
    """Build the feature matrix and target vector from a raw dataframe.

    Categorical (object) columns are label-encoded; the identifier columns
    ('id', 'country'), the target ('poor') and every column with more than
    50% missing values are left out of the features. Missing values are NOT
    imputed here -- that is left to the imputer step of the model pipelines.

    Returns:
    --------
    X: 2-D numpy array of features, columns in the same order as in ``df``
    y: 1-D numpy array with the 'poor' column, or None when ``df`` has no
       'poor' column (e.g. the test set)
    """
    cat_cols, cols_to_drop = get_cols(df)
    df = df.pipe(le_columns, cat_cols)

    # Keep the dataframe's own column order. Indexing with a set gives an
    # arbitrary order that may differ between the train and test calls, and
    # set indexers are rejected by recent pandas versions.
    excluded = {'id', 'country', 'poor'}.union(cols_to_drop)
    feature_columns = [col for col in df.columns if col not in excluded]

    # ``.values`` works on every pandas version; ``.as_matrix()`` was removed.
    X = df.loc[:, feature_columns].values
    # Explicit membership test instead of a bare ``except`` so that genuine
    # errors are not swallowed.
    y = df['poor'].values if 'poor' in df.columns else None
    return X, y

#### Random Forests

In [None]:
# Random forest pipeline: impute -> L1-based feature selection -> classifier.
# Seeds are fixed so that a fresh "Restart & Run All" reproduces the same
# grid-search scores.
rf_pipe = Pipeline([
    ('imputer', Imputer()),
    # The solver is named explicitly: an L1 penalty is only supported by
    # some solvers (liblinear among them), so relying on the library default
    # breaks on versions whose default solver does not handle L1.
    ('feature_selection', SelectFromModel(
        LogisticRegression(penalty="l1", solver='liblinear', random_state=2))),
    ('clf', RandomForestClassifier(random_state=2))
])

# Hyper-parameters explored by the grid search.
rf_param_grid = [
    {
        'imputer__strategy': ['mean', 'median'],
        'clf__n_estimators': [50, 100],
        'clf__class_weight': ['balanced_subsample', 'balanced']
    }
]

# 10-fold cross-validated search, scored by (negated) log loss.
rf_grid = GridSearchCV(rf_pipe, cv=10, n_jobs=-1, param_grid=rf_param_grid, scoring='neg_log_loss')

#### Logistic Regression

In [None]:
# Logistic regression pipeline: impute -> classifier.
lr_pipe = Pipeline([
    ('imputer', Imputer()),
    # The grid below only searches the L1 penalty, so the solver is pinned
    # to liblinear, which supports it; the library default solver is not
    # guaranteed to. The seed keeps the fit reproducible.
    ('clf', LogisticRegression(solver='liblinear', random_state=2))
])

# Hyper-parameters explored by the grid search.
lr_param_grid = [
    {
        'imputer__strategy': ['mean', 'median'],
        'clf__penalty': ['l1'],
        'clf__C': [0.01, 0.1, 1, 10]
    }
]

# 10-fold cross-validated search, scored by (negated) log loss.
lr_grid = GridSearchCV(lr_pipe, cv=10, n_jobs=-1, param_grid=lr_param_grid, scoring='neg_log_loss')

#### Extra Trees

In [None]:
# Extra-trees pipeline: impute -> classifier.
etc_pipe = Pipeline([
    ('imputer', Imputer()),
    # Extra trees are randomised; a fixed seed makes the grid-search scores
    # reproducible across kernel restarts.
    ('clf', ExtraTreesClassifier(random_state=2))
])

# Hyper-parameters explored by the grid search.
etc_param_grid = [
    {
        'imputer__strategy': ['mean', 'median'],
        'clf__criterion': ['entropy', 'gini'],
        'clf__n_estimators': [50, 100],
        'clf__class_weight': ['balanced_subsample', 'balanced']
    }
]

# 10-fold cross-validated search, scored by (negated) log loss.
etc_grid = GridSearchCV(etc_pipe, cv=10, n_jobs=-1, param_grid=etc_param_grid, scoring='neg_log_loss')

#### XGBoost Classifier

In [None]:
# XGBoost pipeline: impute missing values, then fit the boosted classifier.
xgb_pipe = Pipeline(steps=[
    ('imputer', Imputer()),
    ('clf', XGBClassifier()),
])

# Only the imputation strategy is searched; the objective is held fixed.
xgb_param_grid = [
    dict(
        imputer__strategy=['mean', 'median'],
        clf__objective=['binary:logistic'],
    )
]

# 10-fold cross-validated search on all cores, scored by (negated) log loss.
xgb_grid = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=xgb_param_grid,
    scoring='neg_log_loss',
    cv=10,
    n_jobs=-1,
)

### Run tests - train classifier and predict

In [None]:
def local_cv_score(country, grid):
    """Estimate the log loss of `grid` on a held-out 20% of a country's training data.

    The country's training file is split once (stratified, 80-20, fixed
    seed); the grid search is fitted on the 80% part and its best estimator
    is scored on the remaining 20%.

    Returns the log loss on the held-out part.
    """
    train_path = '../data/raw/{}_hhold_train.csv'.format(country)
    X, y = get_X_y(pd.read_csv(train_path))

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=2)
    # n_splits=1, so the first split is the only one.
    fit_idx, holdout_idx = next(splitter.split(X, y))

    grid.fit(X[fit_idx], y[fit_idx])

    # Probability of the positive class for the held-out rows.
    holdout_proba = grid.best_estimator_.predict_proba(X[holdout_idx])[:, 1]
    return log_loss(y[holdout_idx], holdout_proba)

In [None]:
def get_predictions(country, grid):
    """Fit `grid` on a country's training data and predict on its test data.

    Train and test features are built in a single pass over the stacked
    frames. Building them separately would refit the label encoders and
    recompute the "mostly missing" columns on the test set alone, so the
    same category could receive different integer codes, and the feature
    columns could differ, between training and prediction.

    Args:
    -----
    country: country code used in the data file names (e.g. 'A')
    grid: an unfitted grid search (or any estimator exposing ``fit`` and
        ``best_estimator_``); it is fitted in place

    Returns:
    --------
    preds: class-probability array from ``predict_proba`` for the test rows
    test: the test dataframe as read from disk
    """
    train = pd.read_csv('../data/raw/{}_hhold_train.csv'.format(country))
    test = pd.read_csv('../data/raw/{}_hhold_test.csv'.format(country))

    # Stack train (without the target) on top of test so that encoding and
    # column selection are decided once, for both sets together. Only
    # category vocabularies and missing-value rates are shared, never labels.
    n_train = len(train)
    combined = pd.concat([train.drop('poor', axis=1), test], ignore_index=True)
    X_all, _ = get_X_y(combined)
    X, X_test = X_all[:n_train], X_all[n_train:]

    # The target only exists in the training frame.
    _, y = get_X_y(train)

    grid.fit(X, y)
    preds = grid.best_estimator_.predict_proba(X_test)
    return preds, test

def make_subs(preds, test_feat, country):
    """Assemble the submission frame for one country.

    Takes the second column of `preds` as the 'poor' probability, labels
    every row with `country` (needed to join the countries later) and uses
    the 'id' column of `test_feat` as the row identifier.

    Returns a DataFrame with the columns 'id', 'country' and 'poor'.
    """
    return (
        pd.DataFrame({'poor': preds[:, 1]}, index=test_feat['id'])
        .assign(country=country)
        .loc[:, ['country', 'poor']]
        .reset_index()
    )

def main(country, grid):
    """Run the full flow for one country: fit, predict, build the submission frame."""
    predictions, test_frame = get_predictions(country, grid)
    submission = make_subs(predictions, test_frame, country)
    return submission

In [None]:
# Fit and predict each country in turn with the XGBoost grid, then stack the
# per-country frames into a single submission file.
sub_a, sub_b, sub_c = (main(code, xgb_grid) for code in ('A', 'B', 'C'))
submissions = pd.concat([sub_a, sub_b, sub_c])
submissions.to_csv('../data/processed/xgb_submission.csv', index=False)