In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
import numpy as np

from task_substitution.data import *
from task_substitution.model import *

In [None]:
# default_exp recover_missing

# Runner

> Class that takes dataset and model arguments and recovers the missing values of a feature.

In [None]:
#hide
from nbdev.showdoc import *

### Runner Class

In [None]:
#export
class RecoverMissing:
    """Recover missing values for a feature using task substitution.

    The feature with missing values is used as the target of a model: the
    rows returned as "train" by `Dataset.split_train_test` are used to fit
    the model and its predictions on the "test" rows fill in the gaps.

    Parameters
    ----------
    missing_fld : str
        Name of the column whose missing values should be recovered.
    cat_flds : list, optional
        Forwarded to `Dataset` as `cat_flds` (presumably the categorical
        columns to label encode -- confirm against `Dataset`).
    ignore_flds : list, optional
        Forwarded to `Dataset` as `ignore_flds`.
    **model_args
        Must contain `perf_fn`, which is handed to `Model.cv`; every other
        entry is forwarded unchanged to `Model(...)`.
    """
    def __init__(self, missing_fld:str, cat_flds:list=None, ignore_flds:list=None, **model_args):
        self.dataset_args = {'missing_fld': missing_fld,
                             'cat_flds': cat_flds,
                             'ignore_flds': ignore_flds
                            }
        self.model_args = model_args

    def recover(self, X_train, y_train, X_test):
        """Cross-validate, fit on all training rows and predict `X_test`.

        Stores the fitted model in `self.trained_model` and the predictions
        in `self.recovered_values`.

        Returns
        -------
        tuple
            `(fold_runs, recovered_values)` where `fold_runs` is whatever
            `Model.cv` returns and `recovered_values` is the output of
            `predict(X_test)`.

        Raises
        ------
        KeyError
            If `perf_fn` was not supplied in `model_args`.
        """
        # Work on a copy: removing `perf_fn` from `self.model_args` itself
        # would make every later call to `recover`/`run` fail with KeyError.
        model_args = dict(self.model_args)
        perf_fn = model_args.pop('perf_fn')

        model = Model(**model_args)
        fold_runs = model.cv(X_train, y_train, perf_fn)

        self.trained_model = model.fit(X_train, y_train)
        self.recovered_values = self.trained_model.predict(X_test)

        return fold_runs, self.recovered_values

    def run(self, df):
        """Return a preprocessed copy of `df` with `missing_fld` filled in.

        `df` itself is not modified. The cross-validation results of the
        recovery model are stored in `self.fold_runs`.
        """
        missing_fld = self.dataset_args['missing_fld']
        df_cpy = df.copy()

        # create dataset class
        data = Dataset(df_cpy, **self.dataset_args)

        # label encode categorical variables
        df_cpy = data.preprocess()

        # store original index so that we can reindex the dataframe later
        # to preserve the index of the original dataframe.
        orig_index_order = df_cpy.index

        # split the dataset into train and test based on missing values in the
        # feature which we want to recover
        train, test = Dataset.split_train_test(df_cpy, missing_fld)

        # create target variable
        y_train = train[missing_fld]
        X_train = train.drop(missing_fld, axis=1)

        X_test = test.drop(missing_fld, axis=1)

        # train model to recover missing values
        fold_runs, y_test = self.recover(X_train, y_train, X_test)
        self.fold_runs = fold_runs

        # attach the test index so known and predicted values can be merged
        y_test = pd.Series(y_test, index=test.index)

        recovered_target = pd.concat([y_train, y_test]).reindex(orig_index_order)
        # Plain column assignment replaces the column (and its dtype) instead
        # of writing into the old, partly-missing column in place.
        df_cpy[missing_fld] = recovered_target

        return df_cpy

In [None]:
# Load the training data and preview the first rows.
train = pd.read_csv('./train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Share of missing values per column, most incomplete columns first.
train.isnull().sum().div(len(train)).sort_values(ascending=False)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
Fare           0.000000
Ticket         0.000000
Parch          0.000000
SibSp          0.000000
Sex            0.000000
Name           0.000000
Pclass         0.000000
Survived       0.000000
PassengerId    0.000000
dtype: float64

In [None]:
# Inspect the dtype of every column in the training frame.
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Titanic Dataset

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, mean_squared_error

def run(train, params, num_boost_round, impute=True):
    """Cross-validate a LightGBM model on `Survived` and print the fold log loss.

    Parameters
    ----------
    train : pd.DataFrame
        Frame containing a `Survived` column plus the feature columns. It is
        copied, never modified.
    params : dict
        Parameters passed to `lgb.train`.
    num_boost_round : int
        Number of boosting rounds passed to `lgb.train`.
    impute : bool, default True
        When True, fill missing `Age` with the median age, drop `Name`,
        `Cabin` and `Embarked`, and label encode `Sex` and `Ticket`.
        When False, `train` is used as-is, so it must already be in a form
        `lgb.Dataset` accepts.

    Returns
    -------
    None
        The mean and standard deviation of the per-fold log loss are printed.
    """
    train_cpy = train.copy()

    if impute:
        # impute age with the median age
        train_cpy['Age'] = train_cpy['Age'].fillna(train_cpy['Age'].median())
        train_cpy = train_cpy.drop(['Name', 'Cabin', 'Embarked'], axis=1)

        # label encode categorical features; plain column assignment replaces
        # the column so it takes the integer dtype of the codes, whereas
        # `.loc[:, col] = ...` writes in place on pandas >= 2.0 and leaves
        # the column as `object`.
        train_cpy['Sex'] = pd.Categorical(train_cpy['Sex']).codes + 1
        train_cpy['Ticket'] = pd.Categorical(train_cpy['Ticket']).codes + 1

    y = train_cpy.Survived
    X = train_cpy.drop('Survived', axis=1)

    kf = KFold(shuffle=True, random_state=41)
    perfs = []

    for itr, ite in kf.split(X):
        Xtr, ytr = X.iloc[itr], y.iloc[itr]
        Xval, yval = X.iloc[ite], y.iloc[ite]

        ltrain = lgb.Dataset(Xtr, ytr)

        model = lgb.train(params, ltrain, num_boost_round)
        preds = model.predict(Xval)

        fold_perf = log_loss(yval, preds)
        perfs.append(fold_perf)

    print(f'mean perf: {np.mean(perfs)}, std perf: {np.std(perfs)}')

In [None]:
# Baseline: binary classifier on the frame with median-imputed Age.
params = dict(
    objective='binary',
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=20,
)

num_boost_round = 100
run(train, params, num_boost_round)

mean perf: 0.49343807135220336, std perf: 0.060123682328008625


In [None]:
# Regression setup used to recover `Age`. `perf_fn` (RMSE here) is taken out
# by `RecoverMissing.recover` and handed to `Model.cv`; the remaining entries
# are forwarded to `Model(...)`.
params = dict(
    objective='regression',
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=40,
    num_boost_round=100,
    feature_fraction=.4,
    perf_fn=lambda tr, pe: np.sqrt(mean_squared_error(tr, pe)),
    seed=41,
)

rec = RecoverMissing(
    missing_fld='Age',
    cat_flds=['Sex', 'Ticket'],
    ignore_flds=['Name', 'Cabin', 'Embarked', 'Survived'],
    **params,
)

train_sub = rec.run(train)

Fold: 0
Performance: 13.32016503322026
Fold: 1
Performance: 13.298255677213394
Fold: 2
Performance: 12.525303338219395
Fold: 3
Performance: 12.145355484632868
Fold: 4
Performance: 14.12604364417034
Mean performance: 13.08302463549125, Std performance: 0.6900326229858793


`12.714812969371925, Std performance: 0.8771298233463387`

In [None]:
# Number of missing `Age` values before and after recovery.
train['Age'].isna().sum(), train_sub['Age'].isna().sum()

(177, 0)

In [None]:
# Same classifier settings as the baseline, now evaluated on the frame whose
# `Age` was recovered by the model instead of median-imputed.
params = dict(
    objective='binary',
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=20,
)

num_boost_round = 100
# `Survived` was listed in `ignore_flds` for the recovery step, so it is
# (re)assigned from the original frame before evaluating.
train_sub = train_sub.assign(Survived=train['Survived'])
run(train_sub, params, num_boost_round, impute=False)

mean perf: 0.4873832900634646, std perf: 0.06138390546947665
