In [None]:
#hide
%reload_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
import numpy as np

from task_substitution.data import *
from task_substitution.model import *
from task_substitution.external_data import *

In [None]:
# default_exp recover_missing

# Runner

> Class that takes dataset and model arguments and recovers the missing values of a feature.

In [None]:
#hide
from nbdev.showdoc import *

### Runner Class

In [None]:
#export
class RecoverMissing:
    """Recover missing values for a feature using task substitution.

    The column named by `target_fld` is treated as a prediction target: rows
    where it is present are used to fit a `Model`, and rows where it is null
    are filled in with that model's predictions.

    Parameters
    ----------
    target_fld : str
        Name of the column whose missing values should be recovered.
    cat_flds, ignore_flds : list, optional
        Forwarded unchanged to `Dataset` together with `target_fld`.
    perf_fn : callable
        Called as `perf_fn(y_true, y_pred)` on the hold-out split and handed to
        `Model.cv`. `run` raises `TypeError` if it is not callable.
    split_args : dict, optional
        Forwarded unchanged to `Dataset.split_train_test`.
    model_args : dict, optional
        Keyword arguments for `Model`. `None` is treated as "no arguments".

    Attributes
    ----------
    fold_runs
        Whatever `Model.cv` returned during the last `run` (None before that).
    trained_model
        The object returned by `Model.fit` in the most recent `recover` call.
    """
    def __init__(self, target_fld:str, cat_flds:list=None, ignore_flds:list=None, perf_fn=None, split_args:dict=None, model_args:dict=None):
        self.dataset_args = {'target_fld': target_fld,
                             'cat_flds': cat_flds,
                             'ignore_flds': ignore_flds
                            }

        self.perf_fn = perf_fn
        self.split_args = split_args
        # `Model(**None)` raises TypeError, so normalise the default to an
        # empty mapping; a caller-supplied dict is stored as-is (not copied).
        self.model_args = {} if model_args is None else model_args
        self.fold_runs = None

    def _split_xy(self, frame):
        """Return `(features, target)` where target is the `target_fld` column of `frame`."""
        target_fld = self.dataset_args['target_fld']
        return frame.drop(target_fld, axis=1), frame[target_fld]

    def cv(self, X_train, y_train, X_test=None):
        """Cross-validate a fresh `Model` on the training data and return its fold results.

        `X_test` is not used; it is accepted only so existing three-argument
        calls keep working.
        """
        model = Model(**self.model_args)
        return model.cv(X_train, y_train, self.perf_fn)

    def recover(self, X_train, y_train, X_test):
        """Fit a fresh `Model` on the training data and return its predictions for `X_test`.

        The fitted model is kept on `self.trained_model`.
        """
        model = Model(**self.model_args)
        self.trained_model = model.fit(X_train, y_train)
        return self.trained_model.predict(X_test)

    def run(self, df):
        """Return a preprocessed copy of `df` with the nulls in the target column filled in.

        Steps: preprocess a copy of `df`, split it by nullness of the target,
        cross-validate and score a model on a hold-out slice of the non-null
        rows (printing the hold-out score), then refit on all non-null rows and
        predict the null ones. `df` itself is not modified.

        Raises
        ------
        TypeError
            If `perf_fn` is not callable. Raised up front so the failure does
            not surface only after preprocessing and training.
        """
        if not callable(self.perf_fn):
            raise TypeError('perf_fn must be a callable taking (y_true, y_pred)')

        target_fld = self.dataset_args['target_fld']
        df_cpy = df.copy()

        # create dataset class
        data = Dataset(df_cpy, **self.dataset_args)

        # label encode categorical variables
        df_cpy = data.preprocess()

        # store original index so that we can reindex the dataframe later
        # to preserve the index of the original dataframe.
        orig_index_order = df_cpy.index

        # split the dataset into train and test based on missing values in the
        # feature which we want to recover
        train, test = Dataset.split_train_test_by_null(df_cpy, target_fld)

        # further split train into tr and te: cross-validate on tr and report
        # final performance on te
        tr, te = Dataset.split_train_test(train, self.split_args)
        xtr, ytr = self._split_xy(tr)
        xte, yte = self._split_xy(te)

        # keep the CV results so callers can inspect them after the run
        self.fold_runs = self.cv(xtr, ytr, xte)
        pred = self.recover(xtr, ytr, xte)
        unseen_perf = self.perf_fn(yte, pred)
        print(f'Performance on unseen dataset: {unseen_perf:.3f}')

        # refit on every row with a known target, then predict the null rows
        X_train, y_train = self._split_xy(train)
        X_test = test.drop(target_fld, axis=1)
        y_test = pd.Series(self.recover(X_train, y_train, X_test), index=test.index)

        # stitch known and predicted values back together in the original row order
        recovered_target = pd.concat([y_train, y_test]).reindex(orig_index_order)
        df_cpy.loc[:, target_fld] = recovered_target

        return df_cpy

### Usage

In [None]:
# Synthetic frame used throughout the usage example; preview the first rows.
train = get_fake_data_with_missing_values()
train.head()

Unnamed: 0,f1,f2,f3
0,0.170689,0,0.312148
1,0.350181,0,0.210611
2,0.339072,0,0.525557
3,0.157173,0,0.673594
4,0.50786,0,0.726123


In [None]:
# Fraction of missing values per column, largest first.
train.isna().mean().sort_values(ascending=False)

f3    0.02
f2    0.00
f1    0.00
dtype: float64

In [None]:
# Column dtypes of the example frame.
train.dtypes

f1    float64
f2      int64
f3    float64
dtype: object

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, mean_squared_error

def run(train, params, num_boost_round):
    """Cross-validate a LightGBM model that predicts `f2` and print its log-loss.

    A copy of `train` is split with a shuffled `KFold` (random_state=41); for
    each fold a booster is trained on the remaining rows with `params` and
    `num_boost_round`, and scored with `log_loss` on the held-out rows. The
    mean and standard deviation of the fold scores are printed; nothing is
    returned.
    """
    frame = train.copy()
    target = frame.f2
    features = frame.drop('f2', axis=1)

    splitter = KFold(shuffle=True, random_state=41)
    perfs = []

    for fit_rows, val_rows in splitter.split(features):
        fit_set = lgb.Dataset(features.iloc[fit_rows], target.iloc[fit_rows])
        booster = lgb.train(params, fit_set, num_boost_round)

        val_preds = booster.predict(features.iloc[val_rows])
        perfs.append(log_loss(target.iloc[val_rows], val_preds))

    print(f'mean perf: {np.mean(perfs)}, std perf: {np.std(perfs)}')

In [None]:
# Baseline: score the `f2` classifier on the frame that still has missing values.
params = dict(
    objective='binary',
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=20,
)
num_boost_round = 100

run(train, params, num_boost_round)



mean perf: 0.8460135340844301, std perf: 0.015681166820867853


In [None]:
# Keyword arguments that `RecoverMissing` forwards to `Model`.
model_args = dict(
    objective='regression',
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=40,
    num_boost_round=100,
    seed=41,
)

# Arguments that `RecoverMissing` forwards to `Dataset.split_train_test`.
split_args = dict(test_size=0.2, random_state=41)

# Recover `f3`, ignoring `f2`, and score with RMSE.
rec = RecoverMissing(
    target_fld='f3',
    cat_flds=[],
    ignore_flds=['f2'],
    perf_fn=lambda tr,pe: np.sqrt(mean_squared_error(tr, pe)),
    split_args=split_args,
    model_args=model_args,
)

train_sub = rec.run(train)

Fold: 0
Performance: 0.2999708369544806
Fold: 1
Performance: 0.30431290980374
Fold: 2
Performance: 0.297611883751379
Mean performance: 0.3006318768365332, Std performance: 0.0027753279485318826
Performance on unseen dataset: 0.308




In [None]:
# Missing `f3` values before and after recovery.
train['f3'].isna().sum(), train_sub['f3'].isna().sum()

(20, 0)

In [None]:
# Same classifier settings as the baseline run, so the two scores are comparable.
params = dict(
    objective='binary',
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=20,
)
num_boost_round = 100

# `run` uses `f2` as its target, so attach the original `f2` column before scoring.
train_sub = train_sub.assign(f2=train['f2'])
run(train_sub, params, num_boost_round)

mean perf: 0.8402848595155269, std perf: 0.022482434047051185


