In [None]:
#hide
%reload_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
import numpy as np

from task_substitution.data import *
from task_substitution.model import *
from task_substitution.external_data import *

In [None]:
# default_exp recover_missing

# Runner

> Class that takes dataset and model arguments and recovers the missing values of a feature.

In [None]:
#hide
from nbdev.showdoc import *

### Runner Class

In [None]:
#export
class RecoverMissing:
    """Recover missing values for a feature using task substitution.

    The column named by `target_fld` becomes the target of an auxiliary
    model: rows where it is present form the training split, rows where it
    is missing form the test split, and the model's predictions fill the gaps.

    Parameters
    ----------
    target_fld : str
        Column whose missing values should be recovered.
    cat_flds : list, optional
        Forwarded to `Dataset` as `cat_flds`.
    ignore_flds : list, optional
        Forwarded to `Dataset` as `ignore_flds`.
    **model_args
        Must include `perf_fn`, which is passed to `Model.cv`; all remaining
        entries are forwarded to the `Model` constructor.
    """
    def __init__(self, target_fld:str, cat_flds:list=None, ignore_flds:list=None, **model_args):
        self.dataset_args = {'target_fld': target_fld,
                             'cat_flds': cat_flds,
                             'ignore_flds': ignore_flds
                            }
        self.model_args = model_args

    def recover(self, X_train, y_train, X_test):
        """Cross-validate, fit on the full training split and predict `X_test`.

        Stores the fitted model in `self.trained_model` and the predictions in
        `self.recovered_values`.

        Returns
        -------
        tuple
            `(fold_runs, recovered_values)`: whatever `Model.cv` returned and
            the predictions for `X_test`.

        Raises
        ------
        KeyError
            If `perf_fn` was not supplied among the model arguments.
        """
        # Work on a copy: deleting `perf_fn` from `self.model_args` itself
        # would make every later call on this instance fail with a KeyError.
        model_args = dict(self.model_args)
        perf_fn = model_args.pop('perf_fn')

        model = Model(**model_args)
        fold_runs = model.cv(X_train, y_train, perf_fn)

        self.trained_model = model.fit(X_train, y_train)
        self.recovered_values = self.trained_model.predict(X_test)

        return fold_runs, self.recovered_values

    def run(self, df):
        """Return a preprocessed copy of `df` with the target feature filled in.

        `df` itself is left untouched; all work happens on a copy. The
        cross-validation results are kept in `self.fold_runs`.
        """
        target_fld = self.dataset_args['target_fld']
        df_cpy = df.copy()

        # create dataset class
        data = Dataset(df_cpy, **self.dataset_args)

        # label encode categorical variables
        df_cpy = data.preprocess()

        # store original index so that we can reindex the dataframe later
        # to preserve the index of the original dataframe.
        orig_index_order = df_cpy.index

        # split the dataset into train and test based on missing values in the
        # feature which we want to recover
        train, test = Dataset.split_train_test(df_cpy, target_fld)

        # create target variable
        y_train = train[target_fld]
        X_train = train.drop(target_fld, axis=1)

        X_test = test.drop(target_fld, axis=1)

        # train model to recover missing values
        fold_runs, y_test = self.recover(X_train, y_train, X_test)
        self.fold_runs = fold_runs

        # align the predictions with the rows they belong to
        y_test = pd.Series(y_test, index=test.index)

        # stitch known and recovered values back together in the original row order
        recovered_target = pd.concat([y_train, y_test]).reindex(orig_index_order)
        df_cpy.loc[:, target_fld] = recovered_target

        return df_cpy

### Usage

In [None]:
# Load the synthetic frame used throughout the usage examples and preview it.
train = get_fake_data()
train.head()

Unnamed: 0,f1,f2,f3
0,0.046903,0,0.642703
1,0.569416,1,0.265819
2,0.997847,1,0.590103
3,0.231654,0,0.463914
4,0.15311,1,0.414236


In [None]:
# Fraction of missing values per column, largest first.
train.isnull().mean().sort_values(ascending=False)

f3    0.02
f2    0.00
f1    0.00
dtype: float64

In [None]:
# Column dtypes of the frame returned by `get_fake_data`.
train.dtypes

f1    float64
f2      int64
f3    float64
dtype: object

### Adult Income Prediction Dataset

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, mean_squared_error

def run(train, params, num_boost_round, impute=True):
    """Cross-validate a LightGBM model predicting `f2` and print its log loss.

    `train` is copied, `f2` is split off as the label and the remaining
    columns are used as features. For every `KFold` split (shuffled,
    `random_state=41`) a booster is trained with `params` for
    `num_boost_round` rounds and scored with `log_loss` on the held-out rows.
    The mean and standard deviation of the fold scores are printed; nothing
    is returned.

    `impute` is accepted for call-site compatibility but is not used.
    """
    frame = train.copy()
    labels = frame.f2
    features = frame.drop('f2', axis=1)

    splitter = KFold(shuffle=True, random_state=41)

    def _fold_loss(fit_rows, held_rows):
        # Train on the fitting rows, score the predictions on the held-out rows.
        booster = lgb.train(
            params,
            lgb.Dataset(features.iloc[fit_rows], labels.iloc[fit_rows]),
            num_boost_round,
        )
        return log_loss(labels.iloc[held_rows], booster.predict(features.iloc[held_rows]))

    losses = [_fold_loss(fit_rows, held_rows)
              for fit_rows, held_rows in splitter.split(features)]

    print(f'mean perf: {np.mean(losses)}, std perf: {np.std(losses)}')

In [None]:
# Baseline: cross-validated log loss on the frame before `f3` is recovered.
params = dict(objective='binary',
              learning_rate=0.1,
              num_leaves=31,
              min_data_in_leaf=20)
num_boost_round = 100

run(train, params, num_boost_round)



mean perf: 0.8617958875357795, std perf: 0.014028989093387898


In [None]:
# Arguments for the auxiliary regression model; `perf_fn` scores each
# cross-validation fold with the root mean squared error.
params = dict(objective='regression',
              learning_rate=0.1,
              num_leaves=31,
              min_data_in_leaf=40,
              num_boost_round=100,
              perf_fn=lambda tr,pe: np.sqrt(mean_squared_error(tr, pe)),
              seed=41)

# Recover `f3`, passing `f2` as an ignored field.
rec = RecoverMissing(target_fld='f3', cat_flds=[], ignore_flds=['f2'], **params)

train_sub = rec.run(train)



Fold: 0
Performance: 0.28837683157928873
Fold: 1
Performance: 0.2973838996314305
Fold: 2
Performance: 0.3018878616306113
Mean performance: 0.2958828642804435, Std performance: 0.005617046294131228
y_test  [0.54592897 0.50897151 0.39961943 0.46732401 0.53259094 0.63015683
 0.5735691  0.55372939 0.43246685 0.43994847 0.5565569  0.64702419
 0.48491624 0.5280441  0.46950199 0.40768427 0.4199819  0.54712394
 0.49920951 0.57728937]


`Mean performance: 12.714812969371925, Std performance: 0.8771298233463387`

In [None]:
# Missing `f3` values before and after recovery.
train['f3'].isna().sum(), train_sub['f3'].isna().sum()

(20, 0)

In [None]:
# Same classifier settings as the baseline run, now on the frame with `f3` recovered.
params = dict(objective='binary',
              learning_rate=0.1,
              num_leaves=31,
              min_data_in_leaf=20)
num_boost_round = 100

# Attach the classification label `f2` from the original frame before scoring.
train_sub = train_sub.assign(f2=train.f2)
run(train_sub, params, num_boost_round, impute=False)



mean perf: 0.8472708470603166, std perf: 0.02556438712206216
