In [None]:
#hide
%reload_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
import numpy as np

from task_substitution.data import *
from task_substitution.model import *
from task_substitution.external_data import *

In [None]:
# default_exp train_test_similarity

# TrainTestSimilarity

> Class that takes train and test sets and checks whether they come from the same distribution.

In [None]:
#hide
from nbdev.showdoc import *

### Runner Class

In [None]:
#export
class TrainTestSimilarity:
    """Check whether train and test come from the same distribution using task substitution.

    Every row of both frames is tagged with an ``is_test`` flag; the frames are then
    stacked, shuffled and handed to ``Dataset`` / ``Model`` so that a model can be
    trained to predict the flag. The value ``perf_fn`` yields on a held-out part of
    the stacked data says how well the two sets can be told apart (with a ranking
    metric such as ROC AUC, a score close to 0.5 suggests they are hard to separate).
    """
    def __init__(self, cat_flds:list=None, ignore_flds:list=None, perf_fn=None, split_args:dict=None, model_args:dict=None):
        """Store the configuration used by `run` and `check`.

        cat_flds, ignore_flds: forwarded to `Dataset` as keyword arguments.
        perf_fn: callable invoked as `perf_fn(y_true, y_pred)`; also handed to `Model.cv`.
        split_args: forwarded unchanged to `Dataset.split_train_test`. When it is a dict
            holding a `random_state` entry, that value also seeds the shuffle in `run`.
        model_args: keyword arguments for `Model`; `None` means "no keyword arguments".
        """
        self.dataset_args = {'cat_flds': cat_flds,
                             'ignore_flds': ignore_flds
                            }
        self.model_args = model_args
        self.split_args = split_args

        self.perf_fn = perf_fn

    def check(self, X_train, y_train, X_test):
        """Cross-validate on the training part, fit on all of it and predict for `X_test`.

        Returns a tuple `(fold_runs, pred)`: whatever `Model.cv` returned and the
        predictions for `X_test`. The object returned by `Model.fit` is kept on
        `self.trained_model`.
        """
        # `model_args` defaults to None and `Model(**None)` would raise TypeError
        model = Model(**(self.model_args or {}))
        fold_runs = model.cv(X_train, y_train, self.perf_fn)

        self.trained_model = model.fit(X_train, y_train)
        pred = self.trained_model.predict(X_test)

        return fold_runs, pred

    def run(self, train, test):
        """Train a train-vs-test model and return its performance on held-out rows.

        train, test: pandas DataFrames; neither of them is modified.
        Returns the value of `perf_fn(y_test, preds)`, which is also printed.
        Raises TypeError when `perf_fn` is not callable.
        """
        if not callable(self.perf_fn):
            # fail before any training happens instead of after all of it
            raise TypeError('perf_fn must be a callable taking (y_true, y_pred)')

        # flag where each instance comes from; `assign` returns a new frame,
        # so the frames passed in by the caller stay untouched
        combined = pd.concat((train.assign(is_test=0), test.assign(is_test=1)), axis=0)

        # shuffle the combined rows; reuse the split seed (if one was given) so that
        # the shuffle, and with it the whole run, is repeatable
        seed = self.split_args.get('random_state') if isinstance(self.split_args, dict) else None
        combined = combined.sample(frac=1., random_state=seed).reset_index(drop=True)

        # add is_test as new target field for the dataset
        self.dataset_args['target_fld'] = 'is_test'
        target_fld = self.dataset_args['target_fld']

        # create dataset class and let it preprocess the combined frame
        data = Dataset(combined, **self.dataset_args)
        combined = data.preprocess()

        # split the combined dataset into a fitting part and a held-out part
        fit_part, held_out = Dataset.split_train_test(combined, self.split_args)

        # separate the target from the features
        y_train = fit_part[target_fld]
        X_train = fit_part.drop(target_fld, axis=1)

        y_test = held_out[target_fld]
        X_test = held_out.drop(target_fld, axis=1)

        # train a model to tell train rows from test rows
        fold_runs, preds = self.check(X_train, y_train, X_test)

        # test performance
        test_perf = self.perf_fn(y_test, preds)
        print(f'Performance on unseen dataset: {test_perf}')

        return test_perf

### Usage

In [None]:
# fake numeric frame standing in for the training set
train = get_fake_numeric_data()
train.head()

Unnamed: 0,f0,f1,f2,f3,f4
0,0.256065,0.477129,0.542057,0.034582,0.222517
1,0.071871,0.133654,0.067683,0.053721,0.986306
2,0.890523,0.436129,0.386798,0.687215,0.165514
3,0.168903,0.399847,0.028072,0.326286,0.085797
4,0.905647,0.979213,0.624253,0.829828,0.531663


In [None]:
# a second, independently generated frame standing in for the test set
test = get_fake_numeric_data()
test.head()

Unnamed: 0,f0,f1,f2,f3,f4
0,0.454871,0.220669,0.95183,0.157362,0.868527
1,0.611131,0.331115,0.130813,0.824166,0.641296
2,0.121134,0.866722,0.579458,0.050959,0.416681
3,0.121944,0.513375,0.497313,0.105572,0.235918
4,0.193114,0.553202,0.015797,0.265587,0.355239


In [None]:
from sklearn.metrics import roc_auc_score

# arguments handed to Dataset.split_train_test
split_args = dict(test_size=0.2, random_state=41)

# keyword arguments handed to Model
model_args = dict(
    num_boost_round=100,
    objective='binary',
    learning_rate=0.1,
    num_leaves=31,
    nthread=-1,
    seed=41,
)

# f4 plays the role of the target variable in both datasets,
# so it is listed in ignore_flds
tte = TrainTestSimilarity(
    cat_flds=[],
    ignore_flds=['f4'],
    perf_fn=roc_auc_score,
    split_args=split_args,
    model_args=model_args,
)
tte.run(train, test)



Fold: 0
Performance: 0.47775829417128424
Fold: 1
Performance: 0.4992955762186531
Fold: 2
Performance: 0.4637426077161363
Mean performance: 0.4802654927020245, Std performance: 0.014622310024745249
Performance on unseen dataset: 0.49553649571153513


0.49553649571153513