In [None]:
#hide
# Development convenience: enable IPython's autoreload extension in mode 2 so
# that imported modules are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp model
# default_cls_lvl 3

In [None]:
#export
import pandas as pd
import numpy as np
import string

import lightgbm as lgb
from sklearn.model_selection import KFold

# Model

> A thin LightGBM wrapper providing training, K-fold cross-validation and prediction.

In [None]:
#hide
from nbdev.showdoc import *

### Model Class

In [None]:
#export
class Model:
    """Thin wrapper around `lgb.train` offering fit / cross-validate / predict.

    All keyword arguments are forwarded to LightGBM as training parameters,
    except `num_boost_round`, which is extracted and passed to `lgb.train`
    as the number of boosting rounds.
    """

    def __init__(self, **model_kwargs):
        """Store the LightGBM parameters.

        Parameters
        ----------
        **model_kwargs
            LightGBM training parameters. `num_boost_round` is optional; when
            omitted it falls back to 100 (the default of `lightgbm.train`).
        """
        self.params = model_kwargs
        # `num_boost_round` is an argument of `lgb.train`, not a model
        # parameter, so take it out of the params dict (with a default instead
        # of failing with a KeyError when the caller did not supply it).
        self.num_boost_round = self.params.pop('num_boost_round', 100)
        # Set by `fit`; `predict` checks it to report a clear error.
        self.model = None

    def _train(self, X:pd.DataFrame, y:pd.Series):
        """Train and return a new booster on (X, y) without touching `self.model`."""
        ltrain = lgb.Dataset(X, y)
        return lgb.train(self.params, ltrain, self.num_boost_round)

    def fit(self, X:pd.DataFrame, y:pd.Series):
        """Train on the full data, store the booster in `self.model` and return it."""
        self.model = self._train(X, y)

        return self.model

    def cv(self, X:pd.DataFrame, y:pd.Series, perf_fn, **cv_params)->np.ndarray:
        """Evaluate the model with K-fold cross-validation.

        A fresh booster is trained for every fold; `self.model` is not modified.

        Parameters
        ----------
        X, y
            Features and target, indexed positionally (`.iloc`) per fold.
        perf_fn
            Callable invoked as `perf_fn(y_true, y_pred)` for each fold.
        **cv_params
            Forwarded unchanged to `KFold` (e.g. `n_splits`, `shuffle`,
            `random_state`).

        Returns
        -------
        np.ndarray
            One performance value per fold, in fold order.
        """
        kf = KFold(**cv_params)
        fold_perfs = []

        for index, (itr, ite) in enumerate(kf.split(X)):
            print(f'Fold: {index}')

            Xtr, ytr   = X.iloc[itr], y.iloc[itr]
            Xval, yval = X.iloc[ite], y.iloc[ite]

            model = self._train(Xtr, ytr)
            preds = model.predict(Xval)

            fold_perf = perf_fn(yval, preds)
            print(f'Performance: {fold_perf}')

            fold_perfs.append(fold_perf)

        print(f'Mean performance: {np.mean(fold_perfs)}, Std performance: {np.std(fold_perfs)}')

        return np.array(fold_perfs)

    def predict(self, Xtest)->np.ndarray:
        """Predict with the booster trained by `fit`.

        Raises
        ------
        AttributeError
            If `fit` has not been called yet (same exception type as before,
            but with an explicit message).
        """
        # getattr keeps this safe even for instances created without __init__.
        booster = getattr(self, 'model', None)
        if booster is None:
            raise AttributeError('Model has not been fitted yet; call `fit` before `predict`.')

        return np.array(booster.predict(Xtest))

In [None]:
# Synthetic example frame: a numeric feature, a categorical feature and a
# numeric column with NUM_NANS missing values at random positions.
SIZE = 100
NUM_NANS = 5
SEED = 41

# Seeded local generator so the example data is identical on every run and
# the global NumPy random state is left untouched.
rng = np.random.default_rng(SEED)

example_df = pd.DataFrame({'c1': rng.random(SIZE),
                           'c2': [string.ascii_lowercase[rng.integers(low=0, high=26)] for _ in range(SIZE)],
                           'c3': rng.permutation([np.nan] * NUM_NANS + list(rng.random(SIZE - NUM_NANS)))
                          })
example_df

Unnamed: 0,c1,c2,c3
0,0.802274,n,0.889491
1,0.700237,q,0.979652
2,0.826138,m,0.299396
3,0.381898,u,0.068205
4,0.232103,e,0.703452
...,...,...,...
95,0.635907,x,0.900582
96,0.924171,f,0.887975
97,0.974560,c,0.744480
98,0.261153,t,0.363194


In [None]:
# Import the one name this cell needs explicitly rather than via a wildcard,
# so it is clear where `Dataset` comes from and the namespace stays clean.
from task_substitution.data import Dataset

# 'c3' is the target column and 'c2' is declared as categorical.
data = Dataset(example_df, target_fld='c3', cat_flds=['c2'], ignore_flds=None)
proc_example_df = data.preprocess()

# NOTE(review): presumably the split is driven by missing values in the target
# column — confirm against `Dataset.split_train_test`.
train, test = Dataset.split_train_test(proc_example_df, target_fld='c3')

In [None]:
from sklearn.metrics import mean_squared_error

# LightGBM parameters; `num_boost_round` is extracted by `Model.__init__`.
params = {'num_boost_round': 100,
          'objective': 'regression',
          'num_leaves': 31,
          'seed': 41
         }

model = Model(**params)

y_train = train['c3']
X_train = train.drop('c3', axis=1)

X_valid = test.drop('c3', axis=1)

# `random_state` is a seed, not a flag: pass an explicit integer. The previous
# value `True` was only accepted because bool is a subclass of int.
cv_params = {'n_splits': 5,
             'shuffle': True,
             'random_state': 1
            }

model.cv(X_train, y_train, mean_squared_error, **cv_params)

Index: 0
Performance: 0.12761819458109147
Index: 1
Performance: 0.08611697110621237
Index: 2
Performance: 0.08890789555489446
Index: 3
Performance: 0.15459788005777875
Index: 4
Performance: 0.0676545156217236
Mean performance: 0.10497909138434014, Std performance: 0.031564980771756436


array([0.12761819, 0.08611697, 0.0889079 , 0.15459788, 0.06765452])