In [None]:
#hide
# Enable IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell executes; edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp model
# default_cls_lvl 3

In [None]:
#export
import pandas as pd
import numpy as np
import string

import lightgbm as lgb
from sklearn.model_selection import KFold

# Model

> LightGBM model wrapper with `fit`, `predict` and K-fold cross-validation helpers.

In [None]:
#hide
from nbdev.showdoc import *

### Model Class

In [None]:
#export
class Model:
    """Thin wrapper around LightGBM's training API with K-fold cross-validation.

    Parameters
    ----------
    **model_kwargs
        Keyword arguments describing the model. The optional ``num_boost_round``
        entry is removed from the mapping and passed to ``lgb.train`` as the
        number of boosting rounds (``DEFAULT_NUM_BOOST_ROUND`` when omitted);
        every remaining entry is forwarded unchanged as the ``params`` dict.

    Attributes
    ----------
    params : dict
        Parameters handed to ``lgb.train`` (without ``num_boost_round``).
    num_boost_round : int
        Number of boosting rounds used by ``fit`` and ``cv``.
    model
        Object returned by ``lgb.train``; only set once ``fit`` has been called.
    """

    # Mirrors the default value of the `num_boost_round` argument of `lgb.train`.
    DEFAULT_NUM_BOOST_ROUND = 100

    def __init__(self, **model_kwargs):
        self.params = model_kwargs
        # `num_boost_round` is an argument of `lgb.train`, not an entry of its
        # `params` dict, so it is pulled out here. `pop` with a default avoids the
        # KeyError the previous lookup raised when the caller omitted the key.
        self.num_boost_round = self.params.pop('num_boost_round', self.DEFAULT_NUM_BOOST_ROUND)

    def _train(self, X, y):
        """Train a fresh booster on ``(X, y)`` with the stored parameters."""
        return lgb.train(self.params, lgb.Dataset(X, y), self.num_boost_round)

    def fit(self, X:pd.DataFrame, y:pd.Series):
        """Train on all of ``(X, y)`` and store the result on ``self.model``.

        Returns
        -------
        The trained object returned by ``lgb.train`` (also kept as ``self.model``).
        """
        self.model = self._train(X, y)

        return self.model

    def cv(self, X:pd.DataFrame, y:pd.Series, perf_fn, **cv_params)->np.ndarray:
        """Evaluate the configured model with K-fold cross-validation.

        A new booster is trained for every fold; ``self.model`` is not touched.

        Parameters
        ----------
        X, y
            Features and target; rows are selected positionally with ``.iloc``.
        perf_fn
            Callable ``perf_fn(y_true, y_pred)`` returning the fold's score.
        **cv_params
            Forwarded verbatim to ``sklearn.model_selection.KFold``.

        Returns
        -------
        np.ndarray
            One score per fold, in fold order.
        """
        kf = KFold(**cv_params)
        fold_perfs = []

        for index, (itr, ite) in enumerate(kf.split(X)):
            print(f'Fold: {index}')

            model = self._train(X.iloc[itr], y.iloc[itr])
            preds = model.predict(X.iloc[ite])

            fold_perf = perf_fn(y.iloc[ite], preds)
            print(f'Performance: {fold_perf}')

            fold_perfs.append(fold_perf)

        print(f'Mean performance: {np.mean(fold_perfs)}, Std performance: {np.std(fold_perfs)}')

        return np.array(fold_perfs)

    def predict(self, Xtest)->np.ndarray:
        """Predict with the model trained by ``fit``.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet. The exception type matches the
            one the bare attribute access used to raise, with a clearer message.
        """
        model = getattr(self, 'model', None)
        if model is None:
            raise AttributeError('Model has not been fitted yet; call `fit` before `predict`.')

        return np.array(model.predict(Xtest))

### Tests

In [None]:
SIZE = 10000
NUM_NANS = 500

# Synthetic fixture: a uniform numeric feature (c1), a random lowercase-letter
# categorical feature (c2), and a uniform numeric target (c3) in which
# NUM_NANS randomly placed entries are missing.
example_df = pd.DataFrame(
    {
        'c1': np.random.rand(SIZE),
        'c2': [
            string.ascii_lowercase[np.random.randint(0, 26)]
            for _ in range(SIZE)
        ],
        'c3': np.random.permutation(
            np.concatenate([np.full(NUM_NANS, np.nan), np.random.rand(SIZE - NUM_NANS)])
        ),
    }
)
example_df.head()

Unnamed: 0,c1,c2,c3
0,0.469413,d,0.411662
1,0.838573,r,0.733036
2,0.488017,m,0.314077
3,0.604134,i,0.561229
4,0.825278,f,0.848442


In [None]:
# Import the one name this notebook uses instead of a wildcard, so it is clear
# where `Dataset` comes from and the notebook namespace is not polluted.
from task_substitution.data import Dataset

# 'c3' (the column containing NaNs) is the target and 'c2' is declared categorical.
data = Dataset(example_df, target_fld='c3', cat_flds=['c2'], ignore_flds=None)
proc_example_df = data.preprocess()

# NOTE(review): presumably rows with a non-null target become `train` and rows
# with a null target become `test` -- confirm against task_substitution.data.
train, test = Dataset.split_train_test_by_null(proc_example_df, target_fld='c3')

In [None]:
from sklearn.metrics import mean_squared_error

# LightGBM settings; `num_boost_round` is split off by `Model.__init__`.
params = {'num_boost_round': 100,
          'objective': 'regression',
          'num_leaves': 31,
          'seed': 41
         }

model = Model(**params)

y_train = train['c3']
X_train = train.drop('c3', axis=1)

X_valid = test.drop('c3', axis=1)

# `random_state` is a seed and should be an integer. The previous value `True`
# was only accepted because bool is an int subclass and was coerced to 1, so
# writing 1 explicitly keeps the same fold assignment.
cv_params = {'n_splits': 5,
             'shuffle': True,
             'random_state': 1
            }

# Root-mean-squared error between the true and predicted targets.
perf_fn = lambda tr, pe: np.sqrt(mean_squared_error(tr, pe))

model.cv(X_train, y_train, perf_fn, **cv_params)

Fold: 0
Performance: 0.2946840645608396
Fold: 1
Performance: 0.2915428769528317
Fold: 2
Performance: 0.29702084006365137
Fold: 3
Performance: 0.29641157363445336
Fold: 4
Performance: 0.29617771921325436
Mean performance: 0.2951674148850061, Std performance: 0.001968488429225961


array([0.29468406, 0.29154288, 0.29702084, 0.29641157, 0.29617772])

In [None]:
SIZE = 10000
NUM_NANS = 500

# Synthetic fixture with two random lowercase-letter categorical features
# (c2, c3), a uniform numeric feature (c1), and a uniform numeric target (c4)
# in which NUM_NANS randomly placed entries are missing.
example_df = pd.DataFrame(
    {
        'c1': np.random.rand(SIZE),
        'c2': [
            string.ascii_lowercase[np.random.randint(0, 26)]
            for _ in range(SIZE)
        ],
        'c3': [
            string.ascii_lowercase[np.random.randint(0, 26)]
            for _ in range(SIZE)
        ],
        'c4': np.random.permutation(
            np.concatenate([np.full(NUM_NANS, np.nan), np.random.rand(SIZE - NUM_NANS)])
        ),
    }
)
example_df.head()

Unnamed: 0,c1,c2,c3,c4
0,0.353357,z,j,0.027331
1,0.119745,y,y,0.990997
2,0.479214,m,s,0.623446
3,0.585346,u,q,0.466293
4,0.853478,s,b,0.904931


In [None]:
# Same pipeline as the single-categorical case, now with 'c4' as the target
# and two categorical fields.
data = Dataset(
    example_df,
    target_fld='c4',
    cat_flds=['c2', 'c3'],
    ignore_flds=None,
)
proc_example_df = data.preprocess()

# NOTE(review): presumably splits on whether the target is null -- confirm
# against task_substitution.data.
train, test = Dataset.split_train_test_by_null(
    proc_example_df,
    target_fld='c4',
)

In [None]:
# LightGBM settings; `num_boost_round` is split off by `Model.__init__`.
params = {'num_boost_round': 100,
          'objective': 'regression',
          'num_leaves': 31,
          'seed': 41
         }

model = Model(**params)

y_train = train['c4']
X_train = train.drop('c4', axis=1)

X_valid = test.drop('c4', axis=1)

# `random_state` is a seed and should be an integer. The previous value `True`
# was only accepted because bool is an int subclass and was coerced to 1, so
# writing 1 explicitly keeps the same fold assignment.
cv_params = {'n_splits': 5,
             'shuffle': True,
             'random_state': 1
            }

# Root-mean-squared error between the true and predicted targets.
perf_fn = lambda tr, pe: np.sqrt(mean_squared_error(tr, pe))

model.cv(X_train, y_train, perf_fn, **cv_params)

Fold: 0
Performance: 0.2918221725273733
Fold: 1
Performance: 0.2944654447182648
Fold: 2
Performance: 0.29020677281091695
Fold: 3
Performance: 0.294588152920383
Fold: 4
Performance: 0.2929651161128992
Mean performance: 0.2928095318179675, Std performance: 0.0016540241745309258


array([0.29182217, 0.29446544, 0.29020677, 0.29458815, 0.29296512])