In [1]:
import json
from pathlib import Path

import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
# Benchmark dataset names; each one is used as a sub-directory of `data/`.
dirs = [
    "adult",
    "aloi",
    "california_housing",
    "covtype",
    "epsilon",
    "helena",
    "higgs_small",
    "jannis",
    "microsoft",
    "yahoo",
    "year",
]

In [3]:
# Split order used for every triple handled below (and for the return value).
_SPLITS = ('train', 'test', 'val')


def _load_splits(data_dir, prefix):
    """Load ``<prefix>_train.npy``, ``<prefix>_test.npy`` and ``<prefix>_val.npy``.

    Returns a three-item list in train, test, val order.
    """
    return [np.load(data_dir.joinpath('{}_{}.npy'.format(prefix, split)))
            for split in _SPLITS]


def _one_hot_encoder():
    """Build a dense float32 one-hot encoder that works across scikit-learn versions."""
    try:
        # Newer scikit-learn releases name the dense-output switch `sparse_output`.
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='float32')
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False, dtype='float32')


def _encode_categorical(C, y_train, cat_policy, seed):
    """Encode the [train, test, val] categorical arrays according to ``cat_policy``.

    Every encoder is fitted on the train split only and then applied to all
    three splits. Policies other than 'ohe' and 'counter' (including
    'indices') keep the ordinal codes unchanged.
    """
    ordinal = OrdinalEncoder()
    C = [ordinal.fit_transform(C[0]), ordinal.transform(C[1]), ordinal.transform(C[2])]

    if cat_policy == 'ohe':
        encoder = _one_hot_encoder()
        encoder.fit(C[0])
    elif cat_policy == 'counter':
        assert seed is not None
        # NOTE(review): LeaveOneOutEncoder is not imported in the visible cells
        # (presumably it comes from the `category_encoders` package) -- import
        # it before using cat_policy='counter'.
        encoder = LeaveOneOutEncoder(sigma=0.1, random_state=seed, return_df=False)
        encoder.fit(C[0], y_train)
    else:
        return C
    return [encoder.transform(part) for part in C]


def joinData(dbName, cat_policy='ohe', seed=9, normalization=False, norm="l1", id=True):
    """Load one dataset from ``data/<dbName>`` and assemble its feature matrices.

    Parameters
    ----------
    dbName : str
        Name of the dataset directory under ``data/``.
    cat_policy : str
        Encoding applied to categorical features: 'indices' (ordinal codes),
        'ohe' (one-hot) or 'counter' (leave-one-out target encoding).
    seed : int
        Random state for the 'counter' encoder.
    normalization : bool
        When True, features are min-max scaled with a scaler fitted on the
        train split.
    norm : str
        Accepted for backward compatibility; currently unused.
    id : bool
        When True (the default) the categorical ``C_*.npy`` files are ignored
        and only numerical features are returned.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)``. Note that the
        test split comes before the validation split. Rows whose features
        contain NaN are removed from both the features and the labels.

    Raises
    ------
    FileNotFoundError
        If no usable feature array is available for the dataset.
    """
    data_dir = Path('data/' + dbName)
    y = _load_splits(data_dir, 'y')

    C = None
    if data_dir.joinpath('C_train.npy').exists() and not id:
        C = _encode_categorical(_load_splits(data_dir, 'C'), y[0], cat_policy, seed)

    N = None
    if data_dir.joinpath('N_train.npy').exists():
        N = _load_splits(data_dir, 'N')

    if C is not None and N is not None:
        # Categorical columns come first, numerical columns after them.
        X = [np.concatenate((c_part, n_part), axis=1) for c_part, n_part in zip(C, N)]
    elif N is not None:
        X = N
    elif C is not None:
        X = C
    else:
        raise FileNotFoundError(
            "no usable feature arrays for dataset {!r} in {}: N_train.npy is missing "
            "and categorical features are absent or disabled (id={})".format(
                dbName, data_dir, id)
        )

    # Drop every row that has at least one NaN feature, keeping labels aligned.
    for i, features in enumerate(X):
        keep = ~np.isnan(features).any(axis=1)
        X[i] = features[keep]
        y[i] = y[i][keep]

    if normalization:
        # Fit on train only so test/val statistics never leak into the scaler.
        scaler = MinMaxScaler()
        X[0] = scaler.fit_transform(X[0])
        X[1] = scaler.transform(X[1])
        X[2] = scaler.transform(X[2])

    return X[0], X[1], X[2], y[0], y[1], y[2]

In [None]:
# Train and score one LightGBM model per dataset.
# Hyperparameters do not depend on the dataset (except `num_class`), so they
# are defined once instead of being rebuilt on every iteration.
num_round = 500

CLASSIFICATION_PARAMS = {
    'objective': 'multiclass',
    'boosting_type': 'rf',
    'num_leaves': 5,
    'force_row_wise': True,
    'learning_rate': 0.5,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8,
    'verbosity': 0,
}

REGRESSION_PARAMS = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbosity': 0,
}

for dbs in dirs:
    print('datbase used :',dbs)

    # Parse the dataset description once and keep only the keys used below.
    info = json.loads(Path('data/'+dbs+'/info.json').read_text())
    config = {key: info[key] for key in ('task_type', 'cat_policy', 'norm')}
    is_regression = config['task_type'] == 'regression'

    # joinData returns the splits in train, test, val order.
    N_train, N_test, N_val, y_train, y_test, y_val = joinData(
        dbs,
        cat_policy=config['cat_policy'],
        normalization=True,
        norm=config['norm'],
    )

    train_data = lgb.Dataset(N_train, label=y_train)
    # Monitor training on the validation split; the test split is reserved
    # for the final score only.
    valid_data = lgb.Dataset(N_val, label=y_val, reference=train_data)

    if is_regression:
        # These hyperparameters were previously defined but never handed to the
        # model; they are now actually used for training.
        params = dict(REGRESSION_PARAMS)
    else:
        # LightGBM needs every class id to be below num_class, so size the
        # label space from the largest id seen rather than from the number of
        # distinct labels in the train split.
        n_classes = int(np.concatenate((y_train, y_val, y_test)).max()) + 1
        params = dict(CLASSIFICATION_PARAMS, num_class=n_classes)

    bst = lgb.train(params, train_data, num_round, valid_sets=[valid_data])
    y_hat_test = bst.predict(N_test)

    if is_regression:
        rmse = np.sqrt(mean_squared_error(y_test, y_hat_test))
        print(rmse)
    else:
        # Multiclass predictions are per-class scores; pick the best class.
        y_hat_test = np.argmax(y_hat_test, axis=1)
        # With average='macro' the returned support entry is None.
        scores = precision_recall_fscore_support(y_test, y_hat_test, average='macro')
        print("Test score: precision.      {}, recall {}, F1 {}, support {}".format(*scores))

datbase used : adult
Test score: precision.      0.6225690887843323, recall 0.6558600331373522, F1 0.5914561737473192, support None
datbase used : aloi
