In [1]:
import torch
import joblib
from tqdm import tqdm
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr

In [2]:
x_train = torch.load('../data/X_tensor_WPC.pt')
y_train = torch.load('../data/y_tensor_WPC.pt')

In [3]:
x_test = torch.load('../data/X_tensor_APSIPA.pt')
y_test = torch.load('../data/y_tensor_APSIPA.pt')

In [4]:
x_train = [[v.cpu().detach().numpy() for v in x] for x in x_train]
x_test = [[v.cpu().detach().numpy() for v in x] for x in x_test]

In [5]:
def get_etr_model():
    return ExtraTreesRegressor(
    n_estimators=37,
    min_samples_split=15,
    min_samples_leaf=4,
    max_features='log2',  # type: ignore
    max_depth=7
)

In [6]:
def get_svr_model():
    return SVR(
        kernel='rbf',
        gamma=1,  # type: ignore
        epsilon=0.01,
        degree=2,
        C=5
    )

In [7]:
def get_lgbm_model():
    return lgb.LGBMRegressor(
        subsample_for_bin=140000,
        reg_lambda=0.1,
        reg_alpha=1.0,
        num_leaves=100,
        n_estimators=166,
        min_split_gain=1,
        min_child_weight=0.0001,
        min_child_samples=20,
        learning_rate=0.1,
        colsample_bytree=1.0,
        boosting_type='dart'
    )

In [8]:
models = ['lgbm', 'svr', 'etr']

In [9]:
results = []
for model_name in models:
    result = {}
    if model_name == 'lgbm':
        model = get_lgbm_model()
    elif model_name == 'svr':
        model = get_svr_model()
    elif model_name == 'etr':
        model = get_etr_model()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    result[f'{model_name}-pearson'] = pearsonr(y_test, y_pred)[0]
    result[f'{model_name}-spearman'] = spearmanr(y_test, y_pred)[0]
    result[f'{model_name}-rmse'] = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append(result)   

In [10]:
df_results = pd.DataFrame(results)

In [11]:
to_concat = {
    'Fold': 'mean',
    'lgbm-pearson': df_results.loc[:, 'lgbm-pearson'].mean(),
    'lgbm-spearman': df_results.loc[:, 'lgbm-spearman'].mean(),
    'lgbm-rmse': df_results.loc[:, 'lgbm-rmse'].mean(),
    'svr-pearson': df_results.loc[:, 'svr-pearson'].mean(),
    'svr-spearman': df_results.loc[:, 'svr-spearman'].mean(),
    'svr-rmse': df_results.loc[:, 'svr-rmse'].mean(),
    'etr-pearson': df_results.loc[:, 'etr-pearson'].mean(),
    'etr-spearman': df_results.loc[:, 'etr-spearman'].mean(),
    'etr-rmse': df_results.loc[:, 'etr-rmse'].mean()
}

In [13]:
df_conc = pd.DataFrame([to_concat])
df_results = pd.concat([df_results, df_conc])

In [14]:
df_results

Unnamed: 0,lgbm-pearson,lgbm-spearman,lgbm-rmse,svr-pearson,svr-spearman,svr-rmse,etr-pearson,etr-spearman,etr-rmse,Fold
0,0.807764,0.849659,53.540133,,,,,,,
1,,,,0.845333,0.930901,59.653402,,,,
2,,,,,,,0.858049,0.925072,57.321624,
0,0.807764,0.849659,53.540133,0.845333,0.930901,59.653402,0.858049,0.925072,57.321624,mean
0,0.807764,0.849659,53.540133,0.845333,0.930901,59.653402,0.858049,0.925072,57.321624,mean


In [15]:
df_conc

Unnamed: 0,Fold,lgbm-pearson,lgbm-spearman,lgbm-rmse,svr-pearson,svr-spearman,svr-rmse,etr-pearson,etr-spearman,etr-rmse
0,mean,0.807764,0.849659,53.540133,0.845333,0.930901,59.653402,0.858049,0.925072,57.321624
