In [1]:
import torch
import joblib
from tqdm import tqdm
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr

In [2]:
x_train = torch.load('../data/X_tensor_APSIPA.pt')
y_train = torch.load('../data/y_tensor_APSIPA.pt')

In [3]:
X_train = []
for x in x_train:
    vector = [v.detach().numpy() for v in x]
    X_train.append(vector)

In [4]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
def get_split(input_x, input_y, indexes):
    out_x, out_y = [], []
    for index in indexes:
        out_x.append(input_x[index])
        out_y.append(input_y[index])
    return out_x, out_y

In [6]:
def get_etr_model():
    return ExtraTreesRegressor(
    n_estimators=37,
    min_samples_split=15,
    min_samples_leaf=4,
    max_features='log2',  # type: ignore
    max_depth=7
)

In [7]:
def get_svr_model():
    return SVR(
        kernel='rbf',
        gamma=1,  # type: ignore
        epsilon=0.01,
        degree=2,
        C=5
    )

In [8]:
def get_lgbm_model():
    return lgb.LGBMRegressor(
        subsample_for_bin=140000,
        reg_lambda=0.1,
        reg_alpha=1.0,
        num_leaves=100,
        n_estimators=166,
        min_split_gain=1,
        min_child_weight=0.0001,
        min_child_samples=20,
        learning_rate=0.1,
        colsample_bytree=1.0,
        boosting_type='dart'
    )

In [9]:
models = ['lgbm', 'svr', 'etr']

In [10]:
results = []
for i, (train_index, test_index) in tqdm(enumerate(kf.split(X_train))):
    result = {'Fold': i}
    xtrain, ytrain = get_split(X_train, y_train, train_index)
    xtest, ytest = get_split(X_train, y_train, test_index)
    for model_name in models:
        if model_name == 'lgbm':
            model = get_lgbm_model()
        if model_name == 'svr':
            model = get_svr_model()
        if model_name == 'etr':
            model = get_etr_model()
        model.fit(xtrain, ytrain)
        ypred = model.predict(xtest)
        result[f'{model_name}-pearson'] = pearsonr(ytest, ypred)[0]
        result[f'{model_name}-spearman'] = spearmanr(ytest, ypred)[0]
        result[f'{model_name}-mse'] = mean_squared_error(ytest, ypred)
    results.append(result)

5it [00:02,  2.12it/s]


In [11]:
df_results = pd.DataFrame(results)

In [12]:
to_concat = {
    'Fold': 'mean',
    'lgbm-pearson': df_results.loc[:, 'lgbm-pearson'].mean(),
    'lgbm-spearman': df_results.loc[:, 'lgbm-spearman'].mean(),
    'lgbm-mse': df_results.loc[:, 'lgbm-mse'].mean(),
    'svr-pearson': df_results.loc[:, 'svr-pearson'].mean(),
    'svr-spearman': df_results.loc[:, 'svr-spearman'].mean(),
    'svr-mse': df_results.loc[:, 'svr-mse'].mean(),
    'etr-pearson': df_results.loc[:, 'etr-pearson'].mean(),
    'etr-spearman': df_results.loc[:, 'etr-spearman'].mean(),
    'etr-mse': df_results.loc[:, 'etr-mse'].mean()
}

In [13]:
df_conc = pd.DataFrame([to_concat])
df_results = pd.concat([df_results, df_conc])

In [14]:
df_results

Unnamed: 0,Fold,lgbm-pearson,lgbm-spearman,lgbm-mse,svr-pearson,svr-spearman,svr-mse,etr-pearson,etr-spearman,etr-mse
0,0,0.886926,0.908853,0.27459,0.912153,0.918939,0.265252,0.905598,0.918477,0.288156
1,1,0.912899,0.908904,0.463565,0.915817,0.902961,0.345724,0.910011,0.906087,0.371756
2,2,0.947781,0.947909,0.384239,0.928589,0.922569,0.346167,0.932183,0.937628,0.350606
3,3,0.934168,0.951592,0.368444,0.927308,0.944781,0.315612,0.931915,0.952664,0.309957
4,4,0.890025,0.94508,0.363529,0.898692,0.943143,0.342812,0.892689,0.935775,0.376331
0,mean,0.91436,0.932468,0.370873,0.916512,0.926479,0.323113,0.914479,0.930126,0.339361
