In [1]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_friedman2, make_regression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelBinarizer, OneHotEncoder
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, RidgeCV
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor,VotingRegressor, StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.svm import LinearSVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [2]:
pd.options.display.max_columns = None

cv = KFold(n_splits=5)
simple_pre_flag = True
scale_data = False

cat_cols = ['Neutered/Spayed', 'Gender', 'MIXED_BREED_FLAG', 'BREED', 'neuter_dt']
num_cols = ['ALB', 'ALKP', 'ALT', 'AMYL', 'BUN', 'CA', 'CHOL', 'CREA', 'EOSINOPHIL', 'GLOB', 'GLU',
       'HCT', 'HGB', 'LYMPHOCYTE', 'MCH', 'MCHC', 'MCV', 'MONOCYTE', 'MPV',
       'PHOS', 'PLT', 'RBC', 'RDW', 'TBIL', 'TP', 'WBC']

In [3]:
if simple_pre_flag:
    X_train = pd.read_csv('data/X_train_pre_simple.csv')
    X_test = pd.read_csv('data/X_test_pre_simple.csv')
    y_train = pd.read_csv('data/y_train_pre_simple.csv').to_numpy()
    y_test = pd.read_csv('data/y_test_pre_simple.csv').to_numpy()

else:
    X_train = pd.read_csv('data/X_train_pre.csv')
    X_test = pd.read_csv('data/X_test_pre.csv')
    y_train = pd.read_csv('data/y_train_pre.csv').to_numpy()
    y_test = pd.read_csv('data/y_test_pre.csv').to_numpy()

X_train.head()

Unnamed: 0,MIXED_BREED_FLAG_0,MIXED_BREED_FLAG_1,BREED_0,BREED_1,BREED_2,BREED_3,BREED_4,BREED_5,BREED_6,BREED_7,BREED_8,BREED_9,BREED_10,BREED_11,BREED_12,BREED_13,BREED_14,BREED_15,BREED_16,BREED_17,BREED_18,BREED_19,BREED_20,BREED_21,BREED_22,BREED_23,BREED_24,BREED_25,BREED_26,BREED_27,BREED_28,BREED_29,BREED_30,BREED_31,BREED_32,BREED_33,BREED_34,BREED_35,BREED_36,BREED_37,BREED_38,BREED_39,BREED_40,BREED_41,BREED_42,BREED_43,BREED_44,BREED_45,BREED_46,BREED_47,BREED_48,BREED_49,BREED_50,BREED_51,BREED_52,BREED_53,BREED_54,BREED_55,BREED_56,BREED_57,BREED_58,BREED_59,BREED_60,BREED_61,BREED_62,BREED_63,BREED_64,BREED_65,BREED_66,BREED_67,BREED_68,BREED_69,BREED_70,BREED_71,BREED_72,BREED_73,BREED_74,BREED_75,BREED_76,BREED_77,BREED_78,BREED_79,BREED_80,BREED_81,BREED_82,BREED_83,BREED_84,BREED_85,BREED_86,BREED_87,BREED_88,BREED_89,BREED_90,BREED_91,BREED_92,BREED_93,BREED_94,neuter_dt_0,ALB,ALKP,ALT,AMYL,BUN,CA,CHOL,CREA,EOSINOPHIL,GLOB,GLU,HCT,HGB,LYMPHOCYTE,MCH,MCHC,MCV,MONOCYTE,MPV,PHOS,PLT,RBC,RDW,TBIL,TP,WBC,Neutered/Spayed_0,Neutered/Spayed_1,Neutered/Spayed_2,Gender_0
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.565117,-0.083936,3.298568,0.055539,-0.923631,1.045984,0.639321,-0.119074,3.928146e-16,1.405175,0.291423,0.981101,0.7352667,-1.405885e-15,0.06288612,-0.04646169,0.4715268,-1.40082,-0.4903108,-0.59361,-1.944974,0.079278,-0.2674955,0.801551,1.394026,0.1218854,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.014025,3.13968,0.555261,-0.959741,-0.690719,0.441009,-1.1869,-1.000346,3.928146e-16,-1.126764,-0.937075,0.0,8.597155e-16,-1.405885e-15,4.628292e-16,1.066452e-15,1.727638e-15,0.0,1.207058e-15,1.676055,1.485744e-15,0.0,2.121587e-15,-0.530343,-1.410314,-4.822035e-17,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,2.144259,-0.517364,-0.391833,-0.954587,-1.389455,0.138522,1.210015,-0.119074,-0.9681104,-0.765058,0.09947,-0.146499,-0.244786,2.259567,0.04725315,-0.09599132,0.4715268,-0.535286,-0.5854428,-0.671874,-1.396087,-0.091589,-0.3112883,0.040468,0.552723,-0.1778024,1.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.565117,-0.65281,-1.07766,3.802281,-0.457807,-0.466454,-1.985872,0.006822,-0.9681104,0.320059,-0.246045,-0.941572,-0.7075889,0.7855043,-0.02244386,0.1951833,-0.379475,1.368889,-0.1641447,-0.906667,1.485744e-15,-0.133847,0.3296782,0.230739,0.412507,-0.08985056,0.0,1.0,0.0,-1.0
4,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.014025,-0.571542,1.175771,-0.70721,-0.690719,-0.617696,0.582251,-0.119074,0.06385263,-0.2225,1.212797,0.0,-0.06329475,-1.123851,-0.09148949,1.066452e-15,1.727638e-15,-0.950742,1.207058e-15,-2.315424,-1.152137,0.056312,0.6242838,-0.530343,-0.849446,0.04859223,1.0,0.0,0.0,1.0


In [4]:
if scale_data:
    scaler = StandardScaler() #StandardScaler() #RobustScaler() # or MinMaxScaler()

    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)

X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
X_train.shape, y_train.shape

((721512, 128), (721512, 1))

In [5]:
model = LGBMRegressor(n_jobs=-1)

n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: -2.113 (0.004)


In [6]:
model = CatBoostRegressor(thread_count=-1)

n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: -2.085 (0.004)


In [7]:
model = XGBRegressor(n_jobs=-1)

n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

KeyboardInterrupt: 

In [None]:
model = RandomForestRegressor(random_state=42, n_jobs=-1)
                    
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
# Ensemble regressors
reg1 = GradientBoostingRegressor(random_state=1)
reg2 = RandomForestRegressor(random_state=42, n_jobs=-1) # takes longest
reg3 = SGDRegressor(max_iter=1000, tol=1e-3)
reg4 = XGBRegressor(n_jobs=-1)
reg5 = LGBMRegressor(n_jobs=-1)
reg6 = CatBoostRegressor(thread_count=-1)

In [None]:
vote_reg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('sgd', reg3), ('xgbr', reg4), ('lgbmr', reg5), ('cbr', reg6)])

n_scores = cross_val_score(vote_reg, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
stack_reg = StackingRegressor(
    estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3), ('xgbr', reg4), ('lgbmr', reg5), ('cbr', reg6)],
    final_estimator=XGBRegressor()
)

n_scores = cross_val_score(stack_reg, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
stack_reg.fit(X_train, y_train)
print('lRMSE: %.3f' % mean_squared_log_error(y_test, stack_reg.predict(X_test)))

In [None]:
model = CatBoostRegressor(eval_metric='MAE', thread_count=-1, metric_period=100)
model.fit(X_train, y_train)

In [None]:
print("MAE: ", mean_absolute_error(y_test, model.predict(X_test)))
for i in range(10):
    test_item = X_test[i].reshape(1, -1)
    print("GT: ", y_test[i])
    print("Regressor: ", model.predict(test_item))