In [1]:
import pandas as pd 
import numpy as np 
from pathlib import Path
from pycaret.regression import *

In [2]:
data = np.load(Path().resolve().parents[1] / f"data/chrome_data.npy", allow_pickle=True)[()]
df_1 = pd.DataFrame(data['X'].astype('float64'), columns=data['features'])
df_1['CT_RT'] = data['y'].astype('float64')
del data

In [3]:
data = np.load(Path().resolve().parents[1] / f"data/aus_data.npy", allow_pickle=True)[()]
df_2 = pd.DataFrame(data['X'].astype('float64'), columns=data['features'])
df_2['CT_RT'] = data['y'].astype('float64')
del data

In [4]:
df = pd.concat([df_1, df_2])
del df_1, df_2

In [6]:
df.to_csv('/Users/mamu867/PNNL_Mac/PNNL_Code_Base/xmat-pnnl/xmat_pnnl_model/combined/df.csv')

In [6]:
exp = setup(data = df, 
            target = 'CT_RT', 
            session_id=123,
            normalize = True, 
            transformation = True,
            numeric_imputation = 'mean',
            #transform_target = True, 
            combine_rare_levels = True, 
            rare_level_threshold = 0.05,
            remove_multicollinearity = True, 
            multicollinearity_threshold = 0.95, 
            #train_size=0.8,
            log_experiment = True, 
            fold=5,
            experiment_name = 'combined')

Unnamed: 0,Description,Value
0,session_id,123
1,Target,CT_RT
2,Original Data,"(1302, 28)"
3,Missing Values,True
4,Numeric Features,24
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(911, 42)"


In [7]:
compare_models()


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,7102.2563,362855213.4314,18713.6177,0.6472,1.1047,4.3433,0.086
catboost,CatBoost Regressor,7739.067,359566198.5365,18682.1378,0.6465,1.4052,5.7123,1.02
xgboost,Extreme Gradient Boosting,7359.3254,378590230.4,18850.392,0.6385,1.2205,4.8847,0.302
lightgbm,Light Gradient Boosting Machine,8768.4828,383251375.1187,19386.1071,0.6131,1.562,12.8152,0.284
rf,Random Forest Regressor,8323.6238,389976081.0843,19507.1532,0.6117,1.2676,6.609,0.1
gbr,Gradient Boosting Regressor,8635.4338,395631652.8235,19590.2226,0.6114,1.6067,10.5115,0.038
dt,Decision Tree Regressor,9795.3349,582016646.0205,23668.6748,0.4222,1.1973,4.4014,0.008
llar,Lasso Least Angle Regression,16097.6314,708218146.1698,26307.0806,0.3004,2.4363,74.7073,0.01
lasso,Lasso Regression,16250.3432,715453843.2,26442.4207,0.293,2.4423,75.0947,0.312
lr,Linear Regression,16261.0789,715689657.6,26447.3387,0.2927,2.4455,75.2228,0.624


ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=123, verbose=0, warm_start=False)

In [8]:
ct = create_model('catboost', fold = 5)


Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,6277.3538,182685524.3917,13516.1209,0.667,1.2204,5.5968
1,8472.0495,449890446.1283,21210.6211,0.6462,1.3568,4.352
2,8461.4002,403496193.4618,20087.2147,0.6333,1.4367,5.7757
3,8743.6448,493092236.7858,22205.6803,0.5689,1.4375,5.5819
4,6740.8869,268666591.9148,16391.0522,0.7173,1.5747,7.2549
Mean,7739.067,359566198.5365,18682.1378,0.6465,1.4052,5.7123
SD,1019.919,116161119.5001,3247.141,0.0482,0.116,0.9235


In [9]:
tuned_ct = tune_model(ct)


Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,6735.0191,194042045.2594,13929.8975,0.6463,1.2588,4.1049
1,8866.9409,448490226.2454,21177.5878,0.6473,1.5225,6.7084
2,9629.4212,449442916.2051,21200.0688,0.5915,1.4013,4.8108
3,9851.7653,562672432.8246,23720.7174,0.5081,1.6142,9.6072
4,7328.2958,282858444.8357,16818.396,0.7023,1.607,10.4149
Mean,8482.2884,387501213.0741,19369.3335,0.6191,1.4808,7.1292
SD,1242.8988,131619108.8706,3511.4288,0.0656,0.1349,2.5153


In [10]:
predict_model(tuned_ct)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,7521.9461,305302722.6848,17472.914,0.6544,1.3725,6.1858


Unnamed: 0,C,Cr,Mn,Si,Ni,Co,Mo,W,Nb,Al,...,Temper1_640.0,Temper1_650.0,Temper1_660.0,Temper1_760.0,Temper1_765.0,Temper1_770.0,Temper1_780.0,Temper1_790.0,CT_RT,Label
0,-0.199037,-1.349402,-1.888111,-1.600327,1.995061,-1.789161,-1.056545,-1.068901,-0.874028,2.054660,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.400000,4210.126634
1,0.026261,0.265308,-0.371479,0.132361,0.116413,0.193795,0.822893,-0.380538,-0.171408,0.290397,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,779.099976,1809.535954
2,0.039212,0.537021,-0.475419,0.046641,0.005797,0.193795,0.134737,1.237683,-0.200027,0.268566,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,65363.398438,63970.194490
3,0.042450,0.396882,-0.407071,-0.020594,0.224327,0.193795,0.253315,1.289300,-0.204075,0.278529,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1983.000000,1472.721238
4,-0.199037,-1.373341,-1.920680,-1.715455,2.148574,-2.453279,-1.295636,-1.068901,-2.089392,0.673454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125.500000,2081.159316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,0.253332,1.357091,0.234148,0.063638,0.801256,0.276735,1.146110,0.863248,-0.320785,0.290397,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2109.100098,958.480066
387,0.854278,-1.208273,0.835199,0.370874,-1.455654,-0.102638,-1.279376,-1.068901,0.694992,-1.251391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4892.700195,7468.803627
388,-3.283520,-1.362493,-1.939678,-2.676654,2.114268,-1.789161,-1.143006,-1.068901,-1.999748,0.548775,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.900002,-801.349707
389,0.058644,1.357091,0.234148,0.012871,0.314892,0.193795,0.103921,1.275483,-0.204075,0.282495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,51539.000000,59683.705289


In [11]:
get_config('X_train')

Unnamed: 0,C,Cr,Mn,Si,Ni,Co,Mo,W,Nb,Al,...,Normal_980.0,Normal_not_available,Temper1_640.0,Temper1_650.0,Temper1_660.0,Temper1_760.0,Temper1_765.0,Temper1_770.0,Temper1_780.0,Temper1_790.0
305,0.058644,0.213272,-0.441713,-0.037210,-0.074873,0.193795,0.844477,-0.380538,-0.183734,0.252484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
121,-2.263077,-1.226938,1.041403,2.656355,-1.419049,0.165050,-1.263160,-1.068901,1.138778,-1.414825,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353,-3.283520,-1.362493,-1.939678,-2.676654,2.114268,-1.789161,-1.143006,-1.068901,-1.999748,0.548775,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.993878,-1.180741,-0.275220,-1.116960,-1.505613,-2.928905,-1.311942,-1.068901,2.056243,-1.787149,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
392,0.042450,0.396882,-0.407071,-0.020594,0.224327,0.193795,0.253315,1.289300,-0.204075,0.278529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,-1.339115,-1.393441,-1.869644,-1.541984,2.133018,-2.558031,-1.311126,-1.068901,-2.093195,1.092272,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
311,-0.199037,-1.349402,-1.888111,-1.600327,1.995061,-1.789161,-1.056545,-1.068901,-0.874028,2.054660,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,0.042450,0.239168,-0.508200,0.046641,-0.042444,0.193795,0.822893,-0.380538,-0.228148,0.282495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
205,-1.236042,-1.205146,1.149415,2.090198,-1.415394,0.400444,-1.222822,-1.068901,0.955512,-1.669947,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
