In [None]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import catboost
from tqdm.autonotebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
def load_data(A, base_path = "../../../DATA/all_o/"):
    """Load every CSV whose name contains ``A`` and split it into train/val/test.

    Parameters
    ----------
    A : int or str
        Value interpolated into the glob pattern ``base_path + "*<A>*.csv"``.
        NOTE(review): this is a substring match, so ``A=12`` would also match
        a file whose name contains e.g. ``112`` -- confirm against the naming
        scheme of the data directory.
    base_path : str
        Path prefix the pattern is appended to (must end with a separator).

    Returns
    -------
    (train, val, test) : tuple of pandas.DataFrame
        The concatenated data is first split 80/20 into (train + val) / test,
        then the first part is split 80/20 again into train / val. Both splits
        use ``random_state=42``.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    pattern = base_path + "*" + str(A) + "*.csv"
    # glob returns matches in arbitrary, filesystem-dependent order. Sorting
    # fixes the row order of the concatenated frame, which is what makes the
    # seeded splits below reproducible from one machine to another.
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # Same exception type pandas would raise on an empty concat, but with
        # a message that names the pattern that failed to match.
        raise ValueError(f"No CSV files match pattern {pattern!r}")
    df = pd.concat(pd.read_csv(f) for f in all_files)
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(train, test_size=0.2, random_state=42)
    return train, val, test


In [None]:
# Column-name configuration shared by the training and evaluation cells.

# Feature columns fed to the regressors (the fidelity column is appended
# separately by the model helpers).
input_cols: list = [
    "Ct1S0pp",
    "Ct1S0np",
    "Ct1S0nn",
    "Ct3S1",
    "C1S0",
    "C3P0",
    "C1P1",
    "C3P1",
    "C3S1",
    "CE1",
    "C3P2",
    "c1",
    "c2",
    "c3",
    "c4",
    "cD",
    "cE",
]

# Regression targets; the cells below select one of them by position.
target_cols: list = [
    "Energy ket",
    "Rch",
]

# Names of the fidelity / neutron-number / proton-number columns.
fidelity_col: str = "emax"
neutron_number_col: str = "N"
proton_number_col: str = "Z"


In [12]:
def fit_models(train, val, input_cols, target_cols, target_idx=0, n_models=10):
    """Fit an ensemble of CatBoost regressors that differ only in their seed.

    Each member is trained on ``train`` using ``input_cols`` plus the
    module-level ``fidelity_col`` as features and
    ``target_cols[target_idx]`` as the target, with the same columns of
    ``val`` passed as ``eval_set``. Member ``k`` uses ``random_seed=42 * k``.

    Returns the list of fitted models, in training order.
    """
    base_seed = 42
    features = input_cols + [fidelity_col]

    def _train_member(member_idx):
        # One ensemble member; only the seed changes between members.
        regressor = catboost.CatBoostRegressor(random_seed=base_seed * member_idx)
        regressor.fit(
            train[features],
            train[target_cols[target_idx]],
            eval_set=(val[features], val[target_cols[target_idx]]),
            verbose=1000,
        )
        return regressor

    return [_train_member(member_idx) for member_idx in range(n_models)]

def predict_models(models, test, input_cols, target_cols, target_idx=0):
    """Run every model in ``models`` on ``test`` and collect the outputs.

    The feature columns are ``input_cols`` followed by the module-level
    ``fidelity_col``. ``target_cols`` and ``target_idx`` are accepted but not
    used here.

    Returns a list with one ``model.predict(...)`` result per model, in the
    same order as ``models``.
    """
    features = input_cols + [fidelity_col]
    outputs = []
    for member in models:
        outputs.append(member.predict(test[features]))
    return outputs

In [None]:
# Per-A evaluation of the seed ensemble on target_cols[0] ("Energy ket").
# For each A the data is reloaded, an ensemble is fitted, and the RMSE of
# every ensemble member is computed on the held-out rows selected below.
# "RMSE" stores the mean over members, "std" the spread over members.
perf = {"A": [], "RMSE": [], "std": []}
idx = 0  # position of the target in target_cols
for a in tqdm(range(12, 25)):
    train, val, test = load_data(a)

    models = fit_models(train, val, input_cols, target_cols, idx)
    # NOTE(review): the variable is called emax8_test but the filter keeps
    # rows with fidelity == 10 -- confirm which value is intended. The name is
    # kept unchanged because later cells may read it.
    emax8_test = test[test[fidelity_col] == 10]
    # Pass idx explicitly so prediction stays tied to the target the models
    # were trained on instead of relying on predict_models' default of 0.
    predictions = predict_models(models, emax8_test, input_cols, target_cols, idx)
    perf["A"].append(a)
    # The true target column does not depend on the member; select it once.
    y_true = emax8_test[target_cols[idx]]
    rmses = [np.sqrt(np.mean((prediction - y_true)**2)) for prediction in predictions]
    perf["RMSE"].append(np.mean(rmses))
    perf["std"].append(np.std(rmses))


  0%|          | 0/13 [00:00<?, ?it/s]

Learning rate set to 0.049378
0:	learn: 9.2672147	test: 9.2646135	best: 9.2646135 (0)	total: 1.42ms	remaining: 1.42s
999:	learn: 0.1716061	test: 1.5351940	best: 1.5351940 (999)	total: 813ms	remaining: 0us

bestTest = 1.535194017
bestIteration = 999

Learning rate set to 0.049378
0:	learn: 9.2641930	test: 9.2616726	best: 9.2616726 (0)	total: 946us	remaining: 945ms
999:	learn: 0.1730778	test: 1.5410600	best: 1.5410600 (999)	total: 811ms	remaining: 0us

bestTest = 1.541060036
bestIteration = 999

Learning rate set to 0.049378
0:	learn: 9.2645645	test: 9.2744573	best: 9.2744573 (0)	total: 861us	remaining: 861ms
999:	learn: 0.1726299	test: 1.5174937	best: 1.5174937 (999)	total: 860ms	remaining: 0us

bestTest = 1.517493731
bestIteration = 999

Learning rate set to 0.049378
0:	learn: 9.2809799	test: 9.2915667	best: 9.2915667 (0)	total: 810us	remaining: 810ms
999:	learn: 0.1800471	test: 1.5159450	best: 1.5159450 (999)	total: 835ms	remaining: 0us

bestTest = 1.515945044
bestIteration = 999

Lea

In [14]:
# Show the collected per-A metrics (columns: A, RMSE, std) as a table.
pd.DataFrame.from_dict(perf)

Unnamed: 0,A,RMSE,std
0,12,2.521292,0.089402
1,13,3.255775,0.139735
2,14,4.541385,0.09037
3,15,3.880342,0.162737
4,16,5.534949,0.146694
5,17,3.618528,0.1796
6,18,5.294383,0.106664
7,19,6.209191,0.126401
8,20,4.891912,0.290094
9,21,8.109181,0.296395


In [None]:
# Per-A evaluation of the seed ensemble on target_cols[1] ("Rch").
# Same procedure as the energy evaluation: reload the data for each A, fit an
# ensemble, and compute each member's RMSE on the held-out rows selected below.
# "RMSE" stores the mean over members, "std" the spread over members.
perf_rc = {"A": [], "RMSE": [], "std": []}
idx = 1  # position of the target in target_cols
for a in tqdm(range(12, 25)):
    train, val, test = load_data(a)
    models = fit_models(train, val, input_cols, target_cols, idx)
    # NOTE(review): the variable is called emax8_test but the filter keeps
    # rows with fidelity == 10 -- confirm which value is intended. The name is
    # kept unchanged because later cells may read it.
    emax8_test = test[test[fidelity_col] == 10]
    # The models were trained with idx = 1; pass idx explicitly so prediction
    # does not fall back to predict_models' default target_idx of 0.
    predictions = predict_models(models, emax8_test, input_cols, target_cols, idx)
    perf_rc["A"].append(a)
    # The true target column does not depend on the member; select it once.
    y_true = emax8_test[target_cols[idx]]
    rmses = [np.sqrt(np.mean((prediction - y_true)**2)) for prediction in predictions]
    perf_rc["RMSE"].append(np.mean(rmses))
    perf_rc["std"].append(np.std(rmses))


  0%|          | 0/13 [00:00<?, ?it/s]

Learning rate set to 0.049378
0:	learn: 0.0761743	test: 0.0731613	best: 0.0731613 (0)	total: 1.91ms	remaining: 1.91s
999:	learn: 0.0025641	test: 0.0181416	best: 0.0181412 (998)	total: 1.28s	remaining: 0us

bestTest = 0.01814118894
bestIteration = 998

Shrink model to first 999 iterations.
Learning rate set to 0.049378
0:	learn: 0.0764563	test: 0.0735842	best: 0.0735842 (0)	total: 1.21ms	remaining: 1.21s
999:	learn: 0.0024964	test: 0.0176340	best: 0.0176320 (997)	total: 1.32s	remaining: 0us

bestTest = 0.01763200161
bestIteration = 997

Shrink model to first 998 iterations.
Learning rate set to 0.049378
0:	learn: 0.0761039	test: 0.0734301	best: 0.0734301 (0)	total: 1.21ms	remaining: 1.2s
999:	learn: 0.0024276	test: 0.0178350	best: 0.0178350 (999)	total: 1.23s	remaining: 0us

bestTest = 0.01783495754
bestIteration = 999

Learning rate set to 0.049378
0:	learn: 0.0764111	test: 0.0734552	best: 0.0734552 (0)	total: 1.7ms	remaining: 1.69s
999:	learn: 0.0023848	test: 0.0182665	best: 0.0182665

In [17]:
# Tabulate the per-A metrics collected in perf_rc and append N = A - 8.
# NOTE(review): the constant 8 is presumably the proton number Z of the
# isotopes in this data set -- confirm, and consider naming it.
_df = pd.DataFrame(perf_rc).assign(N=lambda frame: frame["A"] - 8)
_df

Unnamed: 0,A,RMSE,std,N
0,12,0.049409,0.002981,4
1,13,0.049835,0.001505,5
2,14,0.028476,0.00088,6
3,15,0.023995,0.001117,7
4,16,0.030032,0.000988,8
5,17,0.023884,0.001153,9
6,18,0.033915,0.001277,10
7,19,0.03756,0.000836,11
8,20,0.024551,0.001358,12
9,21,0.040918,0.001383,13
