# Validate the outputs: XGBoost regression on viscosity, electrical conductivity and density

In [1]:
# !pip install xgboost
# !pip install shap

In [14]:
import pandas as pd
from rdkit import Chem
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import defaultdict
from sklearn.metrics import mean_squared_error
import shap
import multiprocessing as mp


## Get Data

In [3]:
# Column dtypes for the CSV: the three identifier columns are read as
# categoricals, every other column falls back to float via the defaultdict.
dtypes = defaultdict(lambda: "float")
dtypes.update(dict.fromkeys(('cmp1ID', 'cmp1name', 'SMILES'), 'category'))

df = (
    pd.read_csv("../../data/IL/model_data_with_descriptors.csv", dtype=dtypes, na_values=["na"])
    # Strip stray whitespace from the header names: without this the last
    # column is loaded as 'chiralPhMoment\r' (trailing carriage return).
    .rename(columns=str.strip)
    # Give the three target columns short names.
    # NOTE(review): '&#8226;' is the HTML entity for a bullet character --
    # confirm the key matches the CSV header literally; rename() silently
    # ignores keys that are not present.
    .rename(columns={'Viscosity[Liquid]/Pa&#8226;s':'Viscosity',
                     'Electrical_conductivity[Liquid]/S/m':"ElecConductivity",
                     'Specific_density[Liquid]/kg/m3': 'Density'
                    })
)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a spurious "None" line to the output.
df.info()
print(df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3315 entries, 0 to 3314
Columns: 5674 entries, Temperature/K to chiralPhMoment
dtypes: category(3), float64(5671)
memory usage: 143.5 MB
None
Index(['Temperature/K', 'Pressure/kPa', 'Density', 'cmp1ID', 'cmp1name',
       'SMILES', 'Viscosity', 'ElecConductivity', 'MW', 'AMW',
       ...
       's1_numAroBonds', 's2_numAroBonds', 's3_numAroBonds', 's4_numAroBonds',
       's34_size', 's34_relSize', 's34_phSize', 's34_phRelSize',
       'chiralMoment', 'chiralPhMoment\r'],
      dtype='object', length=5674)


In [4]:
# Map every column name containing '[', ']' or '<' to the same name with those
# characters turned into underscores (presumably because XGBoost refuses such
# feature names -- TODO confirm), then apply the mapping to the frame.
_bracket_to_underscore = str.maketrans({'[': '_', ']': '_', '<': '_'})
replaces = {
    name: name.translate(_bracket_to_underscore)
    for name in df.columns
    if any(char in name for char in '[]<')
}
df = df.rename(columns=replaces)

In [5]:
# Regression targets, and identifier columns that must not be used as features.
outputs = ['Viscosity', 'ElecConductivity', 'Density']
irrelevant = ['cmp1ID', 'cmp1name', 'SMILES']

# Every remaining column is a model input. The list is built in the frame's
# own column order: going through list(set(...)) made the feature order change
# from one kernel start to the next (string hashing is randomized), which
# makes training runs and feature indices hard to reproduce.
_excluded = set(outputs) | set(irrelevant)
inputs = [col for col in df.columns if col not in _excluded]

In [6]:
# Feature matrix and target matrix, selected by the column lists defined above.
X, y = df[inputs], df[outputs]

In [7]:
# Random train/test partition of features and targets; the fixed seed keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [8]:
# Wrap both partitions in XGBoost DMatrix objects (features plus labels) for
# the native training API.
dtrain_reg = xgb.DMatrix(X_train, label=y_train)
dtest_reg = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Half of the CPU count reported by multiprocessing -- the same value the
# training cell passes as "n_jobs". Shown here as the cell output.
half_cores = mp.cpu_count() // 2
half_cores

In [None]:
# Number of boosting rounds.
n = 1000

# Booster configuration: "gpu_hist" tree method on GPU 0, squared-error
# regression objective, and n_jobs set to half the reported CPU count.
# Settings tried earlier and currently disabled: eta=0.05, max_depth=10,
# device="cuda", enable_categorical=True, max_cat_to_onehot=1,
# predictor="gpu_predictor", and the plain "hist" tree method.
params = dict(
    tree_method="gpu_hist",
    gpu_id=0,
    objective="reg:squarederror",
    n_jobs=mp.cpu_count() // 2,
)

# Train through the native xgb.train API on the training DMatrix (fitting an
# xgb.XGBRegressor on X_train / y_train was the earlier alternative).
model = xgb.train(params, dtrain_reg, num_boost_round=n)

In [None]:
# Predict the targets for the held-out rows.
preds = model.predict(dtest_reg)

# Root-mean-squared error per target column. The `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the root is taken
# explicitly; averaging the per-target roots reproduces the value that
# `squared=False` returned for multi-column targets.
per_target_rmse = mean_squared_error(y_test, preds, multioutput="raw_values") ** 0.5
rmse = per_target_rmse.mean()
print(f"RMSE of the base model: {rmse:.3f}")

# The targets come from columns with different units (Pa*s, S/m, kg/m3 in the
# original headers), so the per-target values are reported alongside the mean.
for target_name, target_rmse in zip(y_test.columns, per_target_rmse):
    print(f"  {target_name}: {target_rmse:.4g}")

In [None]:
# Coefficient of determination of the predictions on the held-out rows,
# computed by r2_score with its default settings.
r2 = r2_score(y_test, preds)
print(r2)

In [None]:
# Point the booster at the CUDA device, then have XGBoost itself compute the
# per-feature prediction contributions for the training matrix.
model.set_param("device", "cuda")
shap_values = model.predict(dtrain_reg, pred_contribs=True)

In [None]:
# shap_interaction_values = model.predict(dtrain_reg, pred_interactions=True)

In [None]:
# Explain one regression target with SHAP's tree explainer and draw a force
# plot for the first training row. Index 0 is the first entry of the target
# list the model was trained on.
target_idx = 0

explainer = shap.TreeExplainer(model)
raw_shap = explainer.shap_values(X_train)

# For a multi-output model the return format depends on the shap version:
# a list with one (rows, features) array per output, or a single
# (rows, features, outputs) array. Reduce either form to the 2-D
# (rows, features) array of the chosen target.
# NOTE(review): confirm against the installed shap version.
if isinstance(raw_shap, list):
    shap_values = raw_shap[target_idx]
elif getattr(raw_shap, "ndim", 2) == 3:
    shap_values = raw_shap[:, :, target_idx]
else:
    shap_values = raw_shap
print(shap_values)

# expected_value holds one base value per output for multi-output models;
# force_plot needs the single base value that belongs to the chosen target.
base_value = explainer.expected_value
if isinstance(base_value, (list, tuple)) or getattr(base_value, "ndim", 0) >= 1:
    base_value = base_value[target_idx]

shap.force_plot(
    base_value,
    shap_values[0, :],
    # X_train is a DataFrame, so positional row access has to go through
    # .iloc; X_train[0, :] is treated as a column lookup and raises.
    X_train.iloc[0, :],
    matplotlib=True,
)