# Jupyter notebook to demonstrate functionality of the MIxgboostErrPDF wrapper

In [None]:
#  from RootInteractive.MLpipeline.test_MIxgboostErrPDF import *
import pandas as pd
import numpy as np
import pickle
from RootInteractive.InteractiveDrawing.bokeh.bokehDrawSA import *
from RootInteractive.MLpipeline.NDFunctionInterface import *
#from bokeh.io import output_notebook
from RootInteractive.MLpipeline.RandoForestErrPDF import *
from RootInteractive.MLpipeline.MIForestErrPDF import *
from RootInteractive.MLpipeline.local_linear_forest import LocalLinearForestRegressor
import pdb;
import sys
import os;
import xgboost as xgb
from RootInteractive.MLpipeline.MIxgboostErrPDF import *
from scipy.signal import medfilt

In [None]:
from xgboost import XGBRegressor

## Generate input data

In [None]:
def generateF1(nPoints, n, outFraction, stdIn, seed=None):
    """
    Generate a pandas DataFrame with a noisy toy function of random inputs.

    Columns A, B, C, D are uniform random vectors (B is shifted by +0.5); only A and C
    are used to define the function:
        * valueOrig = 2*A*sin(n*6.28*C)   - noise-free function value
        * noise     = N(0, stdIn) plus, for a fraction outFraction of the rows,
                      an additional N(0, 2) outlier term
        * value     = valueOrig + noise
        * gradA, gradC, grad - analytic partial derivatives of valueOrig with respect
                      to A and C, and the norm of the two

    :param nPoints:     number of rows to generate
    :param n:           frequency multiplier of the sine (6.28 ~ 2*pi, i.e. about n periods for C in [0,1))
    :param outFraction: probability for a row to receive the additional outlier noise
    :param stdIn:       standard deviation of the Gaussian noise
    :param seed:        optional seed; when given, a private np.random.RandomState(seed) is used so
                        the output is reproducible and the global state is untouched. With the
                        default (None) the global np.random state is used.
    :return:            DataFrame with columns A, B, C, D, noise, csin, ccos, valueOrig, value,
                        gradA, gradC, grad
    """
    # The np.random module and a RandomState instance expose the same random_sample/normal methods.
    rng = np.random if seed is None else np.random.RandomState(seed)
    df = pd.DataFrame(rng.random_sample(size=(nPoints, 4)), columns=list('ABCD'))
    df["B"] = df["B"] + 0.5
    df["noise"] = rng.normal(0, stdIn, nPoints)
    # outliers: boolean mask (probability outFraction) times a wide Gaussian
    df["noise"] += (rng.random_sample(nPoints) < outFraction) * rng.normal(0, 2, nPoints)
    df["csin"] = np.sin(n*6.28 * df["C"])
    df["ccos"] = np.cos(n*6.28 * df["C"])
    df["valueOrig"] = 2*df["A"]*df["csin"]
    df["value"] = df["valueOrig"] + df["noise"]
    # Partial derivatives of valueOrig = 2*A*sin(n*6.28*C); the factor 2 belongs to both of them.
    df["gradA"] = 2*df["csin"]
    df["gradC"] = 2*df["A"]*df["ccos"]*n*6.28
    df["grad"] = np.sqrt(df["gradA"]**2 + df["gradC"]**2)
    return df

* generate data and reference data

In [None]:
%%time
# generateF1 draws from the global numpy random state: fix the seed so that
# "Restart Kernel -> Run All" reproduces the same training and reference samples.
np.random.seed(12345)

# Configuration of the toy data set
nPoints = 500000     # number of generated points per sample
outFraction = 0.0    # fraction of points receiving additional outlier noise
n_jobs = 16          # not referenced by the cells shown in this part of the notebook
stdIn = 0.2          # sigma of the Gaussian noise added to the function value
n = 2                # frequency multiplier of the sine in generateF1

# Two statistically independent samples: one for the fit, one as reference
df = generateF1(nPoints, n=n, outFraction=outFraction, stdIn=stdIn)
dfRef = generateF1(nPoints, n=n, outFraction=outFraction, stdIn=stdIn)

## Make adaptive xgboost fitter and add xgboost for the reducible/irreducible error estimate

In [None]:
%%time
# Target column and the input features used by the regression
varFit = 'value'
variableX = ['A', "B", "C"]

# Training options handed to the MIxgboostErrPDF wrapper
paramTrain = {
    'learning_rate': 0.2,
    'max_depth': 10,
    "n_estimators": 200,
    "subsample": 0.50,
    "coeff_learning_rate": 0.2,
    "max_learning_rate": 0.2,
}

xgbErrPDF = MIxgboostErrPDF(paramTrain)
# The three arguments are the feature matrix, the target as numpy array and the target as pandas Series
xgbErrPDF.fit3Fold(
    df[variableX].to_numpy(),
    df["value"].to_numpy(),
    df["value"],
)

In [None]:
%%time
# Second fitting stage of the wrapper. The regXGBFacRed regressors read by the export cell
# are presumably produced here - TODO confirm against MIxgboostErrPDF.fitReducible.
xgbErrPDF.fitReducible()

## Export residuals, error estimates, pulls  

In [None]:
%%time
# When the wrapper did not record an early-stopping iteration, take the upper bound of the
# iteration range reported by the first fold's regressor instead.
# NOTE(review): _get_iteration_range is a private XGBoost method and may change between releases;
# presumably (0,0) resolves to the full range of trained boosting rounds - TODO confirm.
if xgbErrPDF.earlyStop<=0:
    ir=(0,0)
    xgbErrPDF.earlyStop=xgbErrPDF.regXGBFac[0]._get_iteration_range(ir)[1]
earlyStop=xgbErrPDF.earlyStop

# The feature columns do not change inside the loop (only new result columns are appended to df),
# so the feature matrix is converted once instead of four times per fold.
featureMatrix=df[variableX].to_numpy()

for iSample in [0,1,2]:
    # predictStat is called with two different iteration windows; only the "stdN" entry of the
    # returned mapping is used. NOTE(review): the window width 95 is a hard-coded magic number.
    x=xgbErrPDF.predictStat(featureMatrix,{},iSample,0,(earlyStop//2,earlyStop),0)
    y=xgbErrPDF.predictStat(featureMatrix,{},iSample,1,(earlyStop,earlyStop+95),0)
    df[f"stdN{iSample}"]=x["stdN"]
    df[f"stdNR{iSample}"]=y["stdN"]
    # Point predictions of the per-fold regressors (regXGBFac) and of the regXGBFacRed regressors
    xP=xgbErrPDF.regXGBFac[iSample].predict(featureMatrix)
    yP=xgbErrPDF.regXGBFacRed[iSample].predict(featureMatrix)
    df[f"meanR{iSample}"]=yP
    # Residuals with respect to the noise-free function value
    df[f"deltaOrigN{iSample}"]=xP-df["valueOrig"]
    df[f"deltaOrigNR{iSample}"]=yP-df["valueOrig"]

In [None]:
# Average the per-fold error estimate (stdNR) and the per-fold prediction (meanR) over the three folds
for col in ("stdNR", "meanR"):
    fold0, fold1, fold2 = (df[f"{col}{i}"] for i in range(3))
    df[col] = (fold0 + fold1 + fold2)/3.

In [None]:
# Scale factors that turn the relative error estimate stdNR{i} into an absolute one:
#   stdDiff0[i]    - spread of the difference between fold i and the next fold, divided by sqrt(2)
#   stdDiffOrig[i] - spread of fold i around the noise-free function (for comparison only)
stdDiff0 = {}
stdDiffOrig = {}

for i in range(3):
    nextFold = (i+1)%3
    foldDifference = df[f"meanR{i}"] - df[f"meanR{nextFold}"]
    stdDiff0[i] = foldDifference.std()/np.sqrt(2.)
    residualOrig = df[f"meanR{i}"] - df["valueOrig"]
    stdDiffOrig[i] = residualOrig.std()
    df[f"stdNorm{i}"] = df[f"stdNR{i}"]*stdDiff0[i]

# Entry 3 belongs to the three-fold average: mean single-fold factor reduced by sqrt(3)
meanFoldStd = (stdDiff0[0]+stdDiff0[1]+stdDiff0[2])/3
stdDiff0[3] = meanFoldStd/np.sqrt(3.)
df["stdNorm"] = df["stdNR"]*stdDiff0[3]
print(stdDiff0,stdDiffOrig)

In [None]:
# Residuals with respect to the noise-free function: three-fold average and pulls
deltaFold0, deltaFold1, deltaFold2 = (df[f"deltaOrigNR{k}"] for k in range(3))
df["deltaOrigNR"] = (deltaFold0 + deltaFold1 + deltaFold2)/3
errFold0 = df["stdNR0"]*stdDiff0[0]
df["pullNR0"] = df["deltaOrigNR0"]/errFold0
errMean = df["stdNR"]*stdDiff0[3]
df["pullNR"] = df["deltaOrigNR"]/errMean

# Fold-to-fold differences: fold 0 against fold 1, and fold 0 against the mean of folds 1 and 2
df["deltaNR0_1"] = df["meanR0"] - df["meanR1"]
meanFold12 = (df["meanR1"]+df["meanR2"])*0.5
df["deltaNR0_12"] = df["meanR0"] - meanFold12
# stdNorm carries the 1/sqrt(3) of the three-fold average (stdDiff0[3]); the factor 3 under the
# square root undoes it, 2 and 1.5 are the variance factors of the two differences
# (assuming independent folds).
df["pullNR0_1"] = df["deltaNR0_1"]/(np.sqrt(2.*3.)*df["stdNorm"])
df["pullNR0_12"] = df["deltaNR0_12"]/(np.sqrt(1.5*3.)*df["stdNorm"])

## Static visualization

In [None]:
# Error estimates of fold 0 versus fold 1, restricted to A>0.9
highA = df.query("A>0.9")
highA.plot.scatter(
    x=["stdNR0"],
    y=["stdNR1"],
    figsize=(8, 5),
    grid=True,
)
# Fold-averaged residual versus fold-averaged error estimate for all rows with A>0
positiveA = df.query("A>0")
positiveA.plot.scatter(
    x=["stdNR"],
    y=["deltaOrigNR"],
    figsize=(8, 5),
    grid=True,
)

# Make interactive visualization using RootInteractive

* make widgets

In [None]:
# Client-side parameters of the dashboard; each entry has a name, a start value and either
# a list of options or a numeric range.
parameterArray = [
    {"name": "colorZT0", "value":"A", "options":["A","B","C","csin","stdNR0","stdNR"]},
    {"name": "markerSize", "value":4, "range":[0, 15]},
    {"name": "legendFontSize", "value":"11px", "options":['3px','4px','5px','7px',"9px", "11px", "13px", "15px", "17px", "19px"]},
    {"name": "legendLocation", "value":"top_right", "options":["top_right","top_left", "bottom_right","bottom_left"]},
    {"name": "nPoints", "range":[0, 2000], "value": 200}
]

# Widget list. Entries 0-4 control the parameters declared above, entries 5-12 are range
# selections on data columns. The position in this list matters: the layout below refers to
# the widgets by index.
widgetParams = [
    ['select',["colorZT0"], {"callback": "parameter", "default": 0}],
    ['slider',["markerSize"], {"callback": "parameter"}],
    ['select',["legendFontSize"], {"callback": "parameter", "default": 2}],
    ['select',["legendLocation"], {"callback": "parameter", "default": 0}],
    ['slider', ['nPoints']],
    # variables
    ['range', ['A']],
    ['range', ['B']],
    ['range', ['C']],
    ['range', ['csin']],
    ['range', ['deltaOrigNR']],
    ['range', ['deltaOrigNR0']],
    ['range', ['stdNR']],
    ['range', ['stdNR0']],
    
]

# Widget layout: presumably one tab per dictionary key, each inner list being one row of
# widgetParams indices - TODO confirm against the RootInteractive layout documentation.
widgetLayoutDescT0=[  [5,6,7,8], [9,10,11,12], {'sizing_mode':'scale_width',"legend_visible":True} ]
widgetLayoutDesc={
    "Selection": widgetLayoutDescT0,
    "Graphics": [[0,1,2,3,4], {'sizing_mode': 'scale_width'}]
}

In [None]:
# Histogram definitions; the names are referenced from figureArray, either directly (1D)
# or with a "_0" suffix as data source (2D).
histoArray = [
    # 1D histograms of residuals and pulls with respect to the noise-free function (fold 0 and fold average)
    {"name": "hisdeltaOrigNR0_1D", "variables": ["deltaOrigNR0"], "nbins": 100},
    {"name": "hisdeltaOrigNR_1D", "variables": ["deltaOrigNR"], "nbins": 100},
    {"name": "hisPullNR0_1D", "variables": ["pullNR0"], "nbins": 100,"range":[-6,6]},
    {"name": "hisPullNR_1D", "variables": ["pullNR"], "nbins": 100,"range":[-6,6]},
    # 2D histograms of residual / pull versus the error estimate
    {"name": "hisdeltaOrigNR0x2D", "variables": ["deltaOrigNR0","stdNR0"], "nbins": [50,20],"axis": [0, 1],},
    {"name": "hisdeltaOrigNRx2D", "variables": ["deltaOrigNR","stdNR"], "nbins": [50,20],"axis": [0, 1],},
    {"name": "hisPullNR0x2D", "variables": ["pullNR0","stdNR0"], "nbins": [50,20],"axis": [0, 1],},
    {"name": "hisPullNRx2D", "variables": ["pullNR","stdNR"], "nbins": [50,20],"axis": [0, 1],},
    # 1D histograms of the fold-to-fold differences and their pulls
    {"name": "hisdeltaNR0_1_1D", "variables": ["deltaNR0_1"], "nbins": 100},
    {"name": "hisdeltaNR0_12_1D", "variables": ["deltaNR0_12"], "nbins": 100},
    {"name": "hisPullNR0_1_1D", "variables": ["pullNR0_1"], "nbins": 100,"range":[-6,6]},
    {"name": "hisPullNR0_12_1D", "variables": ["pullNR0_12"], "nbins": 100,"range":[-6,6]},
]

In [None]:
# Figure definitions. The position of every entry matters: the three layout descriptions
# below address the figures by their index in this list.
figureArray = [
    # tab 0 (indices 0-8): residuals and pulls with respect to the noise-free function
    [['C'], ['valueOrig'], {"colorZvar":"colorZT0"}],
    [['C'], ['deltaOrigNR+valueOrig'], {"colorZvar":"colorZT0","errY":"stdNorm"}],
    [['C'], ['deltaOrigNR0'], {"colorZvar":"colorZT0","errY":"stdNorm0"}],
    [['C'], ['deltaOrigNR'], {"colorZvar":"colorZT0","errY":"stdNorm"}],
    [['deltaOrigNR0'], ['hisdeltaOrigNR0_1D'],{"size":"markerSize"}],
    [['deltaOrigNR'], ['hisdeltaOrigNR_1D']],
    [['pullNR0'], ['hisPullNR0_1D'],{"size":"markerSize"}],
    [['pullNR'], ['hisPullNR_1D']],
    ["tableHisto", {"rowwise": True,"exclude": r".*2D"}],
    # tab 1 (indices 9-14): pulls versus C and std of the 2D histogram projections
    [['C'], ['pullNR0'], {"colorZvar":"colorZT0"}],
    [['C'], ['pullNR'], {"colorZvar":"colorZT0"}],
    [['bin_center_1'],['std'],  {"source": "hisdeltaOrigNR0x2D_0"}],
    [ ['bin_center_1'],['std'], {"source": "hisdeltaOrigNRx2D_0"}],
    [['bin_center_1'],['std'],  {"source": "hisPullNR0x2D_0"}],
    [['bin_center_1'],['std'],  {"source": "hisPullNRx2D_0"}],
    # tab 2 (indices 15-23): fold-to-fold comparison
    [['C'], ['meanR'], {"colorZvar":"colorZT0"}],
    # NOTE(review): meanR0 is a single-fold prediction but is drawn with the fold-averaged error
    # "stdNorm"; the other single-fold figures use "stdNorm0" - confirm which one is intended.
    [['C'], ['meanR0'], {"colorZvar":"colorZT0","errY":"stdNorm"}],
    [['C'], ['deltaNR0_1'], {"colorZvar":"colorZT0","errY":"sqrt(2.)*stdNorm0"}],
    [['C'], ['deltaNR0_12'], {"colorZvar":"colorZT0","errY":"sqrt(1.5)*stdNorm0"}],
    [['deltaNR0_1'], ['hisdeltaNR0_1_1D'],{"size":"markerSize"}],
    [['deltaNR0_12'], ['hisdeltaNR0_12_1D']],
    [['pullNR0_1'], ['hisPullNR0_1_1D'],{"size":"markerSize"}],
    [['pullNR0_12'], ['hisPullNR0_12_1D']],
    ["tableHisto", {"rowwise": True,"exclude": r".*2D"}],
    # options shared by all figures (last entry of the list)
    {"size":"markerSize","legend_options": {"label_text_font_size": "legendFontSize" , "location":"legendLocation"}}
    
]    

# Columns shown in the hover tooltip
tooltips = [("A","@A"),("C","@C"),("stdNR","@stdNR"),("pullNR","@pullNR")]
# Layout of the three tabs: each inner list is one row of figureArray indices followed by row options.
# NOTE(review): the row [2,3] uses "commonX":3 while every other row points to its first figure.
figureLayoutDescT0=[ [0,1,  {'plot_height':110, "commonX":0}],     [2,3,  {'plot_height':110, "commonX":3}],    
                    [4,5,  {'plot_height':110, "commonX":4}],  [6,7,  {'plot_height':110, "commonX":6}], [8 , {'plot_height':30}],   {'sizing_mode':'scale_width',"legend_visible":True}]
figureLayoutDescT1=[ [9,10,  {'plot_height':150, "commonX":9}],     [11,12,  {'plot_height':150, "commonX":11}],    
                    [13,14,  {'plot_height':150, "commonX":13}],     {'sizing_mode':'scale_width',"legend_visible":True}]
figureLayoutDescT2=[ [15,16,  {'plot_height':110, "commonX":15}],     [17,18,  {'plot_height':110, "commonX":17}],    
                    [19,20,  {'plot_height':110, "commonX":19}],  [21,22,  {'plot_height':110, "commonX":21}],  [23 , {'plot_height':30}],  {'sizing_mode':'scale_width',"legend_visible":True}]

# Tab title -> layout description
figureLayoutDesc={
    "xgboost ΔMC tab": figureLayoutDescT0,
    "xgboost pullMC tab": figureLayoutDescT1,
    "xgboost ΔFold tab": figureLayoutDescT2,
}
# The output file name encodes the configuration of the generated data set
output_file(f"MIxgboostErrPDF_n{n}_stdIn{stdIn}_nPoints{nPoints}.html")

# Compression applied to every column (pattern ".*"): relative precision 8, then code, zip and base64 steps
arrayCompressionRelative8=[(".*",[("relative",8), ("code",0), ("zip",0), ("base64",0)])]
# Alternative call kept for reference: selection "A>0" and a fixed nPointRender=2000
#figC=bokehDrawSA.fromArray(df, "A>0", figureArray, widgetParams,layout=figureLayoutDesc,tooltips=tooltips,sizing_mode='scale_width',widgetLayout=widgetLayoutDesc,nPointRender=2000,
#                           rescaleColorMapper=True,arrayCompression=arrayCompressionRelative8,histogramArray=histoArray,parameterArray=parameterArray)
# Build the dashboard for the rows passing the selection; the number of rendered points is
# bound to the "nPoints" parameter declared in parameterArray.
figC=bokehDrawSA.fromArray(df, "stdNR0>0.5&stdNR>0.6", figureArray, widgetParams,layout=figureLayoutDesc,tooltips=tooltips,sizing_mode='scale_width',widgetLayout=widgetLayoutDesc,nPointRender="nPoints",
                           rescaleColorMapper=True,arrayCompression=arrayCompressionRelative8,parameterArray=parameterArray,histogramArray=histoArray)


In [None]:
# Display the deltaNR0_12 column (fold 0 minus the mean of folds 1 and 2) for a quick inspection
df["deltaNR0_12"]

# Backup