## Data Prepartion

Prepare train, test data from https://github.com/dmlc/xgboost/tree/master/demo/data

Over sample y=1 in train with beta=2. (odds ratio doubled after oversampling)

In [1]:
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
import numpy as np
import pandas as pd
import json

import warnings
warnings.simplefilter(action='ignore')
warnings.filterwarnings("ignore")

np.random.seed(1234)

dtrain = load_svmlight_file('agaricus.txt.train')
dtest = load_svmlight_file('agaricus.txt.test')

fnames=['f'+str(i) for i in range(0,126)]  # same as feature naming by xgboost
train = pd.DataFrame(dtrain[0].toarray(),columns=fnames).assign(y=dtrain[1])
test = pd.DataFrame(dtest[0].toarray()).assign(y=dtest[1])

# oversampling
beta=2
train['rnd']=np.random.uniform(0,1,train.shape[0])
train_sampled=train[(train['rnd']<1/beta) | (train['y']==1.0)]
print('freq of y in train:')
print(train.groupby(['y']).size())
print('freq of y in sampled train:')
print(train_sampled.groupby(['y']).size())
print('freq of y in test:')
print(test.groupby(['y']).size())

freq of y in train:
y
0.0    3373
1.0    3140
dtype: int64
freq of y in sampled train:
y
0.0    1652
1.0    3140
dtype: int64
freq of y in test:
y
0.0    835
1.0    776
dtype: int64


## Method1：Adjustprobability

For each output $p$, the adjusted probability is
$$
p^* = \frac{p}{p+(1-p)*\beta},
$$
$$
\frac{p}{1-p}/\beta = \frac{p^*}{1-p^*}.
$$

### Intercept only model

Fit a treee with ony a root node to show the adustment for oversampling.

In [2]:
param = {'max_depth':1, 'learning_rate':1, 'objective':'binary:logistic','n_estimators':1}
xgb_model = xgb.XGBClassifier(eval_metric='logloss',**param)
xgb_model.fit(train_sampled.iloc[:,:-2],train_sampled.iloc[:,-2])

pred = lambda df: pd.Series(xgb_model.predict_proba(df)[:,1])
p_test = pred(test.iloc[:,:-1])
p_train = pred(train.iloc[:,:-2])
p_train_sampled = pred(train_sampled.iloc[:,:-2])

print(f'test: mean(p)={p_test.mean():.3f}, mean(y)={test.iloc[:,-2].mean():.3f}')
print(f'train: mean(p)={p_train.mean():.3f}, mean(y)={train.iloc[:,-2].mean():.3f}')
print(f'train_sample: mean(p)={p_train_sampled.mean():.3f}, mean(y)={train_sampled.iloc[:,-2].mean():.3f}')
print(f'train_sample: mean(adjusted p)={p_train_sampled.map(lambda x:x/(x+(1-x)*beta)).mean():.3f}')

test: mean(p)=0.532, mean(y)=0.386
train: mean(p)=0.539, mean(y)=0.482
train_sample: mean(p)=0.631, mean(y)=0.655
train_sample: mean(adjusted p)=0.527


### Three-tree model
The average of adjusted p is close to the average of y but not the same.

In [3]:
param = {'base_score':0.1,'max_depth':2, 'learning_rate':.3, 'objective':'binary:logistic','n_estimators':3}
xgb_model = xgb.XGBClassifier(eval_metric='logloss',**param)
xgb_model.fit(train_sampled.iloc[:,:-2],train_sampled.iloc[:,-2])

pred = lambda df: pd.Series(xgb_model.predict_proba(df)[:,1])
p_test = pred(test.iloc[:,:-1])
p_train = pred(train.iloc[:,:-2])
p_train_sampled = pred(train_sampled.iloc[:,:-2])

print(f'test: mean(p)={p_test.mean():.3f}, mean(y)={test.iloc[:,-2].mean():.3f}')
print(f'train: mean(p)={p_train.mean():.3f}, mean(y)={train.iloc[:,-2].mean():.3f}')
print(f'train_sample: mean(p)={p_train_sampled.mean():.3f}, mean(y)={train_sampled.iloc[:,-2].mean():.3f}')
print(f'train_sample: mean(adjusted p)={p_train_sampled.map(lambda x:x/(x+(1-x)*beta)).mean():.3f}')

test: mean(p)=0.448, mean(y)=0.386
train: mean(p)=0.446, mean(y)=0.482
train_sample: mean(p)=0.561, mean(y)=0.655
train_sample: mean(adjusted p)=0.455


## Method 2: adjust model base_score

### Fit and save model

In [4]:
param = {'base_score':0.1,'max_depth':2, 'learning_rate':.5, 'objective':'binary:logistic','n_estimators':3}
xgb_model = xgb.XGBClassifier(eval_metric='logloss',**param)
xgb_model.fit(train_sampled.iloc[:,:-2],train_sampled.iloc[:,-2])
xgb_model.save_model('xgb_model.json')

### Load and mondify model file

In [5]:
with open('xgb_model.json') as json_file:
    model_json = json.load(json_file)

learner_model_param = model_json['learner']['learner_model_param']
base_score = float(learner_model_param['base_score'])
learner_model_param['base_score'] = str(round(base_score/(base_score+(1-base_score)*beta),8))

with open("xgb_model1.json","w") as outfile:
    json.dump(model_json,outfile)

### Load modified model file

In [6]:
import math
from scipy.special import logit,expit
xgb_model1 = xgb.XGBClassifier()
xgb_model1.load_model("xgb_model1.json")

p1 = pd.Series(xgb_model1.predict_proba(train.iloc[:,:-2])[:,1])
p = pd.Series(xgb_model.predict_proba(train.iloc[:,:-2])[:,1])

print(f'Training data: aveg(adj p)={p1.mean():.4f},avg(p)={p.mean():.4f}')

Training data: aveg(adj p)=0.4668,avg(p)=0.5017


## Method 3: Calibration by Logistic Regresion
### Load xgb model and model json file

Trees are outputs by xgb_model as
```python
xgb_model0.get_booster().get_dump()
```
Leaf outputs are in the following lists from json
```python
model_json['learner']['gradient_booster']['model']['trees']['split_conditions']
```

In [7]:
xgb_model0 = xgb.XGBClassifier()
xgb_model0.load_model("xgb_model.json")

trees = xgb_model0.get_booster().get_dump()
for tree in trees:
    print(tree)
    
with open("xgb_model.json") as json_file:
    model_json = json.load(json_file)
model = model_json['learner']['gradient_booster']['model']

for tree in model['trees']:
    print(tree['split_conditions']) 

0:[f28<0.5] yes=1,no=2,missing=1
	1:[f55<0.5] yes=3,no=4,missing=3
		3:leaf=4.78408623
		4:leaf=0.201096877
	2:[f108<0.5] yes=5,no=6,missing=5
		5:leaf=-0.395252198
		6:leaf=4.13344908

0:[f59<0.5] yes=1,no=2,missing=1
	1:[f66<0.5] yes=3,no=4,missing=3
		3:leaf=0.131180301
		4:leaf=4.32168531
	2:leaf=-6.03941679

0:[f28<0.5] yes=1,no=2,missing=1
	1:[f22<0.5] yes=3,no=4,missing=3
		3:leaf=0.435074449
		4:leaf=-1.07391596
	2:[f38<0.5] yes=5,no=6,missing=5
		5:leaf=0.279891402
		6:leaf=-0.488417298

[0.5, 0.5, 0.5, 4.784086, 0.20109688, -0.3952522, 4.133449]
[0.5, 0.5, -6.039417, 0.1311803, 4.3216853]
[0.5, 0.5, 0.5, 0.43507445, -1.073916, 0.2798914, -0.4884173]


### Alignment

Score unbiased data  with the oversampled data trained model ```xgb_model0 ``` to have $p_{train}$, and
$$
\text{LO} = \ln \frac{p}{1-p}.
$$

Fit logistic regression model unbiased data as
$$
\text{Prob}[Y=1] = \frac{\exp(a\text{LO}+b)}{1+\exp{(a\text{LO}+b)}}
$$

In [8]:
from sklearn.linear_model import LogisticRegression
from scipy.special import logit,expit

# prepare log-odds as feature and tru y as label.
p_train = xgb_model0.predict_proba(train.iloc[:,:-2])[:,1]
lr_df = pd.DataFrame({'LO':pd.Series(p_train).map(logit)})
lr_df['y'] = train.iloc[:,-2]

#fit logistic regresion to find y = expit(a*LO+b)
lr = LogisticRegression()
lr.fit(lr_df[['LO']],lr_df['y'])

#check avg(p)=avg(y)
p_train_aligned = lr.predict_proba(lr_df[['LO']])[:,1]
print('Training data:')
print(f'avg(p)={p_train.mean():.4f}, avg(aligned p)={p_train_aligned.mean():.4f}, avg(y)={lr_df["y"].mean():.4f}')

Training data:
avg(p)=0.5017, avg(aligned p)=0.4821, avg(y)=0.4821


### Modify model json file

Modify
```python
base_score = model_json['learner']['learner_model_param']['base_score']
```
to
```python
expit(a*logit(base_score)+b).
```

For each ```tree``` in
```python
model_json['learner']['gradient_booster']['model']['trees']
```
modify
```python
tree['split_conditions']
```
to scale leaf outputs by $a$.

In [9]:
a,b = lr.coef_[0][0], lr.intercept_[0]

with open('xgb_model.json') as json_file:
    model_json = json.load(json_file)
    
learner_model_param = model_json['learner']['learner_model_param']
base_score = float(learner_model_param['base_score'])
base_score_adj = expit(logit(base_score)*a+b)
learner_model_param['base_score'] =  str(round(base_score_adj,8))

model = model_json['learner']['gradient_booster']['model']
for tree in model['trees']:
    sc = tree['split_conditions']
    for i in range(len(sc)//2,len(sc)):
        sc[i]*=a
    tree['split_conditions'] = sc
    
with open("xgb_model2.json","w") as outfile:
    json.dump(model_json,outfile)
    
xgb_model2 = xgb.XGBClassifier()
xgb_model2.load_model("xgb_model2.json")

p2 = xgb_model2.predict_proba(train.iloc[:,:-2])[:,1]
print(f'Training data: avg(adjusted p)={p2.mean():.4f}')

Training data: avg(adjusted p)=0.4821


## Python class for oversampling adjustment

In [10]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

class OverSamplingAdjust:
    def __init__(self,model_json=None,beta=1):
        self.model_json= model_json
        self.xgb_model = xgb.XGBClassifier()
        self.xgb_model.load_model(self.model_json)
        self.beta=beta
        self.a=1.0
        self.b=0.0

    def p_adjust(self,p):
        return expit(self.a * logit(p) + self.b)
    
    def _adjust(self,proba):
        p_adj=pd.Series(proba[:,1]).apply(lambda x: x/(x+(1-x)*beta))
        return pd.DataFrame({0:1-p_adj,1:p_adj}).values
    
    def predict(self,X):
        proba=self.xgb_model.predict_proba(X)
        if self.beta != 1:
            proba=self._adjust(proba)
        return proba

    
    def _align(self,X,y):
    
        #prepare feature and label
        self.xgb_model.load_model(self.model_json)
        p = pd.Series(self.xgb_model.predict_proba(X)[:,1])
        LO=pd.DataFrame({'LO':p.apply(logit)})

        # fit logistic regression
        lr=LogisticRegression()
        lr.fit(LO,y)
        self.a,self.b=lr.coef_[0][0],lr.intercept_[0]
        
    def _update_json(self,model_json_updated='xgb_model_adjusted.json'):
        
        #load original model json file
        with open(self.model_json) as json_file:
            model_json = json.load(json_file)
        
        #update base_score
        param = model_json['learner']['learner_model_param']
        base_score = float(param['base_score'])
        base_score_adj = self.p_adjust(base_score)
        param['base_score'] = str(round(base_score_adj,8))
        
        #update leaf outputs
        model = model_json['learner']['gradient_booster']['model']
        for tree in model['trees']:
            sc=tree['split_conditions']
            for i in range(len(sc)//2,len(sc)):
                sc[i]*=self.a
            tree['split_conditions']=sc

        #save updated json to file
        with open(model_json_updated, "w") as outfile:
            json.dump(model_json, outfile)
    

    def fit(self,X,y):
        
        assert self.model_json is not None, 'model_json file is None'        
        self._align(X,y)
        self._update_json()
        
        # update xgb_model by re-loading updated model_json
        self.xgb_model = xgb.XGBClassifier()   #MUST initialize xgb again
        self.xgb_model.load_model("xgb_model_adjusted.json")
        
        
X=train.iloc[:,:-2]
y=train.iloc[:,-2]
osa=OverSamplingAdjust("xgb_model.json")    
print(f'avg(p)    ={osa.predict(X)[:,1].mean():.4f}')
osa.fit(X,y)
print(f'avg(p_adj)={osa.predict(X)[:,1].mean():.4f}')

avg(p)    =0.5017
avg(p_adj)=0.4821
