### import required packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import xgboost   
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve

### read data

In [3]:
# Load the k-mer feature matrix (X) and the class labels (y). Both CSVs are read
# with header=None, so the first row is treated as data, not column names.
X, y = (pd.read_csv(csv_name, header=None) for csv_name in ("Kmer.csv", "label.csv"))

In [4]:
# Put features and labels side by side so that a single shuffle keeps every
# row paired with its own label, then convert to a plain NumPy array.
data = np.array(pd.concat((X, y), axis=1))

# Fixed seed -> the same row permutation on every run.
np.random.seed(612)
np.random.shuffle(data)  # in place; permutes rows only

# Everything but the last column is features; the last column is the label
# (assumes label.csv holds exactly one column -- TODO confirm).
data_X, data_y = data[:, :-1], data[:, -1]

In [6]:
# Standardize each feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on ALL rows before cross-validation, so the
# held-out rows contribute to the mean/std used for scaling. For a leakage-free
# estimate the scaler would have to be fitted inside each CV fold -- confirm
# whether this matters for the reported scores.
transfer = StandardScaler()
Standard_X = transfer.fit(data_X).transform(data_X)
Standard_Y = data_y  # labels are passed through unchanged

### train model

In [8]:
# Leave-one-out splitter shared by every cross_val_score / GridSearchCV call and
# by the manual evaluation loop below: each fold holds out a single sample.
cv=LeaveOneOut()

In [10]:
# Baseline: XGBoost with default hyper-parameters, scored by leave-one-out
# accuracy (one 0/1 score per held-out sample, averaged below).
estimator = XGBClassifier()
acc_val = cross_val_score(
    estimator,
    Standard_X,
    Standard_Y,
    scoring="accuracy",
    cv=cv,
)
print(f"the mean score of cross validation:{acc_val.mean():.3f}")

the mean score of cross validation:0.786


## adjust parameters

### 1. The number of trees (T)

In [None]:
# Step 1: search the number of boosting rounds, 50..490 in steps of 10, with
# every other hyper-parameter left at its default.
# This is expensive: each candidate is fitted once per leave-one-out fold.
param1 = np.arange(50, 500, 10)
param_grid = {"n_estimators": param1}
grid_search = GridSearchCV(
    XGBClassifier(),
    param_grid,
    scoring="accuracy",
    cv=cv,
)
grid_search.fit(Standard_X, Standard_Y)
print(f"best parameters:{grid_search.best_params_}")
print(f"best cross-validation score:{grid_search.best_score_:.3f}")
print(f"best estimator:\n{grid_search.best_estimator_}")

### 2. Learning rate (R)

In [14]:
# Step 2: with the tree count fixed at 60, search the learning rate over
# 0.10, 0.12, ..., 0.50.
param2 = np.arange(10, 52, 2) / 100
param_grid = {"learning_rate": param2}
grid_search = GridSearchCV(
    XGBClassifier(n_estimators=60),
    param_grid,
    scoring="accuracy",
    cv=cv,
)
grid_search.fit(Standard_X, Standard_Y)
print(f"best parameters:{grid_search.best_params_}")
print(f"best cross-validation score:{grid_search.best_score_:.3f}")
print(f"best estimator:\n{grid_search.best_estimator_}")

best parameters:{'learning_rate': 0.18}
best cross-validation score:0.827
best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.18, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=60, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


### 3. The maximum depth of trees (D)

In [15]:
# Step 3: with tree count (60) and learning rate (0.18) fixed, search the
# maximum tree depth over 3..9.
param3 = np.arange(3, 10)
param_grid = {"max_depth": param3}
grid_search = GridSearchCV(
    XGBClassifier(n_estimators=60, learning_rate=0.18),
    param_grid,
    scoring="accuracy",
    cv=cv,
)
grid_search.fit(Standard_X, Standard_Y)
print(f"best parameters:{grid_search.best_params_}")
print(f"best cross-validation score:{grid_search.best_score_:.3f}")
print(f"best estimator:\n{grid_search.best_estimator_}")

best parameters:{'max_depth': 6}
best cross-validation score:0.827
best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.18, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=60, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


### 4. Regularization parameter (λ)

In [17]:
# Step 4: with tree count (60), learning rate (0.18) and depth (6) fixed,
# search the L2 regularization weight reg_lambda over 0.0, 0.1, ..., 2.0.
param4 = np.arange(0, 21) / 10
param_grid = {"reg_lambda": param4}
grid_search = GridSearchCV(
    XGBClassifier(n_estimators=60, learning_rate=0.18, max_depth=6),
    param_grid,
    scoring="accuracy",
    cv=cv,
)
grid_search.fit(Standard_X, Standard_Y)
print(f"best parameters:{grid_search.best_params_}")
print(f"best cross-validation score:{grid_search.best_score_:.3f}")
print(f"best estimator:\n{grid_search.best_estimator_}")

best parameters:{'reg_lambda': 1.0}
best cross-validation score:0.827
best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.18, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=60, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1.0, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


### Best parameter values: T=60, R=0.18, D=6, λ=1.0

## Evaluation Model

In [20]:
# Leave-one-out evaluation of the tuned model: every sample is predicted by a
# model trained on all remaining samples, and the pooled predictions are scored.
Predict_Y=[]          # hard class prediction for each held-out sample
Prob_Y=[]             # predict_proba row for each held-out sample
Really_Y=Standard_Y   # ground-truth labels, row-aligned with Standard_X
for train_index,test_index in cv.split(Standard_X):
    estimator=XGBClassifier(n_estimators=60,learning_rate=0.18,max_depth=6,reg_lambda=1.0)
    estimator.fit(Standard_X[train_index],Standard_Y[train_index])
    held_out=Standard_X[test_index]
    # cv is LeaveOneOut, so each fold contains exactly one test row; [0] unwraps it.
    Predict_Y.append(estimator.predict(held_out)[0])
    Prob_Y.append(estimator.predict_proba(held_out)[0])

# Stack the per-fold rows into a two-column table. The row count is inferred
# (-1) rather than hard-coded, so the cell keeps working if the dataset grows
# or shrinks. Two columns assumes a binary problem.
Prob_Y=pd.DataFrame(np.array(Prob_Y).reshape(-1,2),columns=["0","1"])

ACC=accuracy_score(Really_Y,Predict_Y)
Recall=recall_score(Really_Y,Predict_Y)
Precision=precision_score(Really_Y,Predict_Y)
F1_score=f1_score(Really_Y,Predict_Y)
# AUROC is computed from the second predict_proba column
# (presumably the probability of label 1 -- verify against estimator.classes_).
AUC=roc_auc_score(Really_Y,np.array(Prob_Y["1"]))
print("ACC:",ACC)
print("Precision:",Precision)
print("Recall:",Recall)
print("f1_score:",F1_score)
print("AUROC:",AUC)

ACC: 0.8265895953757225
Precision: 0.8089887640449438
Recall: 0.8470588235294118
f1_score: 0.8275862068965517
AUROC: 0.8643048128342247


## save model

In [21]:
import joblib

In [22]:
# Fit the final model with the tuned hyper-parameters on the full standardized
# dataset and persist it. Leave-one-out accuracy for this configuration is
# reported in the "Evaluation Model" section, so it is not recomputed here
# (doing so would refit the model once per sample only to discard the result).
# NOTE(review): the model is trained on Standard_X, i.e. features scaled by
# `transfer`; only the classifier is written to disk, so anyone loading the
# pickle must apply the same scaling themselves -- confirm this is intended.
estimator=XGBClassifier(n_estimators=60,learning_rate=0.18,max_depth=6,reg_lambda=1.0)
estimator.fit(Standard_X,Standard_Y)
joblib.dump(estimator,"XGBoost+Kmer.pkl")

['XGBoost+Kmer.pkl']