In [None]:
import random
import xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import log_loss, accuracy_score, roc_curve, auc

In [None]:
# NOTE(review): this cell sits above the cell that defines train_and_test, so on a
# fresh kernel (Restart & Run All) the call below raises NameError. Run the
# definition cell first, or move this cell below it.
train_and_test()

In [None]:
def _plot_roc_curve(y_true, y_score):
    """Plot and show the ROC curve (AUC in the title) for one train/test run."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    font = {'family': 'Times New Roman', 'weight': 'normal', 'size': 15}

    # One figure per run; it is closed after showing so that repeated runs do
    # not accumulate open figures.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(fpr, tpr, color='darkorange', lw=2)
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontdict=font)
    ax.set_ylabel('True Positive Rate', fontdict=font)
    ax.set_title('ROC curve (AUC = %0.2f)' % roc_auc, fontdict=font)
    plt.show()
    plt.close(fig)


def _confusion_counts(y_true, y_label):
    """Return a 2x2 count matrix: rows are true class 0/1, columns are predicted class 0/1."""
    return np.array(
        [[np.sum((y_true == i) & (y_label == j)) for j in range(2)] for i in range(2)]
    )


def train_and_test(runs=1000, data_path="G:/LungCancerPredict/detected/feature_table.csv"):
    """Repeatedly train and evaluate an XGBoost model on the feature table.

    Each run oversamples the positive class, draws a fresh random 90/10
    train/test split, fits an ``XGBRegressor`` with early stopping, and shows
    that run's ROC curve. After all runs, the mean feature importances are
    plotted and the average log loss, accuracies and confusion matrices are
    printed.

    Args:
        runs: Number of independent train/test repetitions (default 1000).
        data_path: CSV file with a ``cancer_label`` column (rows labelled 0 or
            1 are used), a ``patient_id`` column, and the feature columns.

    Returns:
        None. Results are printed and plotted.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError("runs must be a positive integer")

    scores, accuracy_train_list, accuracy_test_list, importance_list = [], [], [], []
    confusion_matrix_train = np.zeros((2, 2))
    confusion_matrix_test = np.zeros((2, 2))

    # The table and the per-class subsets do not change between runs, so they
    # are loaded once instead of re-reading the CSV on every iteration.
    feature_table = pd.read_csv(data_path)
    positives = feature_table[feature_table["cancer_label"] == 1]
    negatives = feature_table[feature_table["cancer_label"] == 0]

    for _ in range(runs):

        # Oversample the positive class with two extra draws (379 + 378 rows,
        # sampled without replacement, so at least 379 positive rows are
        # required). Presumably the counts were chosen to balance the classes
        # for this specific dataset -- TODO confirm.
        # NOTE(review): duplicating positives *before* the split means copies
        # of the same row can land in both train and test, which inflates the
        # test metrics. Consider splitting first and oversampling only the
        # training part.
        resampled = pd.concat(
            [positives, positives.sample(379), positives.sample(378), negatives]
        )

        y = resampled["cancer_label"].values
        x = resampled.drop(columns=["cancer_label", "patient_id"]).values
        # Only the `model_selection` module is imported by this notebook, so
        # the split helper has to be referenced through it.
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            x, y, test_size=0.1
        )

        model = xgboost.XGBRegressor(
            max_depth=10, n_estimators=80, learning_rate=0.05,
            min_child_weight=60, nthread=8, subsample=0.95,
            colsample_bytree=0.95, seed=random.randint(0, 500)
        )
        # NOTE(review): early stopping monitors the last eval_set entry, i.e.
        # the test split, so the reported test scores are optimistic. Newer
        # xgboost releases also expect eval_metric / early_stopping_rounds on
        # the constructor rather than fit() -- confirm against the installed
        # version.
        model.fit(
            x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
            eval_metric="logloss", early_stopping_rounds=5, verbose=False
        )

        # The regressor's raw outputs are not guaranteed to lie in [0, 1].
        # Clipping keeps log_loss finite and makes np.round always yield a
        # class label of 0 or 1, for the train and the test split alike.
        train_preds = np.clip(model.predict(x_train), 0.001, 0.999)
        test_preds = np.clip(model.predict(x_test), 0.001, 0.999)
        train_labels = np.round(train_preds)
        test_labels = np.round(test_preds)

        scores.append(log_loss(y_test, test_preds))
        print("mean_score:", np.mean(scores), "best:", model.best_score, "iter:", model.best_iteration)

        _plot_roc_curve(y_test, test_preds)

        accuracy_train_list.append(accuracy_score(y_train, train_labels))
        accuracy_test_list.append(accuracy_score(y_test, test_labels))

        confusion_matrix_train += _confusion_counts(y_train, train_labels)
        confusion_matrix_test += _confusion_counts(y_test, test_labels)

        importance_list.append(model.feature_importances_)

    # Mean importance of each feature column across all runs.
    plt.bar(range(len(importance_list[0])), np.mean(importance_list, axis=0))
    plt.show()

    print(f"Average log loss: {np.mean(scores):.4f}")
    print(f"Train Accuracy: {np.mean(accuracy_train_list) * 100:.2f}%")
    print(f"Confusion Matrix (Train): \n{confusion_matrix_train / runs}")
    print(f"Test Accuracy: {np.mean(accuracy_test_list) * 100:.2f}%")
    print(f"Confusion Matrix (Test): \n{confusion_matrix_test / runs}")
