In [1]:
import os
import h5py
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score


In [2]:

def get_DMatrix_concat_useevo(VCF_FILE, CHROMATIN_df, CHROMATIN_diff_df, EVO_df):
    """Build an XGBoost ``DMatrix`` from chromatin, chromatin-diff and evo features.

    For each of the three input tables, the columns from position 5 onward
    are treated as features (the first five columns are skipped).  A binary
    ``label`` is derived from the ``name`` column: rows whose name starts
    with ``'1000G'`` get label 0, every other row gets label 1.

    The DMatrix is built from the *unstandardized* chromatin features
    followed by the unstandardized diff features and the evo features.
    Standardized copies are computed only for the diagnostic printout and
    for the ``std_X`` return value.
    NOTE(review): feeding raw features looks intentional, since the model
    weights are loaded from a ``..._feature_nostd`` directory -- confirm.

    Parameters
    ----------
    VCF_FILE : str
        Unused; kept so existing callers do not break.
    CHROMATIN_df : pandas.DataFrame
        Chromatin feature table; must contain a ``name`` column.
    CHROMATIN_diff_df : pandas.DataFrame
        Chromatin "diff" feature table with the same layout and a ``name``
        column.
    EVO_df : pandas.DataFrame
        Evo feature table; columns from position 5 onward are appended as
        extra features.  Rows are assumed to be in the same order as the
        chromatin tables (not verified beyond the row count).

    Returns
    -------
    tuple
        ``(DM_Xy, std_X, X, y, sub_df_ori, sub_diff_df_ori)`` where
        ``DM_Xy`` is the labelled DMatrix, ``std_X`` the standardized
        chromatin features, ``X`` the raw chromatin features, ``y`` the
        labels, and the last two items are copies of the two chromatin
        tables with the ``label`` column added.

    Raises
    ------
    ValueError
        If the three tables do not have the same number of rows.
    """
    # Local import kept from the original cell so the function stays
    # self-contained with respect to the notebook's import cell.
    from sklearn.preprocessing import StandardScaler

    # Work on copies: the 'label' column added below must not leak into the
    # caller's DataFrames.
    sub_df_ori = CHROMATIN_df.copy()
    sub_diff_df_ori = CHROMATIN_diff_df.copy()
    sub_evo_df_ori = EVO_df

    print('Chromatin feature df shape: ', sub_df_ori.shape)
    print('Chromatin feature df shape: ', sub_diff_df_ori.shape)
    print('Chromatin evo feature df shape: ', sub_evo_df_ori.shape)

    # The tables are concatenated column-wise below, so they must line up
    # row for row; fail early with a readable message.
    n_rows = len(sub_df_ori)
    if len(sub_diff_df_ori) != n_rows or len(sub_evo_df_ori) != n_rows:
        raise ValueError(
            'Row count mismatch: chromatin={}, chromatin diff={}, evo={}'.format(
                n_rows, len(sub_diff_df_ori), len(sub_evo_df_ori)))

    # Names starting with '1000G' are labelled 0, everything else 1.
    sub_df_ori['label'] = np.where(
        sub_df_ori['name'].str.startswith('1000G'), 0, 1)
    sub_diff_df_ori['label'] = np.where(
        sub_diff_df_ori['name'].str.startswith('1000G'), 0, 1)

    # Warn (do not abort) when either chromatin table contains missing values.
    if sub_df_ori.isnull().values.any():
        print('Have NA!')
    if sub_diff_df_ori.isnull().values.any():
        print('Have NA!')

    # Skip the first five columns; the remainder is features plus 'label'.
    sub_df = sub_df_ori.iloc[:, 5:]
    sub_diff_df = sub_diff_df_ori.iloc[:, 5:]
    y = np.asarray(sub_df['label'])
    X = np.asarray(sub_df.drop(columns=['label']))
    X_diff = np.asarray(sub_diff_df.drop(columns=['label']))
    print("X.shape: {}, X_diff.shape: {}, y.shape: {}".format(X.shape, X_diff.shape, y.shape))

    # Standardized copies: used for the printout and the std_X return value
    # only, not for the DMatrix.
    std_X = StandardScaler().fit_transform(X)
    std_X_diff = StandardScaler().fit_transform(X_diff)
    print('after standard mean and std is ', np.mean(std_X), np.std(std_X))
    print('after standard mean and std is ', np.mean(std_X_diff), np.std(std_X_diff))

    sub_evo_df = sub_evo_df_ori.iloc[:, 5:]

    # Column order: chromatin features, diff features, evo features.
    features = np.hstack((X, X_diff, np.asarray(sub_evo_df)))
    print(features.shape)
    DM_Xy = xgb.DMatrix(features, y)

    return DM_Xy, std_X, X, y, sub_df_ori, sub_diff_df_ori

## 354 ClinVar variants — C2P

In [70]:
# Input / output locations for the 354-variant ClinVar evaluation.
# All ClinVar paths share one root, so they are derived from it instead of
# repeating the full literal.
_CLINVAR_ROOT = ('/alldata/LChuang_data/myP/GeneBert/BGI-Gene_new/examples/CADD/'
                 'baiducloud_clinvar/clinvar_20201003/Clinvar_nc_snv_pathogenic/')

# Directory that receives the prediction files (also holds the input VCF).
outputpath = _CLINVAR_ROOT + 'GBERT-C2P/'

# Input VCF; its basename is reused to locate the chromatin feature files.
VCF_PATH = outputpath + 'Clinvar_nc_snv_pathogenic_354.vcf'

# Evo feature table, stored in a directory named after the VCF.
EVO_df_path = VCF_PATH + '_getevo_output/infile.vcf.evoall.wholerow'

# Directory containing the chromatin feature CSV files.
CHROM_PATH = _CLINVAR_ROOT + 'GBERT-funsig/'

# Directory containing the saved XGBoost model files.
xgboost_path = ('/alldata/LChuang_data/myP/GeneBert/BGI-Gene_new/examples/'
                'HGMD_logistic/XGboost_tri_addevo_feature_nostd/')

In [82]:

# Evaluate the pre-trained XGBoost models on the ClinVar set, once per
# chromatin mark set, and write one prediction file per (shuffle, mark set).

mark_type_list = [919, 2002, 3540]  # chromatin mark-set sizes to evaluate
nth = 32                            # nthread value passed to the xgboost Booster

# Only the shuffle-8 model (the best weight) is evaluated;
# use range(10) to evaluate every saved shuffle.
shuffle_ids = [8]

# The evo feature table does not depend on the mark type: read it once
# instead of once per loop iteration.
EVO_df = pd.read_csv(EVO_df_path)

for mark_type in mark_type_list:

    print('############ Part1: Getting chromatin feature from {} file ############'.format(mark_type))

    # logfoldchange features
    CHROMATIN_FILE = CHROM_PATH + os.path.basename(VCF_PATH) + \
                     '_128bs_5gram_{}feature.out.logfoldchange.csv'.format(mark_type)
    CHROMATIN_df = pd.read_csv(CHROMATIN_FILE)

    # diff features
    CHROMATIN_FILE_diff = CHROM_PATH + os.path.basename(VCF_PATH) + \
                          '_128bs_5gram_{}feature.out.diff.csv'.format(mark_type)
    CHROMATIN_diff_df = pd.read_csv(CHROMATIN_FILE_diff)

    # Per-mark-set metrics, averaged after the shuffle loop.
    acc_tem_list, AUROC_tem_list = [], []

    for ii in shuffle_ids:

        print('############ Part1: Getting DMatrix with {} marks ############'.format(mark_type))
        DM_Xy_test, std_X_test, X_test, y_test, test_df_ori, test_diff_df_ori \
            = get_DMatrix_concat_useevo(VCF_PATH, CHROMATIN_df, CHROMATIN_diff_df, EVO_df)

        print('############ Part2: Getting Xgboost weight with shuffle{} in {} marks ############'.format(ii, mark_type))
        # Load the saved model for this shuffle / mark set.
        bst_new = xgb.Booster({'nthread': nth})
        weight_path = xgboost_path + "1000G_HGMD_posstrand_8softwares_5_test_shuffle{}_XGboost_{}mark_Trible.model".format(ii, mark_type)
        bst_new.load_model(weight_path)

        # Predicting
        print('############ Part3: Evaluating with {} marks ############'.format(mark_type))
        # Predict once; hard labels are the raw scores thresholded at 0.5
        # (strictly greater than 0.5 -> 1, otherwise 0).
        pred_test_raw = bst_new.predict(DM_Xy_test)
        pred_test_acc = (pred_test_raw > 0.5).astype(pred_test_raw.dtype)

        y_true = DM_Xy_test.get_label()
        acc_tem = accuracy_score(y_true, pred_test_acc)
        AUROC_tem = roc_auc_score(y_true, pred_test_raw)
        print('acc:', acc_tem)
        print('AUROC:', AUROC_tem)
        acc_tem_list.append(acc_tem)
        AUROC_tem_list.append(AUROC_tem)

        # Saving
        print('############ Part5: Saving Xgboost output in {} marks ############'.format(mark_type))
        # Copy the slice so adding 'value' neither triggers a
        # SettingWithCopyWarning nor touches the source frame.
        test_df = test_df_ori.iloc[:, :5].copy()
        test_df['value'] = pred_test_raw
        basename = os.path.basename(VCF_PATH)
        out = outputpath + basename.replace('.vcf','_XGboost_shuffle{}_{}mark_Trible.predict'.format(ii, mark_type))
        # Tab-separated, no index column and no header row.
        test_df.to_csv(out, sep='\t', index=False, header=False)
        print('Saving prediction file: \n', out)

    print("mean_acc={}, mean_AUROC={} ".format(np.mean(acc_tem_list), np.mean(AUROC_tem_list)))


############ Part1: Getting chromatin feature from 919 file ############
############ Part1: Getting DMatrix with 919 marks ############
Chromatin feature df shape:  (354, 924)
Chromatin feature df shape:  (354, 924)
Chromatin evo feature df shape:  (354, 9)
X.shape: (354, 919), X_diff.shape: (354, 919), y.shape: (354,)
after standard mean and std is  6.989102483147122e-19 0.9999999999999999
after standard mean and std is  8.736378103933903e-19 1.0
(354, 1842)
############ Part2: Getting Xgboost weight with shuffle8 in 919 marks ############
############ Part3: Evaluating with 919 marks ############
acc: 0.8870056497175142
AUROC: 0.9324268249864344
############ Part5: Saving Xgboost output in 919 marks ############
Saving prediction file: 
 /alldata/LChuang_data/myP/GeneBert/BGI-Gene_new/examples/CADD/baiducloud_clinvar/clinvar_20201003/Clinvar_nc_snv_pathogenic/GBERT-C2P/Clinvar_nc_snv_pathogenic_354_XGboost_shuffle8_919mark_Trible.predict
mean_acc=0.8870056497175142, mean_AUROC=0.932