In [21]:
import math
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn import preprocessing

import numpy as np
import pandas as pd

import time
import random
import matplotlib.pyplot as plt
from scipy import interp
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from collections import Counter
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

In [37]:
def load_data(directory):
    GSSM = np.loadtxt(directory + '\GSSM_.txt',dtype=np.float32)
    PESSM = np.loadtxt(directory + '\PSSM.txt',dtype=np.float32,delimiter='\t')

    IPE = pd.DataFrame(PESSM).reset_index()
    IG = pd.DataFrame(GSSM).reset_index()
    IPE.rename(columns = {'index':'id'}, inplace = True)
    IG.rename(columns = {'index':'id'}, inplace = True)
    IPE['id'] = IPE['id']
    IG['id'] = IG['id']
    
    return IPE, IG

def sample(directory, random_seed):
    all_associations = pd.read_csv(directory + '/all_gpe_pairs.csv')
    known_associations = all_associations.loc[all_associations['label'] == 1]
    unknown_associations = all_associations.loc[all_associations['label'] == 0]
    random_negative = unknown_associations.sample(n=known_associations.shape[0], random_state=random_seed, axis=0)

    sample_df = known_associations.append(random_negative)
    sample_df.reset_index(drop=True, inplace=True)
    return sample_df


In [24]:
def performances(y_true, y_pred, y_prob):

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels = [0, 1]).ravel().tolist()

    pos_acc = tp / sum(y_true)
    neg_acc = tn / (len(y_pred) - sum(y_pred)) # [y_true=0 & y_pred=0] / y_pred=0
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    
    recall = tp / (tp+fn)
    precision = tp / (tp+fp)
    f1 = 2*precision*recall / (precision+recall)
    
    roc_auc = roc_auc_score(y_true, y_prob)
    prec, reca, _ = precision_recall_curve(y_true, y_prob)
    aupr = auc(reca, prec)
    
    print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
    print('y_pred: 0 = {} | 1 = {}'.format(Counter(y_pred)[0], Counter(y_pred)[1]))
    print('y_true: 0 = {} | 1 = {}'.format(Counter(y_true)[0], Counter(y_true)[1]))
    print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}|pos_acc={:.4f}|neg_acc={:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc))
    return (y_true, y_pred, y_prob), (accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc)

In [25]:
def obtain_data(directory, isbalance):

    IPE, IG = load_data(directory)

    if isbalance:
        dtp = sample(directory, random_seed = 1234)
    else:
        dtp = pd.read_csv(directory + '/all_gene_peco_pairs.csv')

    gene_ids = list(set(dtp['gene_idx']))
    peco_ids = list(set(dtp['peco_idx']))
    random.shuffle(gene_ids)
    random.shuffle(peco_ids)
    print('# gene = {} | peco = {}'.format(len(gene_ids), len(peco_ids)))

    gene_test_num = int(len(gene_ids) / 5)
    peco_test_num = int(len(peco_ids) / 5)
    print('# Test: gene = {} | peco = {}'.format(gene_test_num, peco_test_num))    
    
    samples = pd.merge(pd.merge(dtp, IPE, left_on = 'peco_idx', right_on = 'id'), IG, left_on = 'gene_idx', right_on = 'id')
    samples.drop(labels = ['id_x', 'id_y'], axis = 1, inplace = True)
    
    return IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples

In [26]:
def generate_task_Tp_train_test_idx(samples):
    kf = KFold(n_splits = 5, shuffle = True, random_state = 1234)

    train_index_all, test_index_all, n = [], [], 0
    train_id_all, test_id_all = [], []
    fold = 0
    for train_idx, test_idx in tqdm(kf.split(samples.iloc[:, 3:])):
        print('-------Fold ', fold)
        train_index_all.append(train_idx) 
        test_index_all.append(test_idx)

        train_id_all.append(np.array(dtp.iloc[train_idx][['gene_idx', 'peco_idx']]))
        test_id_all.append(np.array(dtp.iloc[test_idx][['gene_idx', 'peco_idx']]))

        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        fold += 1
    return train_index_all, test_index_all, train_id_all, test_id_all

In [27]:
def generate_task_Tg_Tpe_train_test_idx(item, ids, dtp):
    
    test_num = int(len(ids) / 5)
    
    train_index_all, test_index_all = [], []
    train_id_all, test_id_all = [], []
    
    for fold in range(5):
        print('-------Fold ', fold)
        if fold != 4:
            test_ids = ids[fold * test_num : (fold + 1) * test_num]
        else:
            test_ids = ids[fold * test_num :]

        train_ids = list(set(ids) ^ set(test_ids))
        print('# {}: Train = {} | Test = {}'.format(item, len(train_ids), len(test_ids)))

        test_idx = dtp[dtp[item].isin(test_ids)].index.tolist()
        train_idx = dtp[dtp[item].isin(train_ids)].index.tolist()
        random.shuffle(test_idx)
        random.shuffle(train_idx)
        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        assert len(train_idx) + len(test_idx) == len(dtp)

        train_index_all.append(train_idx) 
        test_index_all.append(test_idx)
        
        train_id_all.append(train_ids)
        test_id_all.append(test_ids)
        
    return train_index_all, test_index_all, train_id_all, test_id_all

In [35]:
def run_clf(train_index_all, test_index_all, samples, classfier):
    
    fold = 0
    for train_idx, test_idx in zip(train_index_all, test_index_all):
        print('-----------------------Fold = ', str(fold))

        X = samples.iloc[:, 3:]
        y = samples['label']

        scaler = preprocessing.MinMaxScaler().fit(X.iloc[train_idx,:])
        X = scaler.transform(X)

        x_train, y_train = X[train_idx], y[train_idx]
        x_test, y_test = X[test_idx], y[test_idx]

        if classfier == 'ERT':
            clf = ExtraTreesClassifier(random_state = 19961231)
        elif classfier == 'GNB':
            clf = GaussianNB()
        elif classfier == 'DT':
            clf = DecisionTreeClassifier(random_state = 19961231)
        elif classfier == 'SGD':
            clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=5)
        elif classfier == 'SVM':
            clf = svm.SVC(C=0.1, kernel='sigmoid', degree=3, gamma='auto', probability=True)
        elif classfier == 'LR':
            clf = LogisticRegression(max_iter=500)
        elif classfier == 'MLP':
            clf = MLPClassifier() 
        elif classfier == 'GBDT':
            clf = GradientBoostingClassifier(max_depth=5,learning_rate=0.1)   
            
        clf.fit(x_train, y_train)

        y_train_prob = clf.predict_proba(x_train)
        y_test_prob = clf.predict_proba(x_test)

        y_train_pred = clf.predict(x_train)
        y_test_pred = clf.predict(x_test)

        print('Train:')
        ys_train, metrics_train = performances(y_train, y_train_pred, y_train_prob[:, 1])
        print('Test:')
        ys_test, metrics_test = performances(y_test, y_test_pred, y_test_prob[:, 1])

        fold += 1
    
    return ys_train, metrics_train, ys_test, metrics_test

In [11]:
directory = '..\..\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp','Tg','Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'ERT')    
        precision, recall, threshold = precision_recall_curve(ys_test[0], ys_test[2], pos_label=1)
      
        


5it [00:00, 278.39it/s]

# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# Pairs: Train = 37692 | Test = 9424
-------Fold  1
# Pairs: Train = 37693 | Test = 9423
-------Fold  2
# Pairs: Train = 37693 | Test = 9423
-------Fold  3
# Pairs: Train = 37693 | Test = 9423
-------Fold  4
# Pairs: Train = 37693 | Test = 9423
-----------------------Fold =  0





Train:
tn = 18890, fp = 0, fn = 0, tp = 18802
y_pred: 0 = 18890 | 1 = 18802
y_true: 0 = 18890 | 1 = 18802
acc=1.0000|precision=1.0000|recall=1.0000|f1=1.0000|auc=1.0000|aupr=1.0000|pos_acc=1.0000|neg_acc=1.0000
Test:
tn = 3965, fp = 703, fn = 641, tp = 4115
y_pred: 0 = 4606 | 1 = 4818
y_true: 0 = 4668 | 1 = 4756
acc=0.8574|precision=0.8541|recall=0.8652|f1=0.8596|auc=0.9414|aupr=0.9449|pos_acc=0.8652|neg_acc=0.8608
-----------------------Fold =  1
Train:
tn = 18821, fp = 0, fn = 0, tp = 18872
y_pred: 0 = 18821 | 1 = 18872
y_true: 0 = 18821 | 1 = 18872
acc=1.0000|precision=1.0000|recall=1.0000|f1=1.0000|auc=1.0000|aupr=1.0000|pos_acc=1.0000|neg_acc=1.0000
Test:
tn = 4033, fp = 704, fn = 596, tp = 4090
y_pred: 0 = 4629 | 1 = 4794
y_true: 0 = 4737 | 1 = 4686
acc=0.8620|precision=0.8531|recall=0.8728|f1=0.8629|auc=0.9422|aupr=0.9436|pos_acc=0.8728|neg_acc=0.8712
-----------------------Fold =  2
Train:
tn = 18881, fp = 0, fn = 0, tp = 18812
y_pred: 0 = 18881 | 1 = 18812
y_true: 0 = 18881 | 

In [12]:
directory = '..\..\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp','Tg','Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'GNB')
        precision, recall, threshold = precision_recall_curve(ys_test[0], ys_test[2], pos_label=1)

5it [00:00, 277.14it/s]

# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# Pairs: Train = 37692 | Test = 9424
-------Fold  1
# Pairs: Train = 37693 | Test = 9423
-------Fold  2
# Pairs: Train = 37693 | Test = 9423
-------Fold  3
# Pairs: Train = 37693 | Test = 9423
-------Fold  4
# Pairs: Train = 37693 | Test = 9423
-----------------------Fold =  0
Train:





tn = 16134, fp = 2756, fn = 3363, tp = 15439
y_pred: 0 = 19497 | 1 = 18195
y_true: 0 = 18890 | 1 = 18802
acc=0.8377|precision=0.8485|recall=0.8211|f1=0.8346|auc=0.9350|aupr=0.9393|pos_acc=0.8211|neg_acc=0.8275
Test:
tn = 3982, fp = 686, fn = 884, tp = 3872
y_pred: 0 = 4866 | 1 = 4558
y_true: 0 = 4668 | 1 = 4756
acc=0.8334|precision=0.8495|recall=0.8141|f1=0.8314|auc=0.9338|aupr=0.9405|pos_acc=0.8141|neg_acc=0.8183
-----------------------Fold =  1
Train:
tn = 16077, fp = 2744, fn = 3418, tp = 15454
y_pred: 0 = 19495 | 1 = 18198
y_true: 0 = 18821 | 1 = 18872
acc=0.8365|precision=0.8492|recall=0.8189|f1=0.8338|auc=0.9348|aupr=0.9395|pos_acc=0.8189|neg_acc=0.8247
Test:
tn = 4039, fp = 698, fn = 829, tp = 3857
y_pred: 0 = 4868 | 1 = 4555
y_true: 0 = 4737 | 1 = 4686
acc=0.8379|precision=0.8468|recall=0.8231|f1=0.8348|auc=0.9371|aupr=0.9410|pos_acc=0.8231|neg_acc=0.8297
-----------------------Fold =  2
Train:
tn = 16112, fp = 2769, fn = 3413, tp = 15399
y_pred: 0 = 19525 | 1 = 18168
y_true: 0

In [13]:
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp','Tg','Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'DT')
        precision, recall, threshold = precision_recall_curve(ys_test[0], ys_test[2], pos_label=1)
        data1 = pd.DataFrame(zip(precision, recall), columns=['precision','recall']).to_csv(r'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\评价指标统计图\prc\DT.csv')


5it [00:00, 227.25it/s]

# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# Pairs: Train = 37692 | Test = 9424
-------Fold  1
# Pairs: Train = 37693 | Test = 9423
-------Fold  2
# Pairs: Train = 37693 | Test = 9423
-------Fold  3
# Pairs: Train = 37693 | Test = 9423
-------Fold  4
# Pairs: Train = 37693 | Test = 9423
-----------------------Fold =  0





Train:
tn = 18890, fp = 0, fn = 0, tp = 18802
y_pred: 0 = 18890 | 1 = 18802
y_true: 0 = 18890 | 1 = 18802
acc=1.0000|precision=1.0000|recall=1.0000|f1=1.0000|auc=1.0000|aupr=1.0000|pos_acc=1.0000|neg_acc=1.0000
Test:
tn = 3806, fp = 862, fn = 922, tp = 3834
y_pred: 0 = 4728 | 1 = 4696
y_true: 0 = 4668 | 1 = 4756
acc=0.8107|precision=0.8164|recall=0.8061|f1=0.8113|auc=0.8107|aupr=0.8602|pos_acc=0.8061|neg_acc=0.8050
-----------------------Fold =  1
Train:
tn = 18821, fp = 0, fn = 0, tp = 18872
y_pred: 0 = 18821 | 1 = 18872
y_true: 0 = 18821 | 1 = 18872
acc=1.0000|precision=1.0000|recall=1.0000|f1=1.0000|auc=1.0000|aupr=1.0000|pos_acc=1.0000|neg_acc=1.0000
Test:
tn = 3802, fp = 935, fn = 846, tp = 3840
y_pred: 0 = 4648 | 1 = 4775
y_true: 0 = 4737 | 1 = 4686
acc=0.8110|precision=0.8042|recall=0.8195|f1=0.8118|auc=0.8110|aupr=0.8567|pos_acc=0.8195|neg_acc=0.8180
-----------------------Fold =  2
Train:
tn = 18881, fp = 0, fn = 0, tp = 18812
y_pred: 0 = 18881 | 1 = 18812
y_true: 0 = 18881 | 

In [14]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'SGD')
        precision, recall, threshold = precision_recall_curve(ys_test[0], ys_test[2], pos_label=1)
        data1 = pd.DataFrame(zip(precision, recall), columns=['precision','recall']).to_csv(r'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\评价指标统计图\prc\SGD.csv')
#         fpr, tpr, thresholds = roc_curve(ys_test[0], ys_test[2])
#         data = pd.DataFrame(zip(fpr, tpr), columns=['fpr','tpr']).to_csv(r'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\评价指标统计图\roc\SGD.csv',index=False)

5it [00:00, 294.00it/s]

# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# Pairs: Train = 37692 | Test = 9424
-------Fold  1
# Pairs: Train = 37693 | Test = 9423
-------Fold  2
# Pairs: Train = 37693 | Test = 9423
-------Fold  3
# Pairs: Train = 37693 | Test = 9423
-------Fold  4
# Pairs: Train = 37693 | Test = 9423
-----------------------Fold =  0
Train:





tn = 18814, fp = 76, fn = 10490, tp = 8312
y_pred: 0 = 29304 | 1 = 8388
y_true: 0 = 18890 | 1 = 18802
acc=0.7197|precision=0.9909|recall=0.4421|f1=0.6114|auc=0.8491|aupr=0.9039|pos_acc=0.4421|neg_acc=0.6420
Test:
tn = 4657, fp = 11, fn = 2643, tp = 2113
y_pred: 0 = 7300 | 1 = 2124
y_true: 0 = 4668 | 1 = 4756
acc=0.7184|precision=0.9948|recall=0.4443|f1=0.6142|auc=0.8525|aupr=0.9083|pos_acc=0.4443|neg_acc=0.6379
-----------------------Fold =  1
Train:
tn = 16867, fp = 1954, fn = 3538, tp = 15334
y_pred: 0 = 20405 | 1 = 17288
y_true: 0 = 18821 | 1 = 18872
acc=0.8543|precision=0.8870|recall=0.8125|f1=0.8481|auc=0.9143|aupr=0.9322|pos_acc=0.8125|neg_acc=0.8266
Test:
tn = 4266, fp = 471, fn = 894, tp = 3792
y_pred: 0 = 5160 | 1 = 4263
y_true: 0 = 4737 | 1 = 4686
acc=0.8551|precision=0.8895|recall=0.8092|f1=0.8475|auc=0.9163|aupr=0.9340|pos_acc=0.8092|neg_acc=0.8267
-----------------------Fold =  2
Train:
tn = 14513, fp = 4368, fn = 1187, tp = 17625
y_pred: 0 = 15700 | 1 = 21993
y_true: 0 = 

In [18]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'LR')
        precision, recall, threshold = precision_recall_curve(ys_test[0], ys_test[2], pos_label=1)
        data1 = pd.DataFrame(zip(precision, recall), columns=['precision','recall']).to_csv(r'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\评价指标统计图\prc\LR.csv')

23558
1528
10396
86
121
1179
126
5
14
45
4992
7
67
312
853
148
29
831
45
476
2239
1
51
1
6
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 45066 | Test = 2050
-------Fold  1
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 43726 | Test = 3390
-------Fold  2
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 42256 | Test = 4860
-------Fold  3
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 44564 | Test = 2552
-------Fold  4
# peco_idx: Train = 16 | Test = 8
# Pairs: Train = 12852 | Test = 34264
-----------------------Fold =  0
Train:
tn = 11772, fp = 10497, fn = 10988, tp = 11809
y_pred: 0 = 22760 | 1 = 22306
y_true: 0 = 22269 | 1 = 22797
acc=0.5233|precision=0.5294|recall=0.5180|f1=0.5236|auc=0.5358|aupr=0.5405|pos_acc=0.5180|neg_acc=0.5172
Test:
tn = 628, fp = 661, fn = 396, tp = 365
y_pred: 0 = 1024 | 1 = 1026
y_true: 0 = 1289 | 1 = 761
acc=0.4844|precision=0.3558|recall=0.4796|f1=0.4085|auc=0.4903|aupr=

In [31]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'MLP')
        precision, recall, threshold = precision_recall_curve(ys_test[0], ys_test[2], pos_label=1)
        data1 = pd.DataFrame(zip(precision, recall), columns=['precision','recall']).to_csv(r'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\评价指标统计图\prc\MLP.csv')

23558
1528
10396
86
121
1179
126
5
14
45
4992
7
67
312
853
148
29
831
45
476
2239
1
51
1
6
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 45322 | Test = 1794
-------Fold  1
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 44404 | Test = 2712
-------Fold  2
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 15386 | Test = 31730
-------Fold  3
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 45144 | Test = 1972
-------Fold  4
# peco_idx: Train = 16 | Test = 8
# Pairs: Train = 38208 | Test = 8908
-----------------------Fold =  0
Train:
tn = 18237, fp = 4188, fn = 14837, tp = 8060
y_pred: 0 = 33074 | 1 = 12248
y_true: 0 = 22425 | 1 = 22897
acc=0.5802|precision=0.6581|recall=0.3520|f1=0.4587|auc=0.6341|aupr=0.6495|pos_acc=0.3520|neg_acc=0.5514
Test:
tn = 770, fp = 363, fn = 453, tp = 208
y_pred: 0 = 1223 | 1 = 571
y_true: 0 = 1133 | 1 = 661
acc=0.5452|precision=0.3643|recall=0.3147|f1=0.3377|auc=0.5126|aupr=0.3

In [38]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp','Tg']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'GBDT')
        precision, recall, threshold = precision_recall_curve(ys_test[0], ys_test[2], pos_label=1)
        data1 = pd.DataFrame(zip(precision, recall), columns=['precision','recall']).to_csv(r'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\评价指标统计图\prc\GBDT.csv')

5it [00:00, 147.06it/s]

# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# Pairs: Train = 37692 | Test = 9424
-------Fold  1
# Pairs: Train = 37693 | Test = 9423
-------Fold  2
# Pairs: Train = 37693 | Test = 9423
-------Fold  3
# Pairs: Train = 37693 | Test = 9423
-------Fold  4
# Pairs: Train = 37693 | Test = 9423
-----------------------Fold =  0





Train:
tn = 16259, fp = 2631, fn = 1917, tp = 16885
y_pred: 0 = 18176 | 1 = 19516
y_true: 0 = 18890 | 1 = 18802
acc=0.8793|precision=0.8652|recall=0.8980|f1=0.8813|auc=0.9567|aupr=0.9584|pos_acc=0.8980|neg_acc=0.8945
Test:
tn = 3944, fp = 724, fn = 565, tp = 4191
y_pred: 0 = 4509 | 1 = 4915
y_true: 0 = 4668 | 1 = 4756
acc=0.8632|precision=0.8527|recall=0.8812|f1=0.8667|auc=0.9432|aupr=0.9452|pos_acc=0.8812|neg_acc=0.8747
-----------------------Fold =  1
Train:
tn = 16297, fp = 2524, fn = 2004, tp = 16868
y_pred: 0 = 18301 | 1 = 19392
y_true: 0 = 18821 | 1 = 18872
acc=0.8799|precision=0.8698|recall=0.8938|f1=0.8817|auc=0.9567|aupr=0.9585|pos_acc=0.8938|neg_acc=0.8905
Test:
tn = 4040, fp = 697, fn = 572, tp = 4114
y_pred: 0 = 4612 | 1 = 4811
y_true: 0 = 4737 | 1 = 4686
acc=0.8653|precision=0.8551|recall=0.8779|f1=0.8664|auc=0.9450|aupr=0.9443|pos_acc=0.8779|neg_acc=0.8760
-----------------------Fold =  2
Train:
tn = 16330, fp = 2551, fn = 1955, tp = 16857
y_pred: 0 = 18285 | 1 = 19408
y_