In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from collections import Counter
import re
from tqdm import trange

In [5]:
pwd = '../../data/'
peco_id_name = pd.read_excel(pwd + 'peco_name.xlsx')
gene_id_name = pd.read_excel(pwd + 'gene_name.xlsx')

In [6]:
def metrics(y_true, y_pred, y_prob):

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    pos_acc = tp / sum(y_true)
    neg_acc = tn / (len(y_pred) - sum(y_pred)) # [y_true=0 & y_pred=0] / y_pred=0
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    
    recall = tp / (tp+fn)
    precision = tp / (tp+fp)
    f1 = 2*precision*recall / (precision+recall)
    
    roc_auc = roc_auc_score(y_true, y_prob)
    prec, reca, _ = precision_recall_curve(y_true, y_prob)
    aupr = auc(reca, prec)
    average1 = (accuracy + precision + recall + roc_auc + aupr) / 5
    average2 = (accuracy + f1 + roc_auc + aupr) / 4
    average3 = (f1 + aupr) / 2
    print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
    print('y_pred: 0 = {} | 1 = {}'.format(Counter(y_pred)[0], Counter(y_pred)[1]))
    print('y_true: 0 = {} | 1 = {}'.format(Counter(y_true)[0], Counter(y_true)[1]))
    print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}|pos_acc={:.4f}|neg_acc={:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc))
    print('{:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr, average1, average2, average3))

In [25]:
def train_test_file():
    train_test_id_idx = np.load('../../data/task_Tp__testlabel0_knn_edge_train_test_index_all.npz', allow_pickle = True)
    train_index_all = train_test_id_idx['train_index_all']
    test_index_all = train_test_id_idx['test_index_all']
    train_id_all = train_test_id_idx['train_id_all'] # 'gene', 'peco'
    test_id_all = train_test_id_idx['test_id_all'] # 'gene', 'peco'
    return test_index_all, test_id_all, (train_index_all, train_id_all)

In [58]:
def balanced_results_file(): #weight = None
    file = np.load("ys.npz")
    y_true_train, y_pred_train, y_prob_train = file['arr_0'][0], file['arr_0'][1], file['arr_0'][2]
    y_true_test, y_pred_test, y_prob_test = file['arr_1'][0], file['arr_1'][1], file['arr_1'][2] 
    
    print('Train:')
    metrics(y_true_train, y_pred_train, y_prob_train)
    print('Test:')
    metrics(y_true_test, y_pred_test, y_prob_test)
    
    return y_true_test, y_pred_test, y_prob_test, (y_true_train, y_pred_train, y_prob_train)

In [112]:
df = pd.read_csv('Tp_balanced_case_study_0.csv').groupby('trait')
for name,group in df:
    a = pd.DataFrame(group)
    if a['y_true'].mean()==0 or a['y_true'].mean()==1:        
        print('**')
    else:
        print(name)
        metrics(a['y_true'], a['y_pred'], a['y_prob'])        
        print('*******************')

5-methyltryptophan exposure
tn = 204, fp = 21, fn = 2, tp = 23
y_pred: 0 = 206 | 1 = 44
y_true: 0 = 225 | 1 = 25
acc=0.9080|precision=0.5227|recall=0.9200|f1=0.6667|auc=0.9737|aupr=0.8737|pos_acc=0.9200|neg_acc=0.9903
0.9080, 0.5227, 0.9200, 0.6667, 0.9737, 0.8737, 0.8396, 0.8555, 0.7702
*******************
Magnaporthe grisea exposure
tn = 106, fp = 10, fn = 72, tp = 911
y_pred: 0 = 178 | 1 = 921
y_true: 0 = 116 | 1 = 983
acc=0.9254|precision=0.9891|recall=0.9268|f1=0.9569|auc=0.9740|aupr=0.9969|pos_acc=0.9268|neg_acc=0.5955
0.9254, 0.9891, 0.9268, 0.9569, 0.9740, 0.9969, 0.9624, 0.9633, 0.9769
*******************
Nilaparvata lugens exposure
tn = 182, fp = 29, fn = 4, tp = 12
y_pred: 0 = 186 | 1 = 41
y_true: 0 = 211 | 1 = 16
acc=0.8546|precision=0.2927|recall=0.7500|f1=0.4211|auc=0.9162|aupr=0.4719|pos_acc=0.7500|neg_acc=0.9785
0.8546, 0.2927, 0.7500, 0.4211, 0.9162, 0.4719, 0.6571, 0.6659, 0.4465
*******************
Pseudomonas avenae exposure
tn = 174, fp = 15, fn = 2, tp = 11
y_pred

  f1 = 2*precision*recall / (precision+recall)


In [59]:
def sample(random_seed):
    all_associations = pd.read_csv('D:/小麦/MDA-GCNFTG-main/MDA-GCNFTG-main/data/all_gpe_pairs.csv', names=['gene', 'disease', 'label'])
    known_associations = all_associations.loc[all_associations['label'] == 1]
    unknown_associations = all_associations.loc[all_associations['label'] == 0]
    random_negative = unknown_associations.sample(n=known_associations.shape[0], random_state=random_seed, axis=0)

    sample_df = known_associations.append(random_negative)
    sample_df.reset_index(drop=True, inplace=True)

    return sample_df

In [78]:
def run_balanced_Tp(task, balance, knn, lr):
    test_index_all, test_id_all, _ = train_test_file()# '__nobalance'

    for i in range(1):
        print('==== Fold ', i)
        y_true_test, y_pred_test, y_prob_test, _ = balanced_results_file()

        if i == 0:
             y_true_test_all, y_pred_test_all, y_prob_test_all = y_true_test, y_pred_test, y_prob_test
            
        else:
            y_true_test_all = np.vstack([y_true_test_all, y_true_test])
            y_pred_test_all = np.vstack([y_pred_test_all, y_pred_test])
            y_prob_test_all = np.vstack([y_prob_test_all, y_prob_test])
            assert (y_prob_test_all[i] == y_prob_test).all()

    results_df = pd.DataFrame(test_id_all[0].reshape(-1, 2), columns = ['gene', 'peco'])
    print(len(results_df))
    print(len(y_true_test_all.reshape(-1)))
    results_df['y_true'] = y_true_test_all.reshape(-1)
    results_df['y_pred'] = y_pred_test_all.reshape(-1)
    results_df['y_prob'] = y_prob_test_all.reshape(-1)

    print(results_df)
    print(gene_id_name)
    results_df = pd.merge(results_df, gene_id_name, left_on = 'gene', right_index = True)
    results_df = pd.merge(results_df, peco_id_name, left_on = 'peco', right_index = True)
    #results_df.drop(labels = ['id_x', 'id_y'], axis = 1, inplace = True)
    results_df.sort_values(by = ['peco', 'y_prob'], ascending = False, inplace = True)
    
    results_df.to_csv(task + '_balanced_case_study_0.csv')
    
    return results_df

In [79]:
def run_balanced_Tgp(task, balance, knn, lr):
    dtp = sample(random_seed = 1234)
    test_index_all, test_id_all, _ = train_test_file(task, balance)# '__nobalance'

    for i in range(5):
        print('==== Fold ', i)
        y_true_test, y_pred_test, y_prob_test, _ = balanced_results_file(task, knn, lr, fold = i)

        temp = dtp.iloc[test_index_all[i]][['gene', 'peco']]
        if i == 0:
            y_true_test_all, y_pred_test_all, y_prob_test_all = y_true_test, y_pred_test, y_prob_test
            
            results_df = temp
        else:
            y_true_test_all = np.hstack([y_true_test_all, y_true_test])
            y_pred_test_all = np.hstack([y_pred_test_all, y_pred_test])
            y_prob_test_all = np.hstack([y_prob_test_all, y_prob_test])
            
            results_df = pd.concat([results_df, temp], axis = 0)
            
    results_df['y_true'] = y_true_test_all.reshape(-1)
    results_df['y_pred'] = y_pred_test_all.reshape(-1)
    results_df['y_prob'] = y_prob_test_all.reshape(-1)

    results_df = pd.merge(results_df, gene_id_name, left_on = 'gene', right_on = 'id')
    results_df = pd.merge(results_df, peco_id_name, left_on = 'peco', right_on = 'id')
    results_df.drop(labels = ['id_x', 'id_y'], axis = 1, inplace = True)
    results_df.sort_values(by = ['peco_x', 'y_prob'], ascending = False, inplace = True)
    
    results_df.to_csv(task + '_balanced_case_study_0.csv')
    
    return results_df

# Run balanced

In [80]:
results_Tp_balanced = run_balanced_Tp(task = 'Tp', balance = '', knn = '10knn', lr = 0.001)
results_Tp_balanced

==== Fold  0
Train:
tn = 1336, fp = 113, fn = 172, tp = 1250
y_pred: 0 = 1508 | 1 = 1363
y_true: 0 = 1449 | 1 = 1422
acc=0.9007|precision=0.9171|recall=0.8790|f1=0.8977|auc=0.9685|aupr=0.9699|pos_acc=0.8790|neg_acc=0.8859
0.9007, 0.9171, 0.8790, 0.8977, 0.9685, 0.9699, 0.9270, 0.9342, 0.9338
Test:
tn = 4133, fp = 533, fn = 645, tp = 4113
y_pred: 0 = 4778 | 1 = 4646
y_true: 0 = 4666 | 1 = 4758
acc=0.8750|precision=0.8853|recall=0.8644|f1=0.8747|auc=0.9428|aupr=0.9492|pos_acc=0.8644|neg_acc=0.8650
0.8750, 0.8853, 0.8644, 0.8747, 0.9428, 0.9492, 0.9033, 0.9104, 0.9120
9424
9424
       gene  peco  y_true  y_pred    y_prob
0        33     0     1.0     1.0  0.997558
1        94     0     1.0     1.0  0.998668
2        97     0     1.0     1.0  0.998778
3       155     0     1.0     1.0  0.999781
4       214     0     1.0     1.0  1.000000
...     ...   ...     ...     ...       ...
9419   4839     3     0.0     1.0  0.508719
9420   5250    19     0.0     0.0  0.109928
9421   7394    14     

Unnamed: 0,gene,gene_x,peco,y_true,y_pred,y_prob,gene_y,trait
7471,2020,2020,31,0.0,1.0,0.986779,LOC_Os06g20410,laboratory study
6785,5236,5236,31,0.0,1.0,0.961113,LOC_Os04g12890,laboratory study
5374,9991,9991,31,0.0,1.0,0.945579,LOC_Os05g46720,laboratory study
6716,8717,8717,31,0.0,1.0,0.943131,LOC_Os01g29409,laboratory study
6939,4814,4814,31,0.0,1.0,0.931262,LOC_Os06g22550,laboratory study
...,...,...,...,...,...,...,...,...
8885,2330,2330,0,0.0,0.0,0.000093,LOC_Os11g38860,sodium chloride exposure
7238,6070,6070,0,0.0,0.0,0.000074,LOC_Os03g24860,sodium chloride exposure
8010,6463,6463,0,0.0,0.0,0.000067,LOC_Os02g32760,sodium chloride exposure
7237,8836,8836,0,0.0,0.0,0.000035,LOC_Os04g58840,sodium chloride exposure
