In [1]:
# https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
# https://scikit-learn.org/stable/modules/tree.html#classification
# https://scikit-learn.org/stable/modules/sgd.html#classification
# https://scikit-learn.org/stable/modules/svm.html#classification
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier

In [1]:
import math
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn import preprocessing

import numpy as np
import pandas as pd

import time
import random
import matplotlib.pyplot as plt
from scipy import interp
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from collections import Counter
from tqdm import tqdm

In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve

In [3]:
def load_data(directory):
    GSSM = np.loadtxt(directory + '\GSSM_.txt',dtype=np.float32)
    PESSM = np.loadtxt(directory + '\PSSM.txt',dtype=np.float32,delimiter='\t')

    IPE = pd.DataFrame(PESSM).reset_index()
    IG = pd.DataFrame(GSSM).reset_index()
    IPE.rename(columns = {'index':'id'}, inplace = True)
    IG.rename(columns = {'index':'id'}, inplace = True)
    IPE['id'] = IPE['id']
    IG['id'] = IG['id']
    
    return IPE, IG

# def sample(directory, random_seed):
#     all_associations = pd.read_csv(directory + '/all_gpe_pairs.csv')
#     known_associations = all_associations.loc[all_associations['label'] == 1]
#     unknown_associations = all_associations.loc[all_associations['label'] == 0]
#     random_negative = unknown_associations.sample(n=known_associations.shape[0], random_state=random_seed, axis=0)

#     sample_df = known_associations.append(random_negative)
#     sample_df.reset_index(drop=True, inplace=True)

#     return sample_df
def sample(directory, random_seed):
    all_associations = pd.read_csv(directory + '/all_gpe_pairs.csv')
    known_associations = all_associations.loc[all_associations['label'] == 1]
    print(len(known_associations))
    peco_ids = list(set(known_associations['peco_idx']))
    unknown_associations = all_associations.loc[all_associations['label'] == 0]
    sample_df = known_associations
    for peco_id in peco_ids:
        random_negative = unknown_associations.loc[all_associations['peco_idx'] == peco_id].sample(n=known_associations.loc[all_associations['peco_idx'] == peco_id].shape[0], random_state=random_seed, axis=0, replace=True)
        print(len(random_negative))
        sample_df = pd.concat([sample_df,random_negative], axis=0)

    sample_df.reset_index(drop=True, inplace=True)

    return sample_df

In [4]:
def performances(y_true, y_pred, y_prob):

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels = [0, 1]).ravel().tolist()

    pos_acc = tp / sum(y_true)
    neg_acc = tn / (len(y_pred) - sum(y_pred)) # [y_true=0 & y_pred=0] / y_pred=0
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    
    recall = tp / (tp+fn)
    precision = tp / (tp+fp)
    f1 = 2*precision*recall / (precision+recall)
    
    roc_auc = roc_auc_score(y_true, y_prob)
    prec, reca, _ = precision_recall_curve(y_true, y_prob)
    aupr = auc(reca, prec)
    
    print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
    print('y_pred: 0 = {} | 1 = {}'.format(Counter(y_pred)[0], Counter(y_pred)[1]))
    print('y_true: 0 = {} | 1 = {}'.format(Counter(y_true)[0], Counter(y_true)[1]))
    print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}|pos_acc={:.4f}|neg_acc={:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc))
    return (y_true, y_pred, y_prob), (accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc)

In [5]:
def obtain_data(directory, isbalance):

    IPE, IG = load_data(directory)

    if isbalance:
        dtp = sample(directory, random_seed = 1234)
    else:
        dtp = pd.read_csv(directory + '/all_gene_peco_pairs.csv')

    gene_ids = list(set(dtp['gene_idx']))
    peco_ids = list(set(dtp['peco_idx']))
    random.shuffle(gene_ids)
    random.shuffle(peco_ids)
    print('# gene = {} | peco = {}'.format(len(gene_ids), len(peco_ids)))

    gene_test_num = int(len(gene_ids) / 5)
    peco_test_num = int(len(peco_ids) / 5)
    print('# Test: gene = {} | peco = {}'.format(gene_test_num, peco_test_num))    
    
    samples = pd.merge(pd.merge(dtp, IPE, left_on = 'peco_idx', right_on = 'id'), IG, left_on = 'gene_idx', right_on = 'id')
    samples.drop(labels = ['id_x', 'id_y'], axis = 1, inplace = True)
    
    return IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples

In [6]:
def generate_task_Tp_train_test_idx(samples):
    kf = KFold(n_splits = 5, shuffle = True, random_state = 1234)

    train_index_all, test_index_all, n = [], [], 0
    train_id_all, test_id_all = [], []
    fold = 0
    for train_idx, test_idx in tqdm(kf.split(samples.iloc[:, 3:])): #train_index与test_index为下标
        print('-------Fold ', fold)
        train_index_all.append(train_idx) 
        test_index_all.append(test_idx)

        train_id_all.append(np.array(dtp.iloc[train_idx][['gene_idx', 'peco_idx']]))
        test_id_all.append(np.array(dtp.iloc[test_idx][['gene_idx', 'peco_idx']]))

        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        fold += 1
    return train_index_all, test_index_all, train_id_all, test_id_all

In [7]:
def generate_task_Tg_Tpe_train_test_idx(item, ids, dtp):
    
    test_num = int(len(ids) / 5)
    
    train_index_all, test_index_all = [], []
    train_id_all, test_id_all = [], []
    
    for fold in range(5):
        print('-------Fold ', fold)
        if fold != 4:
            test_ids = ids[fold * test_num : (fold + 1) * test_num]
        else:
            test_ids = ids[fold * test_num :]

        train_ids = list(set(ids) ^ set(test_ids))
        print('# {}: Train = {} | Test = {}'.format(item, len(train_ids), len(test_ids)))

        test_idx = dtp[dtp[item].isin(test_ids)].index.tolist()
        train_idx = dtp[dtp[item].isin(train_ids)].index.tolist()
        random.shuffle(test_idx)
        random.shuffle(train_idx)
        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        assert len(train_idx) + len(test_idx) == len(dtp)

        train_index_all.append(train_idx) 
        test_index_all.append(test_idx)
        
        train_id_all.append(train_ids)
        test_id_all.append(test_ids)
        
    return train_index_all, test_index_all, train_id_all, test_id_all

In [8]:
def run_clf(train_index_all, test_index_all, samples, classfier):
    
    fold = 0
    for train_idx, test_idx in zip(train_index_all, test_index_all):
        print('-----------------------Fold = ', str(fold))

        X = samples.iloc[:, 3:]
        y = samples['label']

        scaler = preprocessing.MinMaxScaler().fit(X.iloc[train_idx,:])
        X = scaler.transform(X)

        x_train, y_train = X[train_idx], y[train_idx]
        x_test, y_test = X[test_idx], y[test_idx]

        if classfier == 'ERT':
            clf = ExtraTreesClassifier(random_state = 19961231)
        elif classfier == 'GNB':
            clf = GaussianNB()
        elif classfier == 'DT':
            clf = DecisionTreeClassifier(random_state = 19961231)
        elif classfier == 'SGD':
            clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=5)
        elif classfier == 'SVM':
            clf = svm.SVC(C=0.1, kernel='sigmoid', degree=3, gamma='auto', probability=True)
            
        clf.fit(x_train, y_train)

        y_train_prob = clf.predict_proba(x_train)
        y_test_prob = clf.predict_proba(x_test)

        y_train_pred = clf.predict(x_train)
        y_test_pred = clf.predict(x_test)

        print('Train:')
        ys_train, metrics_train = performances(y_train, y_train_pred, y_train_prob[:, 1])
        print('Test:')
        ys_test, metrics_test = performances(y_test, y_test_pred, y_test_prob[:, 1])

        fold += 1
    
    return ys_train, metrics_train, ys_test, metrics_test

In [9]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tp']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'ERT')
        print(ys_test[0])
        print(ys_test[2])
        fpr, tpr, thresholds = roc_curve(ys_test[0], ys_test[2])
        plt.title('ROC')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.plot(fpr, tpr, '--*b', label="ours")
        plt.legend()
        plt.show()


KeyboardInterrupt: 

In [17]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'GNB')

23558
1528
10396
86
121
1179
126
5
14
45
4992
7
67
312
853
148
29
831
45
476
2239
1
51
1
6
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 45900 | Test = 1216
-------Fold  1
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 45234 | Test = 1882
-------Fold  2
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 24204 | Test = 22912
-------Fold  3
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 44326 | Test = 2790
-------Fold  4
# peco_idx: Train = 16 | Test = 8
# Pairs: Train = 28800 | Test = 18316
-----------------------Fold =  0
Train:
tn = 16204, fp = 6617, fn = 15692, tp = 7387
y_pred: 0 = 31896 | 1 = 14004
y_true: 0 = 22821 | 1 = 23079
acc=0.5140|precision=0.5275|recall=0.3201|f1=0.3984|auc=0.5235|aupr=0.5939|pos_acc=0.3201|neg_acc=0.5080
Test:
tn = 585, fp = 152, fn = 367, tp = 112
y_pred: 0 = 952 | 1 = 264
y_true: 0 = 737 | 1 = 479
acc=0.5732|precision=0.4242|recall=0.2338|f1=0.3015|auc=0.5493|aupr=0.49

In [18]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'DT')

23558
1528
10396
86
121
1179
126
5
14
45
4992
7
67
312
853
148
29
831
45
476
2239
1
51
1
6
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 36644 | Test = 10472
-------Fold  1
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 45244 | Test = 1872
-------Fold  2
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 23874 | Test = 23242
-------Fold  3
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 41704 | Test = 5412
-------Fold  4
# peco_idx: Train = 16 | Test = 8
# Pairs: Train = 40998 | Test = 6118
-----------------------Fold =  0
Train:
tn = 17256, fp = 0, fn = 0, tp = 19388
y_pred: 0 = 17256 | 1 = 19388
y_true: 0 = 17256 | 1 = 19388
acc=1.0000|precision=1.0000|recall=1.0000|f1=1.0000|auc=1.0000|aupr=1.0000|pos_acc=1.0000|neg_acc=1.0000
Test:
tn = 1465, fp = 4837, fn = 1087, tp = 3083
y_pred: 0 = 2552 | 1 = 7920
y_true: 0 = 6302 | 1 = 4170
acc=0.4343|precision=0.3893|recall=0.7393|f1=0.5100|auc=0.4859|aupr=0.

In [9]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item = 'gene_idx'
            ids = gene_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'SGD')

23558
1528
10396
86
121
1179
126
5
14
45
4992
7
67
312
853
148
29
831
45
476
2239
1
51
1
6
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 42462 | Test = 4654
-------Fold  1
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 41670 | Test = 5446
-------Fold  2
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 45022 | Test = 2094
-------Fold  3
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 36698 | Test = 10418
-------Fold  4
# peco_idx: Train = 16 | Test = 8
# Pairs: Train = 22612 | Test = 24504
-----------------------Fold =  0
Train:
tn = 20763, fp = 118, fn = 21289, tp = 292
y_pred: 0 = 42052 | 1 = 410
y_true: 0 = 20881 | 1 = 21581
acc=0.4959|precision=0.7122|recall=0.0135|f1=0.0266|auc=0.5041|aupr=0.6158|pos_acc=0.0135|neg_acc=0.4937
Test:
tn = 2666, fp = 11, fn = 1967, tp = 10
y_pred: 0 = 4633 | 1 = 21
y_true: 0 = 2677 | 1 = 1977
acc=0.5750|precision=0.4762|recall=0.0051|f1=0.0100|auc=0.5003|aupr=0.4412

In [11]:
directory = 'D:\小麦\MDA-GCNFTG-main\MDA-GCNFTG-main\data'
for isbalance in [True]:
    
    IPE, IG, dtp, gene_ids, peco_ids, gene_test_num, peco_test_num, samples = obtain_data(directory, 
                                                                                                 isbalance)
    for task in ['Tpe']:
        
        print('========== isbalance = {} | task = {}'.format(isbalance, task))
        
        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(samples)
            
        elif task == 'Tg':
            item =                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        elif task == 'Tpe':
            item = 'peco_idx'
            ids = peco_ids
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tg_Tpe_train_test_idx(item, ids, dtp)

        ys_train, metrics_train, ys_test, metrics_test = run_clf(train_index_all, test_index_all, samples, 'SVM')

23558
1528
10396
86
121
1179
126
5
14
45
4992
7
67
312
853
148
29
831
45
476
2239
1
51
1
6
# gene = 11177 | peco = 24
# Test: gene = 2235 | peco = 4
-------Fold  0
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 24546 | Test = 22570
-------Fold  1
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 35356 | Test = 11760
-------Fold  2
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 46486 | Test = 630
-------Fold  3
# peco_idx: Train = 20 | Test = 4
# Pairs: Train = 42536 | Test = 4580
-------Fold  4
# peco_idx: Train = 16 | Test = 8
# Pairs: Train = 39540 | Test = 7576
-----------------------Fold =  0


KeyboardInterrupt: 

In [1]:
print('hello')

hello
