In [1]:
import pandas as pd
import os
import collections
import random
import logging

In [2]:
# Load the base edge table. The preview of this frame shows the columns
# ':START_ID', ':END_ID' and ':TYPE', plus an 'Unnamed: 0' column
# (presumably an index that was written out with the CSV -- confirm).
edges = pd.read_csv('data/hetnet_baseedges.csv')

In [3]:
# Show the first two rows to check the column layout before permuting.
edges.head(2)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,Q71969,Q221668,medical-condition-treated_CmD
1,Q71969,Q246084,medical-condition-treated_CmD


In [4]:
def get_pair_lists(edges):
    """Split an edge table into one list of (start, end) pairs per edge type.

    Parameters
    ----------
    edges : pandas.DataFrame
        Must contain the columns ':START_ID', ':END_ID' and ':TYPE'.

    Returns
    -------
    dict
        Maps each distinct ':TYPE' value to a list of
        ``(':START_ID', ':END_ID')`` tuples, in the row order of `edges`.
        Keys are inserted in order of first appearance in `edges`.
    """
    edge_type_column = edges[':TYPE']

    pair_list_dict = dict()
    # Series.unique() returns values in order of first appearance. Iterating a
    # set() of strings instead would give an order that changes between
    # interpreter runs (string hash randomization), which in turn would change
    # the row order of everything built from this dict.
    for kind in edge_type_column.unique():
        edges_of_type = edges[edge_type_column == kind]
        pair_list_dict[kind] = list(
            zip(edges_of_type[':START_ID'], edges_of_type[':END_ID'])
        )

    return pair_list_dict


In [5]:
def permute_edges(edges, multiplier=10, seed=0):
    """Permute every edge type of an edge table independently.

    The table is split by ':TYPE' with `get_pair_lists`, each per-type pair
    list is shuffled by `permute_pair_list` (always with ``directed=True`` and
    the same `multiplier` and `seed`), and the results are reassembled with
    `pair_list_to_df`.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge table with ':START_ID', ':END_ID' and ':TYPE' columns.
    multiplier : number
        Forwarded to `permute_pair_list`.
    seed : object
        Forwarded to `permute_pair_list`; the same value is used for every
        edge type.

    Returns
    -------
    tuple
        ``(permuted_edges, all_stats)``: the permuted edge table and a flat
        list of the per-type statistics records, each tagged with 'metaedge'
        (the edge type) and 'abbrev' (the text after its last underscore).
    """
    shuffled_by_type = {}
    collected_stats = []

    for edge_type, pairs in get_pair_lists(edges).items():
        shuffled, type_stats = permute_pair_list(
            pairs, directed=True, multiplier=multiplier, seed=seed)
        shuffled_by_type[edge_type] = shuffled

        # Label each record with the edge type it describes
        for record in type_stats:
            record['metaedge'] = edge_type
            record['abbrev'] = edge_type.split('_')[-1]
        collected_stats += type_stats

    # Back to a DataFrame
    return pair_list_to_df(shuffled_by_type), collected_stats
        

In [6]:
def permute_pair_list(pair_list, directed=False, multiplier=10, excluded_pair_set=set(), seed=0, log=False):
    """
    If n_perm is not specific, perform 10 times the number of edges of permutations
    May not work for directed edges
    """
    random.seed(seed)

    pair_set = set(pair_list)
    assert len(pair_set) == len(pair_list)

    edge_number = len(pair_list)
    n_perm = int(edge_number * multiplier)

    count_same_edge = 0
    count_self_loop = 0
    count_duplicate = 0
    count_undir_dup = 0
    count_excluded = 0

    if log:
        logging.info('{} edges, {} permutations (seed = {}, directed = {}, {} excluded_edges)'.format(
            edge_number, n_perm, seed, directed, len(excluded_pair_set)))

    orig_pair_set = pair_set.copy()
    step = max(1, n_perm // 10)
    print_at = list(range(step, n_perm, step)) + [n_perm - 1]

    stats = list()
    for i in range(n_perm):

        # Same two random edges
        i_0 = random.randrange(edge_number)
        i_1 = random.randrange(edge_number)

        # Same edge selected twice
        if i_0 == i_1:
            count_same_edge += 1
            continue
        pair_0 = pair_list.pop(i_0)
        pair_1 = pair_list.pop(i_1 - 1 if i_0 < i_1 else i_1)

        new_pair_0 = pair_0[0], pair_1[1]
        new_pair_1 = pair_1[0], pair_0[1]

        valid = False
        for pair in new_pair_0, new_pair_1:
            if pair[0] == pair[1]:
                count_self_loop += 1
                break  # edge is a self-loop
            if pair in pair_set:
                count_duplicate += 1
                break  # edge is a duplicate
            if not directed and (pair[1], pair[0]) in pair_set:
                count_undir_dup += 1
                break  # edge is a duplicate
            if pair in excluded_pair_set:
                count_excluded += 1
                break  # edge is excluded
        else:
            # edge passed all validity conditions
            valid = True

        # If new edges are invalid
        if not valid:
            for pair in pair_0, pair_1:
                pair_list.append(pair)

        # If new edges are valid
        else:
            for pair in pair_0, pair_1:
                pair_set.remove(pair)
            for pair in new_pair_0, new_pair_1:
                pair_set.add(pair)
                pair_list.append(pair)

        if i in print_at:
            stat = collections.OrderedDict()
            stat['cumulative_attempts'] = i
            index = print_at.index(i)
            stat['attempts'] = print_at[index] + 1 if index == 0 else print_at[index] - print_at[index - 1]
            stat['complete'] = (i + 1) / n_perm
            stat['unchanged'] = len(orig_pair_set & pair_set) / len(pair_set)
            stat['same_edge'] = count_same_edge / stat['attempts']
            stat['self_loop'] = count_self_loop / stat['attempts']
            stat['duplicate'] = count_duplicate / stat['attempts']
            stat['undirected_duplicate'] = count_undir_dup / stat['attempts']
            stat['excluded'] = count_excluded / stat['attempts']
            stats.append(stat)

            count_same_edge = 0
            count_self_loop = 0
            count_duplicate = 0
            count_undir_dup = 0
            count_excluded = 0

    assert len(pair_set) == edge_number
    return pair_list, stats

In [7]:
# Convert IDPairs to edges
def pair_list_to_df(pair_list_dict):
    """Flatten a mapping of edge type -> pair list into a single edge table.

    Parameters
    ----------
    pair_list_dict : dict
        Maps an edge type to a sequence of pairs; element 0 of each pair is
        the start node and element 1 the end node.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ':START_ID', ':END_ID' and ':TYPE',
        grouped in the iteration order of `pair_list_dict`.
    """
    start_ids, end_ids, type_labels = [], [], []

    for edge_type, pairs in pair_list_dict.items():
        start_ids.extend(pair[0] for pair in pairs)
        end_ids.extend(pair[1] for pair in pairs)
        # Repeat the type label once per pair so the three columns stay aligned
        type_labels.extend([edge_type] * len(pairs))

    return pd.DataFrame({
        ':START_ID': start_ids,
        ':END_ID': end_ids,
        ':TYPE': type_labels,
    })

In [None]:
# Number of permuted networks to generate; the driver loop runs once per
# permutation and writes one CSV each time.
n_perms = 5
# Swap attempts per edge, forwarded to permute_edges: each edge type gets
# int(number_of_edges * multiplier) attempts.
multiplier = 5

In [None]:
%%time
stat_dfs = list()
permuted_edges = edges
 
for i in range(n_perms):
    i += 1
    print('Starting permutation', i)
    permuted_edges, stats = permute_edges(permuted_edges, multiplier=multiplier, seed=i)
    stat_df = pd.DataFrame(stats)
    stat_df['permutation'] = i
    stat_dfs.append(stat_df)
    path = os.path.join('data', 'permuted', 'hetnet_basedges_perm-{}.csv'.format(i))
    permuted_edges.to_csv(path, index = False)

# Save stats
stat_df = pd.concat(stat_dfs)
path = os.path.join('data', 'permuted', 'stats.tsv')
stat_df.to_csv(path, sep='\t', index=False, float_format='%.5g')

Starting permutation 1
Starting permutation 2
