# set dataset and undirected/directed

In [1]:
alp = "b"
from tqdm import tqdm

# load dataset

In [2]:
import scipy.io

alp2data = {"a": "iJO1366", "b": "iAF1260b", "c": "iAF692", "d": "iHN637"}
datafile = "./metabolic_reactions/" + alp2data[alp] + "/" + alp2data[alp] + ".mat"
loaded = scipy.io.loadmat(datafile)

# indices of hyperlinks and candidates in data

In [3]:
if alp == "a" or alp == "c" or alp == "d":
    hi = 10
    ci = 17
elif alp == "b":
    hi = 11
    ci = 18

# get hyperlinks and candidates

In [4]:
import numpy as np

data = loaded['Model'][0][0]
hyperlinks = np.array(data[hi].toarray())
candidates = data[ci]

In [5]:
hyperlinks.shape

(1668, 2388)

In [6]:
candidates.shape

(1668, 3630)

# get the directed hypergraph for a stoichiometric matrix

In [7]:
def getFilteredStoMatrix(stoichiometric_matrix):
    smT = []
    for j in tqdm(range(stoichiometric_matrix.shape[1])):
        pos, neg = [], []
        for i in range(stoichiometric_matrix.shape[0]):
            entry = stoichiometric_matrix[i][j]
            if entry < 0:
                neg.append(i)
            elif entry > 0:
                pos.append(i)
        n, p = len(neg), len(pos)
        if n > 1 and p > 1:
            smT.append(stoichiometric_matrix[:,j])
    sm = np.transpose(np.array(smT))
    return sm

def getNonZeroRows(hyperlinks, candidates):
    filt_hyperlinks = []
    filt_candidates = []

    for i, row in tqdm(enumerate(candidates)):
        if np.any(row):
            filt_candidates.append(row)
            filt_hyperlinks.append(hyperlinks[i])
            
    return np.array(filt_hyperlinks), np.array(filt_candidates)

hyperlinks = getFilteredStoMatrix(hyperlinks)
candidates = getFilteredStoMatrix(candidates)

hyperlinks, candidates = getNonZeroRows(hyperlinks, candidates)

100%|██████████| 2388/2388 [00:02<00:00, 1116.48it/s]
100%|██████████| 3630/3630 [00:03<00:00, 1115.19it/s]
1668it [00:00, 40260.92it/s]


In [8]:
print(hyperlinks.shape)
print(candidates.shape)

(1542, 1544)
(1542, 2588)


In [9]:
def getDirectedHypergraph(stoichiometric_matrix):
    dic_dir = {}
    for j in tqdm(range(stoichiometric_matrix.shape[1])):
        pos, neg = [], []
        for i in range(stoichiometric_matrix.shape[0]):
            entry = stoichiometric_matrix[i][j]
            if entry < 0:
                neg.append(i)
            elif entry > 0:
                pos.append(i)
        n, p = len(neg), len(pos)
        assert(n > 1 and p > 1)
        k = str(j)
        dic_dir[k] = [neg,pos]
    return  dic_dir

dhyperlinks = getDirectedHypergraph(hyperlinks)
dcandidates = getDirectedHypergraph(candidates)

100%|██████████| 1544/1544 [00:01<00:00, 1183.64it/s]
100%|██████████| 2588/2588 [00:02<00:00, 1183.03it/s]


In [10]:
dhyperlinks

{'0': [[19, 415, 863], [18, 317, 868, 1227]],
 '1': [[57, 870], [56, 868]],
 '2': [[55, 865], [213, 870]],
 '3': [[53, 865], [156, 870]],
 '4': [[52, 865], [152, 870]],
 '5': [[54, 865], [160, 870]],
 '6': [[73, 415, 563], [359, 1170, 1244]],
 '7': [[75, 415, 1449], [359, 1172, 1244]],
 '8': [[77, 415, 1452], [359, 1174, 1244]],
 '9': [[79, 415, 875], [359, 1176, 1244]],
 '10': [[81, 415, 878], [359, 1178, 1244]],
 '11': [[83, 415, 1124], [359, 1180, 1244]],
 '12': [[85, 415, 1127], [359, 1182, 1244]],
 '13': [[87, 415, 563], [359, 1186, 1244]],
 '14': [[89, 415, 1449], [359, 1188, 1244]],
 '15': [[91, 415, 1452], [359, 1190, 1244]],
 '16': [[93, 415, 875], [359, 1192, 1244]],
 '17': [[95, 415, 878], [359, 1194, 1244]],
 '18': [[97, 415, 1124], [359, 1196, 1244]],
 '19': [[99, 415, 1127], [359, 1198, 1244]],
 '20': [[114, 868, 1084], [787, 1083]],
 '21': [[114, 868, 1086], [787, 1085]],
 '22': [[115, 868, 1084], [916, 1083]],
 '23': [[115, 868, 1086], [916, 1085]],
 '24': [[126, 863], 

In [11]:
len(dhyperlinks.keys())

1544

In [12]:
len(dcandidates.keys())

2588

In [13]:
dcandidates_list = []
filt_dcandidates = {}

for k in dcandidates:
    link = dcandidates[k]
    
    if link not in dcandidates_list:
        dcandidates_list.append(link)
        filt_dcandidates[k] = link
dcandidates = filt_dcandidates.copy()

In [14]:
len(dcandidates.keys())

2513

# get hits and misses of hyperlinks in candidates

In [15]:
import collections
from tqdm import tqdm

def getHitsandMisses(dhyperlinks, dcandidates):
    compare = lambda x, y: collections.Counter(x) == collections.Counter(y)
    ckeys = list(dcandidates.keys())
    hits, misses, flag = [], [], [0]*len(ckeys)
    
    print(len(dcandidates))
    print(len(dhyperlinks))

    print("getting all indices of hyperlinks in candidates")
    for K in tqdm(dhyperlinks.keys()):
        DHL = dhyperlinks[K]
        T, H = DHL[0], DHL[1]

        for i, k in enumerate(ckeys):
            dhl = dcandidates[k]
            t, h = dhl[0], dhl[1]
            if compare(t, T):
                if compare(h, H):
                    hits.append([K,k])
                    flag[i] = 1
    print(len(flag))
    print(sum(flag))
    for i in range(len(ckeys)):
        if flag[i] == 0:
            misses.append(ckeys[i])
    
    return hits, misses

hits, misses =  getHitsandMisses(dhyperlinks, dcandidates)          

  0%|          | 7/1544 [00:00<00:24, 62.00it/s]

2513
1544
getting all indices of hyperlinks in candidates


100%|██████████| 1544/1544 [00:24<00:00, 62.46it/s]

2513
1526





# sanity check

In [16]:
def sanityCheck(dhyperlinks, dcandidates, hits, misses):
    compare = lambda x, y: collections.Counter(x) == collections.Counter(y)
    flag = True
    
    for m in tqdm(misses):
        DHL = dcandidates[m]
        T, H = DHL[0], DHL[1]

        for k in dhyperlinks.keys():
            dhl = dhyperlinks[k]
            t, h = dhl[0], dhl[1]
            if compare(t, T) and compare(h, H):
                flag = False
    
    for pair in tqdm(hits):
        k = pair[1]
        DHL = dcandidates[k]
        T, H = DHL[0], DHL[1]
        notFound = True

        for k in dhyperlinks.keys():

            dhl = dhyperlinks[k]
            t, h = dhl[0], dhl[1]
            if compare(t, T) and compare(h, H):
                notFound = False
                break
        if notFound:
              flag = False
    
    if flag:
        print("sanity check successfull")
    else:
        print("failed!")

sanityCheck(dhyperlinks, dcandidates, hits, misses)

100%|██████████| 987/987 [00:09<00:00, 102.28it/s]
100%|██████████| 1544/1544 [00:07<00:00, 204.81it/s]

sanity check successfull





In [17]:
def appendColumn(matrix, V, hyperedge):
    column = np.zeros(V)
    for node in hyperedge:
        column[node] = 1
    if not any(np.array_equal(column, m) for m in matrix):
        matrix.append(column)
    ind = -1
    for i in range(0, len(matrix)):
        if np.array_equal(matrix[i], column):
            ind = i
            break
    return matrix, ind

# construct S and U matrices

In [18]:
def getHyperlinksAndCandidates(dhyperlinks, dcandidates, hits, misses):
    V = candidates.shape[0]
    compare = lambda x, y: collections.Counter(x) == collections.Counter(y)
    ST = [] # S transpose
    UT = [] # U transpose
    candidate_dir = []
    hyperlink_dir = []

    for pair in tqdm(hits):
        K, k = pair[0], pair[1]
        DHL, dhl = dhyperlinks[K], dcandidates[k]
        T, H, t, h = DHL[0], DHL[1], dhl[0], dhl[1]

        assert(compare(T,t) and compare(H,h))
        
        ST, indsT = appendColumn(ST, V, T)
        ST, indsH = appendColumn(ST, V, H)

        UT, induT = appendColumn(UT, V, t)
        UT, induH = appendColumn(UT, V, h)
        
        assert(indsT == induT and indsH == induH)
        
        candidate_dir.append((induT, induH))
        hyperlink_dir.append((indsT, indsH))

    for m in tqdm(misses):
        dhl = dcandidates[m]
        t, h = dhl[0], dhl[1]
        UT, induT = appendColumn(UT, V, t)
        UT, induH = appendColumn(UT, V, h)
    
        candidate_dir.append((induT, induH))

    return ST, UT, hyperlink_dir, candidate_dir

ST, UT, hyperlink_dir, candidate_dir = getHyperlinksAndCandidates(dhyperlinks, dcandidates, hits, misses)

100%|██████████| 1544/1544 [01:33<00:00, 16.54it/s]
100%|██████████| 987/987 [01:08<00:00, 14.51it/s]


In [19]:
dcandidates

{'0': [[19, 415, 863], [18, 317, 868, 1227]],
 '1': [[57, 870], [56, 868]],
 '2': [[55, 865], [213, 870]],
 '3': [[53, 865], [156, 870]],
 '4': [[52, 865], [152, 870]],
 '5': [[54, 865], [160, 870]],
 '6': [[73, 415, 563], [359, 1170, 1244]],
 '7': [[75, 415, 1449], [359, 1172, 1244]],
 '8': [[77, 415, 1452], [359, 1174, 1244]],
 '9': [[79, 415, 875], [359, 1176, 1244]],
 '10': [[81, 415, 878], [359, 1178, 1244]],
 '11': [[83, 415, 1124], [359, 1180, 1244]],
 '12': [[85, 415, 1127], [359, 1182, 1244]],
 '13': [[87, 415, 563], [359, 1186, 1244]],
 '14': [[89, 415, 1449], [359, 1188, 1244]],
 '15': [[91, 415, 1452], [359, 1190, 1244]],
 '16': [[93, 415, 875], [359, 1192, 1244]],
 '17': [[95, 415, 878], [359, 1194, 1244]],
 '18': [[97, 415, 1124], [359, 1196, 1244]],
 '19': [[99, 415, 1127], [359, 1198, 1244]],
 '20': [[114, 868, 1084], [787, 1083]],
 '21': [[114, 868, 1086], [787, 1085]],
 '22': [[115, 868, 1084], [787, 1083]],
 '23': [[115, 868, 1086], [787, 1085]],
 '24': [[115, 868, 1

In [20]:
len(hyperlink_dir)

1544

In [21]:
len(candidate_dir)

2531

In [22]:
import random
def addNegativeHyperlinks(UT):
    V = candidates.shape[0]
    NHL = len(UT) # number of negative hyperlinks
    print("adding", NHL, "negative hyperlinks")
    for i in range(NHL):
        size = np.random.randint(3,5)
        sample = random.sample(range(V), size)
        UT, _ = appendColumn(UT, V, sample)
    return UT

UT = addNegativeHyperlinks(UT)

S = np.transpose(np.array(ST))
U = np.transpose(np.array(UT))

adding 4411 negative hyperlinks


In [23]:
zero_indices = []
for i in range(U.shape[0]):
    if U[i,:].sum() == 0:
        zero_indices.append(i)

'''
SF = np.delete(S, zero_indices, 0)
UF = np.delete(U, zero_indices, 0)
'''

'\nSF = np.delete(S, zero_indices, 0)\nUF = np.delete(U, zero_indices, 0)\n'

In [24]:
len(zero_indices)

0

In [25]:
S.shape

(1542, 2930)

In [26]:
U.shape

(1542, 8822)

In [27]:
c = 0
compare = lambda x, y: collections.Counter(x) == collections.Counter(y)
UT = np.transpose(U)
UT = UT.tolist()
print(len(UT))

8822


In [28]:
import pickle


with open(alp2data[alp] + '_hyperlinks.pkl', 'wb') as handle:
    pickle.dump(S, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(alp2data[alp] + '_candidates.pkl', 'wb') as handle:
    pickle.dump(U, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(alp2data[alp] + '_directions_hyperlinks.pkl', 'wb') as handle:
    pickle.dump(hyperlink_dir, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(alp2data[alp] + '_directions_candidates.pkl', 'wb') as handle:
    pickle.dump(candidate_dir, handle, protocol=pickle.HIGHEST_PROTOCOL)

# "r" run all cells

In [29]:
get_ipython().run_cell_magic('javascript', '', "\nJupyter.keyboard_manager.command_shortcuts.add_shortcut('r', {\n    help : 'run all cells',\n    help_index : 'zz',\n    handler : function (event) {\n        IPython.notebook.execute_all_cells();\n        return false;\n    }}\n);")

<IPython.core.display.Javascript object>