In [2]:
import numpy as np 
import networkx as nx 
import os 
from itertools import chain
import keras 

In [9]:
def parse_arff(arff_file, is_GO=False, is_test=False, return_adjacency=False):
    """Parse a hierarchical multi-label ARFF file into feature and label arrays.

    The ``@ATTRIBUTE class hierarchical ...`` line defines the class hierarchy,
    every other ``@ATTRIBUTE`` line defines one feature, and every line after
    ``@DATA`` is one sample whose last used column holds its labels separated
    by ``@``.

    Parameters
    ----------
    arff_file : str
        Path of the ARFF file to read.
    is_GO : bool, default False
        If True, each hierarchy branch is read as ``parent/child`` and nodes are
        ordered by their shortest-path distance to ``'root'``.  If False, each
        branch is a ``/``-separated path, node names are the dot-joined path
        prefixes, and nodes are ordered by the number of path components.
    is_test : bool, default False
        Unused by this function; kept so existing callers keep working.
    return_adjacency : bool, default False
        If True, additionally return the dense adjacency matrix of ``g`` with
        rows/columns ordered like ``nodes``.

    Returns
    -------
    X : numpy.ndarray
        One row per sample.  Numeric features contribute one column (``nan``
        for ``'?'``); categorical features contribute a one-hot vector
        (all zeros for a value that is not a declared category).
    Y : numpy.ndarray
        One row per sample, one column per entry of ``nodes``; 1 for each label
        of the sample and for every node reachable from it along the edges of
        ``g``.
    nodes : list of str
        Hierarchy node names in the column order of ``Y``.
    g : networkx.DiGraph
        Hierarchy graph with edges pointing from child to parent.
    A : numpy.ndarray
        Only when ``return_adjacency`` is True.

    Raises
    ------
    ValueError
        If a data row is encountered before the hierarchical class attribute.
    """
    X = []
    Y = []
    g = nx.DiGraph()
    nodes = None
    nodes_idx = None
    g_t = None
    feature_types = []  # one converter per feature: (raw value, feature index) -> list of floats
    d = []              # per-feature category -> one-hot lookup (empty list for numeric features)
    cats_lens = []      # per-feature number of output columns
    read_data = False

    with open(arff_file) as f:
        for l in f:
            if l.startswith('@ATTRIBUTE'):
                if l.startswith('@ATTRIBUTE class'):
                    h = l.split('hierarchical')[1].strip()
                    for branch in h.split(','):
                        terms = branch.split('/')
                        if is_GO:
                            g.add_edge(terms[1], terms[0])
                        elif len(terms) == 1:
                            g.add_edge(terms[0], 'root')
                        else:
                            # Link every path prefix to the prefix one level above it.
                            for i in range(2, len(terms) + 1):
                                g.add_edge('.'.join(terms[:i]), '.'.join(terms[:i - 1]))
                    if is_GO:
                        sort_key = lambda x: (nx.shortest_path_length(g, x, 'root'), x)
                    else:
                        sort_key = lambda x: (len(x.split('.')), x)
                    nodes = sorted(g.nodes(), key=sort_key)
                    nodes_idx = dict(zip(nodes, range(len(nodes))))
                    # Reversed copy so that nx.ancestors(g_t, t) follows g's edges starting at t.
                    g_t = g.reverse()
                else:
                    _, _, f_type = l.split()

                    # ARFF type names are case-insensitive; real/integer are numeric too.
                    if f_type.lower() in ('numeric', 'real', 'integer'):
                        d.append([])
                        cats_lens.append(1)
                        feature_types.append(lambda x, i: [float(x)] if x != '?' else [np.nan])
                    else:
                        cats = f_type[1:-1].split(',')
                        cats_lens.append(len(cats))
                        d.append({key: keras.utils.to_categorical(i, len(cats)).tolist()
                                  for i, key in enumerate(cats)})
                        feature_types.append(lambda x, i: d[i].get(x, [0.0] * cats_lens[i]))
            elif l.startswith('@DATA'):
                read_data = True
            elif read_data:
                row = l.split('%')[0].strip()
                if not row:
                    # Blank or comment-only line: nothing to parse.
                    continue
                if nodes is None:
                    raise ValueError(
                        'data row found before the hierarchical class attribute in %r' % (arff_file,))
                d_line = row.split(',')
                lab = d_line[len(feature_types)].strip()

                X.append(list(chain(*[feature_types[i](x, i)
                                      for i, x in enumerate(d_line[:len(feature_types)])])))

                y_ = np.zeros(len(nodes))
                for t in lab.split('@'):
                    term = t.replace('/', '.')
                    y_[[nodes_idx.get(a) for a in nx.ancestors(g_t, term)]] = 1
                    y_[nodes_idx[term]] = 1
                Y.append(y_)

    X = np.array(X)
    Y = np.stack(Y)

    if return_adjacency:
        return X, Y, nodes, g, nx.to_numpy_array(g, nodelist=nodes)
    return X, Y, nodes, g


In [11]:
# Root of the HMC datasets. Override with the HMC_DATA_DIR environment variable
# so the notebook is not tied to one machine's directory layout.
DATA_DIR = os.environ.get('HMC_DATA_DIR', 'e:/codes/C-HMCNN/HMC_data')
file_name = os.path.join(DATA_DIR, 'datasets_FUN', 'seq_FUN', 'seq_FUN.train.arff')
X, Y, nodes, g = parse_arff(file_name)

In [15]:
X.shape

(1701, 529)

In [14]:
Y.shape

(1701, 500)

In [28]:
nodes

['01',
 '02',
 '10',
 '11',
 '12',
 '14',
 '16',
 '18',
 '20',
 '30',
 '32',
 '34',
 '38',
 '40',
 '41',
 '42',
 '43',
 '99',
 'root',
 '01.01',
 '01.02',
 '01.03',
 '01.04',
 '01.05',
 '01.06',
 '01.07',
 '01.20',
 '01.25',
 '02.01',
 '02.04',
 '02.07',
 '02.08',
 '02.10',
 '02.11',
 '02.13',
 '02.16',
 '02.19',
 '02.25',
 '02.45',
 '10.01',
 '10.03',
 '11.02',
 '11.04',
 '11.06',
 '12.01',
 '12.04',
 '12.07',
 '12.10',
 '14.01',
 '14.04',
 '14.07',
 '14.10',
 '14.13',
 '16.01',
 '16.02',
 '16.03',
 '16.06',
 '16.07',
 '16.09',
 '16.11',
 '16.13',
 '16.17',
 '16.19',
 '16.21',
 '18.01',
 '18.02',
 '20.01',
 '20.03',
 '20.09',
 '30.01',
 '30.05',
 '32.01',
 '32.05',
 '32.07',
 '32.10',
 '34.01',
 '34.07',
 '34.11',
 '38.07',
 '40.01',
 '40.10',
 '40.20',
 '41.01',
 '42.01',
 '42.02',
 '42.03',
 '42.04',
 '42.05',
 '42.07',
 '42.08',
 '42.09',
 '42.10',
 '42.16',
 '42.19',
 '42.22',
 '42.25',
 '42.27',
 '42.29',
 '43.01',
 '01.01.03',
 '01.01.05',
 '01.01.06',
 '01.01.09',
 '01.01.11',


In [17]:
g 

<networkx.classes.digraph.DiGraph at 0x18964434508>

In [32]:
# Build the hierarchy adjacency matrix twice (default memory order vs. explicit
# C order) and count how many entries agree.
x2 = np.array(nx.to_numpy_matrix(g, nodelist=nodes))
x1 = np.array(nx.to_numpy_matrix(g, order='C', nodelist=nodes))
(x1 == x2).sum()

250000

In [65]:
# Positions and names of every node whose name contains '01.01'.
idx = [(position, label) for position, label in enumerate(nodes) if '01.01' in label]
idx

[(19, '01.01'),
 (99, '01.01.03'),
 (100, '01.01.05'),
 (101, '01.01.06'),
 (102, '01.01.09'),
 (103, '01.01.11'),
 (104, '01.01.13'),
 (141, '02.01.01'),
 (174, '12.01.01'),
 (205, '18.01.01'),
 (211, '20.01.01'),
 (242, '32.01.01'),
 (258, '34.01.01'),
 (269, '41.01.01'),
 (277, '01.01.03.01'),
 (278, '01.01.03.02'),
 (279, '01.01.03.03'),
 (280, '01.01.03.05'),
 (281, '01.01.05.01'),
 (282, '01.01.05.03'),
 (283, '01.01.06.01'),
 (284, '01.01.06.02'),
 (285, '01.01.06.04'),
 (286, '01.01.06.05'),
 (287, '01.01.06.06'),
 (288, '01.01.09.01'),
 (289, '01.01.09.02'),
 (290, '01.01.09.03'),
 (291, '01.01.09.04'),
 (292, '01.01.09.05'),
 (293, '01.01.09.06'),
 (294, '01.01.09.07'),
 (295, '01.01.11.01'),
 (296, '01.01.11.02'),
 (297, '01.01.11.03'),
 (298, '01.01.11.04'),
 (305, '01.03.01.01'),
 (329, '01.20.01.01'),
 (345, '10.03.01.01'),
 (365, '14.13.01.01'),
 (367, '18.02.01.01'),
 (369, '20.01.01.01'),
 (370, '20.01.01.07'),
 (373, '20.03.01.01'),
 (395, '32.05.01.01'),
 (403, '34.0

In [67]:
# Column 19 of the adjacency matrix for rows 99..105.
x1[99:106, 19]

array([1., 1., 1., 1., 1., 1., 0.])