In [619]:
import os
import pickle

import networkx as nx
import numpy as np
import pandas as pd

In [620]:
# Directory that holds the per-ego-network files (*.feat, *.featnames, *.egofeat)
# and the combined edge list facebook_combined.txt read further down.
ds_path = "./facebook"

In [621]:
def gender_extract(feat_arr):
    """
    Collapse a one-hot gender slice into a single integer class.

    feat_arr is expected to be a 1-D numpy array (the callers pass the two
    gender columns of a feature row).

    Returns 0 when no entry equals 1, otherwise the 1-based position of the
    first entry that equals 1.
    """
    set_positions = np.where(feat_arr == 1)[0]
    # Shift by one so that 0 stays reserved for "no flag set".
    return set_positions[0] + 1 if len(set_positions) else 0
    

In [622]:
def find_feature_numbers(featfilename, features):
    """
    Locate the column indices of the requested features in a .featnames file.

    featfilename : file name (relative to ds_path) of the .featnames file.
    features     : feature names to look for; a line is attributed to the
                   first name in this sequence that occurs in it as a substring.

    The first whitespace-separated token of every matching line is parsed as
    the integer column index.

    Returns (feat2num, n_lines) where feat2num is {feature: [idx1, idx2, ..]}
    and n_lines is the total number of lines in the file.
    """
    print("~~~Reading feat file : {}, for features : {}".format(featfilename,features))
    with open(os.path.join(ds_path, featfilename), "r") as handle:
        lines = handle.readlines()

    feat2num = {}
    for line in lines:
        matched = next((name for name in features if name in line), None)
        if matched is None:
            # Line describes a feature nobody asked for.
            continue
        feat2num.setdefault(matched, []).append(int(line.split()[0]))
    return feat2num, len(lines)
    

In [628]:
def find_idx(item,list_):
    """
    Return the position of the first occurrence of item in list_, or -1 when
    item is not present.

    Only the "not found" signal (ValueError raised by .index) is turned into
    -1. Anything else -- KeyboardInterrupt, or an error caused by passing an
    object without a working .index -- propagates instead of being silently
    reported as "not found".
    """
    try:
        return list_.index(item)
    except ValueError:
        return -1

def read_features(featfile,feat2num,feat_len):
    """
    Parse one ego network's .feat / .egofeat pair into per-node feature values.

    featfile : name of the "<ego>.feat" file inside ds_path; the integer in
               front of the first "." is used as the ego node id.
    feat2num : {feature_name: [column indices]} as returned by
               find_feature_numbers.
    feat_len : number of feature columns that follow the node id in every
               record of the .feat file.

    Returns {node_id: {feature_name: value}}. The ego node always gets an
    entry, filled from "<ego>.egofeat" (which holds feature columns only, no
    leading node id). A record for the same node id in the .feat file
    overrides the .egofeat values. "gender" is reduced to an integer class
    via gender_extract; every other feature keeps its raw column slice.

    Both files are read as flat whitespace-separated integers. A truncated
    final record raises IndexError as soon as a requested column is missing.
    """
    egonode = int(featfile.split(".")[0])
    featpath = os.path.join(ds_path, featfile)
    egofeatpath = os.path.join(ds_path, str(egonode)+".egofeat")

    # Per-feature post-processing. Resolving the extractor by name on every
    # lookup avoids reusing (or never binding) the extractor chosen for a
    # previously visited feature.
    extractors = {"gender": gender_extract}

    def extract(feat_name, values):
        extractor = extractors.get(feat_name)
        return values if extractor is None else extractor(values)

    with open(egofeatpath, "r") as fego:
        content_ego = np.array([int(item) for item in fego.read().split()], dtype=int)
    with open(featpath, "r") as f:
        content = np.array([int(item) for item in f.read().split()], dtype=int)

    # Seed the ego node first so it is present even when the .feat file is empty.
    nodedict = {egonode: dict()}
    for feat_name, feat_idxs in feat2num.items():
        nodedict[egonode][feat_name] = extract(feat_name, content_ego[feat_idxs])

    # Each record is laid out as: node id followed by feat_len feature columns.
    record_len = feat_len + 1
    for start in range(0, len(content), record_len):
        node = content[start]
        features = content[start+1:start+record_len]
        node_feats = nodedict.setdefault(node, dict())
        for feat_name, feat_idxs in feat2num.items():
            node_feats[feat_name] = extract(feat_name, features[feat_idxs])
    return nodedict
              
              

In [629]:
def read_feat_file(featfile, featfilename, features):
    """
    Read the requested features for one ego network.

    featfile     : name of the .feat file.
    featfilename : name of the matching .featnames file.
    features     : feature names to extract.

    Returns the {node_id: {feature: value}} dict built by read_features.
    """
    # The .featnames file tells us which columns to pick and how wide a record is.
    column_map, n_columns = find_feature_numbers(featfilename, features)
    return read_features(featfile, column_map, n_columns)

In [630]:
def read_feature_files(feat_extract=["gender"]):
    """
    Collect the requested features from every ego network under ds_path.

    feat_extract : list of feature names to extract.

    Every "<ego>.feat" file in ds_path is paired with "<ego>.featnames" and
    parsed; the per-file results are merged into one dict, so a node that
    appears in several files keeps the values from the file processed last.

    Returns {node_id: {feature: feature_value}}.
    """
    merged = dict()
    for entry in os.listdir(ds_path):
        if not entry.endswith(".feat"):
            continue
        names_file = entry.split(".")[0] + ".featnames"
        merged.update(read_feat_file(entry, names_file, feat_extract))
    return merged
        

In [631]:
# Parse every ego network under ds_path into {node_id: {"gender": class}}.
node_dict = read_feature_files(feat_extract=["gender"])

~~~Reading feat file : 3980.featnames, for features : ['gender']
~~~Reading feat file : 414.featnames, for features : ['gender']
~~~Reading feat file : 1912.featnames, for features : ['gender']
~~~Reading feat file : 107.featnames, for features : ['gender']
~~~Reading feat file : 1684.featnames, for features : ['gender']
~~~Reading feat file : 686.featnames, for features : ['gender']
~~~Reading feat file : 3437.featnames, for features : ['gender']
~~~Reading feat file : 0.featnames, for features : ['gender']
~~~Reading feat file : 348.featnames, for features : ['gender']
~~~Reading feat file : 698.featnames, for features : ['gender']


In [632]:
# Inspect the parsed mapping: gender 0 means neither gender column was set,
# 1 / 2 is the 1-based position of the first column that was set.
node_dict

{3980: {'gender': 2},
 3981: {'gender': 1},
 3982: {'gender': 2},
 3983: {'gender': 2},
 3984: {'gender': 2},
 3985: {'gender': 2},
 3986: {'gender': 2},
 3987: {'gender': 1},
 3988: {'gender': 2},
 3989: {'gender': 2},
 3990: {'gender': 1},
 3991: {'gender': 2},
 3992: {'gender': 1},
 3993: {'gender': 2},
 3994: {'gender': 1},
 3995: {'gender': 2},
 594: {'gender': 2},
 3996: {'gender': 2},
 3997: {'gender': 2},
 3998: {'gender': 2},
 3999: {'gender': 2},
 4000: {'gender': 2},
 4001: {'gender': 0},
 4002: {'gender': 2},
 4003: {'gender': 2},
 4004: {'gender': 2},
 4005: {'gender': 2},
 4006: {'gender': 2},
 4007: {'gender': 2},
 4008: {'gender': 2},
 4009: {'gender': 2},
 4010: {'gender': 2},
 4011: {'gender': 2},
 4012: {'gender': 1},
 4013: {'gender': 1},
 4014: {'gender': 2},
 4015: {'gender': 2},
 4016: {'gender': 2},
 4017: {'gender': 2},
 4018: {'gender': 1},
 4019: {'gender': 2},
 4020: {'gender': 0},
 4021: {'gender': 2},
 4022: {'gender': 1},
 4023: {'gender': 2},
 4024: {'ge

In [633]:
# read edges

In [634]:
def get_edges():
    """
    Load the combined edge list from ds_path.

    Reads the space-separated file "facebook_combined.txt" and returns a
    DataFrame with the columns "start_node" and "end_node".
    """
    edge_file = os.path.join(ds_path, "facebook_combined.txt")
    return pd.read_csv(edge_file, sep=" ", names=["start_node", "end_node"])

In [635]:
# Edge list as a DataFrame, plus (node_id, attribute-dict) pairs taken from node_dict.
edges = get_edges()
nodes = list(node_dict.items())

In [636]:
# Build the graph from the edge list, then merge in the (node_id, attribute-dict)
# pairs so the parsed features are stored on the nodes.
g = nx.from_pandas_edgelist(edges, "start_node", "end_node")
g.add_nodes_from(nodes)

In [637]:
# {node_id: gender class} for every node that carries a "gender" attribute.
node_attr = nx.get_node_attributes(g,"gender")

In [638]:
# Count edges per "<gender of u>-><gender of v>" label, in the endpoint order
# that g.edges() yields. A node without a "gender" attribute raises KeyError here.
edge_dict = {}
for u, v in g.edges():
    label = "{}->{}".format(node_attr[u],node_attr[v])
    edge_dict[label] = edge_dict.get(label, 0) + 1

In [639]:
# Show the edge counts per "<gender>-><gender>" label.
edge_dict

{'2->1': 20954,
 '2->2': 32502,
 '2->0': 768,
 '1->1': 15584,
 '1->2': 16442,
 '1->0': 437,
 '0->1': 709,
 '0->2': 795,
 '0->0': 43}

In [640]:
# Persist the attributed graph. nx.write_gpickle was removed in NetworkX 3.0,
# so the graph is pickled directly; this runs on both old and new NetworkX
# versions and can be loaded back with pickle.load.
with open('fb_gender.gpickle', 'wb') as gpickle_file:
    pickle.dump(g, gpickle_file, pickle.HIGHEST_PROTOCOL)