# Preprocessing Script 
In this script we calculate the feature vectors for all graphs that were previously created from
the projects' data in the GHTorrent database (see the script scrape_pr_graphs_from_ghtorrent.py).



In [1]:
import networkx as nx
import pickle
from os import listdir
import numpy as np
from scipy import stats

In [2]:
# load networks from pickle file
def pickleLoader(pklFile):
    """Yield every object that was pickled back-to-back into one file.

    Parameters
    ----------
    pklFile : binary file object
        File handle opened for reading in binary mode (``"rb"``). The
        handle is read from its current position until end of file.

    Yields
    ------
    object
        The unpickled objects (the graphs, for the files used in this
        script) in the order in which they were dumped.
    """
    # ``nx.read_gpickle`` merely forwarded an open handle to ``pickle.load``
    # and is no longer available in NetworkX >= 3.0, so call pickle directly.
    # NOTE(review): unpickling can execute arbitrary code - only feed this
    # function files that come from a trusted source.
    while True:
        try:
            graph = pickle.load(pklFile)
        except EOFError:
            # end of file reached: no more pickled objects
            return
        yield graph

In [None]:
# threshold below which a (neighbor) count is treated as zero
_eps = 10**(-10)


def _neighbor_mean(values, neighbors, degrees):
    """Average ``values`` over each node's neighbor list (0 if it has none)."""
    return [values[nbrs].sum()/deg if deg > _eps else 0
            for nbrs, deg in zip(neighbors, degrees)]


def get_features(A):
    """Feature grabber for DiNetSimile algorithm.

    The columns of the returned matrix are, in this order:

        1. In-degree of node
        2. Out-degree of node
        3. Clustering coefficient of node
        4. Average number of predecessors of successors of node
        5. Average number of successors of successors of node
        6. Average number of predecessors of predecessors of node
        7. Average number of successors of predecessors of node
        8. Average clustering coefficient of predecessors of node
        9. Average clustering coefficient of successors of node
        10. Number of edges in predecessor egonet of node
        11. Number of edges in successor egonet of node
        12. Reciprocity of in/out edges of node

    Self-loops are not counted in the degrees and neighbor lists. The
    clustering coefficients and egonet sizes are computed before the
    self-loops are removed for the reciprocity computation.

    Parameters
    ----------
    A : NumPy matrix
        Adjacency matrix of graph in question. Preferably a SciPy sparse matrix
        for large graphs.

    Returns
    -------
    feature_mat : NumPy array
        An n by 12 array of features, where n = A.shape[0]

    """
    # The matrix constructors were renamed in newer NetworkX releases; use the
    # names this script was written against and fall back to the new ones.
    sparse_ctor = (getattr(nx, "from_scipy_sparse_matrix", None)
                   or nx.from_scipy_sparse_array)
    try:
        G = sparse_ctor(A, create_using=nx.DiGraph())
    except AttributeError:
        # A is not a sparse matrix -> treat it as a dense NumPy matrix
        dense_ctor = (getattr(nx, "from_numpy_matrix", None)
                      or nx.from_numpy_array)
        G = dense_ctor(A, create_using=nx.DiGraph())
    n = len(G)
    nodes = range(n)

    # number of in/out neighbors (a node is never its own neighbor)
    neighbors_suc = [list(set(G.successors(i))-{i}) for i in nodes]
    neighbors_pre = [list(set(G.predecessors(i))-{i}) for i in nodes]
    d_vec_out = np.array([len(suc) for suc in neighbors_suc])
    d_vec_in = np.array([len(pre) for pre in neighbors_pre])

    # average in/out degree of successors (0 if node has no successors)
    neighbor_suc_deg_in = _neighbor_mean(d_vec_in, neighbors_suc, d_vec_out)
    neighbor_suc_deg_out = _neighbor_mean(d_vec_out, neighbors_suc, d_vec_out)

    # average in/out degree of predecessors (0 if node has no predecessors)
    neighbor_pre_deg_in = _neighbor_mean(d_vec_in, neighbors_pre, d_vec_in)
    neighbor_pre_deg_out = _neighbor_mean(d_vec_out, neighbors_pre, d_vec_in)

    # clustering coefficient for all nodes
    clust_vec = np.array(list(nx.clustering(G).values()))

    # average clustering coefficient for predecessors and successors
    neighbor_clust_pre = _neighbor_mean(clust_vec, neighbors_pre, d_vec_in)
    neighbor_clust_suc = _neighbor_mean(clust_vec, neighbors_suc, d_vec_out)

    # number of edges in the successor / predecessor egonet of every node;
    # the reversed graph is built once instead of once per node
    G_rev = G.reverse()
    ego_size_suc = [nx.ego_graph(G, i).size() for i in nodes]
    ego_size_pre = [nx.ego_graph(G_rev, i).size() for i in nodes]

    # reciprocity - self-loops are ignored and isolated nodes are given
    # reciprocity zero
    selfloop_edges = getattr(nx, "selfloop_edges", None)
    loops = selfloop_edges(G) if selfloop_edges is not None else G.selfloop_edges()
    # materialize the edges first: G must not change while it is iterated
    G.remove_edges_from(list(loops))
    recipro_dict = nx.algorithms.reciprocity(G, nodes=list(G))
    recipro = [0.0 if v is None else v for v in recipro_dict.values()]

    # use mat.T so that each node is a row (standard format)
    feature_mat = np.array([d_vec_in, d_vec_out, clust_vec, neighbor_suc_deg_in, neighbor_suc_deg_out,
                            neighbor_pre_deg_in, neighbor_pre_deg_out, neighbor_clust_pre, neighbor_clust_suc,
                            ego_size_pre, ego_size_suc, recipro]).T

    return feature_mat




def aggregate_features(feature_mat,row_var=False,as_matrix=False):
    """Summarize a feature matrix with five descriptive statistics.

    The statistics are, in this order: mean, median, standard deviation,
    skewness and kurtosis.

    Parameters
    ----------
    feature_mat : NumPy array
        Matrix to summarize. By default every row is one observation (a
        node, in the case of NetSimile) and every column one variable.

    row_var : Boolean, optional (default=False)
        If true, every row is treated as one variable and the statistics
        are taken along the rows instead of along the columns.

    as_matrix : Boolean, optional (default=False)
        If true, the result keeps one row per statistic. Otherwise the
        rows are concatenated into a single flat vector.

    Returns
    -------
    description : NumPy array
        The five statistics of feature_mat, stacked or flattened.
    """
    axis = int(row_var)

    # the mean comes from the array's own method, the others from NumPy/SciPy
    rows = [feature_mat.mean(axis=axis)]
    for reducer in (np.median, np.std, stats.skew, stats.kurtosis):
        rows.append(reducer(feature_mat, axis=axis))

    description = np.array(rows)
    if as_matrix:
        return description
    return description.flatten()

In [3]:
from os.path import join as path_join

#directory where graph data of all the projects lies.
directory="path/to/folder"
# directory to which the feature-vector pickles and the checklist of
# already preprocessed files are written
newdir="path/to/folder/featurevecs/"
allfiles=listdir(directory)
filesrefined=[f for f in allfiles if f.startswith("pickle_graphs_all_randSample_noFork_6months")]

# list of input files that were fully processed in an earlier run
checklist_path = path_join(newdir, "already_preprocessed_weighted_filelist.p")
try:
    with open(checklist_path, "rb") as checklist_file:
        already_preprocessed_filelist = pickle.load(checklist_file)
except FileNotFoundError:
    # first run: nothing has been preprocessed yet
    already_preprocessed_filelist = []
except (OSError, EOFError, pickle.UnpicklingError) as err:
    print("could not read the list of preprocessed files (%r), starting from scratch" % (err,))
    already_preprocessed_filelist = []

# output files are named feature_vecs_<bucket>.p, one per graph-size range
size_buckets = ("3to7", "8to20", "21to50", "over50")

for file in filesrefined:
    if file in already_preprocessed_filelist:
        continue
    print(file)

    # path_join works whether or not `directory` ends with a separator
    with open(path_join(directory, file), "rb") as f:
        networkl = list(pickleLoader(f))

    # aggregated feature vector of every graph, keyed by the graph's id
    feature_vecs = {}
    failed = 0
    last_error = None
    for net in networkl:
        try:
            feat = get_features(nx.adjacency_matrix(net))
            feature_vecs[net.graph["id"]] = aggregate_features(feat)
        except Exception as err:
            # best effort: a graph that cannot be processed is skipped,
            # but the failure is reported below instead of being hidden
            failed += 1
            last_error = err
    if failed:
        print("  %d of %d graphs skipped, last error: %r" % (failed, len(networkl), last_error))

    # sort the feature vectors into buckets by the number of nodes
    buckets = {name: {} for name in size_buckets}
    for net in networkl:
        try:
            key = net.graph["id"]
            vec = feature_vecs[key]
        except (AttributeError, KeyError, TypeError):
            # no feature vector was computed for this graph
            continue
        leng = len(net.nodes)
        if leng < 3:
            # graphs with fewer than three nodes are not stored
            continue
        if leng < 8:
            buckets["3to7"][key] = vec
        elif leng < 21:
            buckets["8to20"][key] = vec
        elif leng < 51:
            buckets["21to50"][key] = vec
        else:
            buckets["over50"][key] = vec

    # one dict per input file is appended to each bucket file
    for name in size_buckets:
        with open(path_join(newdir, "feature_vecs_" + name + ".p"), "ab") as out:
            pickle.dump(buckets[name], out)

    # save the checklist right away, so that an interrupted run does not
    # append the same file's vectors a second time after a restart
    already_preprocessed_filelist.append(file)
    with open(checklist_path, "wb") as out:
        pickle.dump(already_preprocessed_filelist, out)

    #free memory
    del networkl
    del feature_vecs
    del buckets

# written once more so that the checklist exists even if no file was processed
with open(checklist_path, "wb") as out:
    pickle.dump(already_preprocessed_filelist, out)
    

pickle_graphs_all_randSample_noFork_6months_chunk0.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk1.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk10.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk11.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk12.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk13.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk14.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk15.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk16.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk17.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk18.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk19.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk2.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk20.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk21.gpickle
pickle_graphs_all_randSample_noFork_6months_chunk22.gpickle
pickle_graphs_all_randSample_noFork_6months