In [1]:
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns

# Preprocessing

In [2]:
# Input: comma-separated edge list that is read into df_edge.
data_path = 'Datasets/email.csv'
# Output: file the final graph is pickled to at the end of the notebook.
graph_path = 'nxGraph/graph_email'

In [3]:
# Load the edge list. Because `names` is supplied, the first line of the file is
# read as data rather than as a header row.
df_edge = pd.read_csv(
    data_path,
    sep=',',
    names=['source', 'target'],
)

In [4]:
# Preview the first rows of the raw edge list.
df_edge.head()

Unnamed: 0,source,target
0,0,0
1,0,1
2,0,101
3,0,103
4,0,146


In [5]:
# Remove self-loops: keep only the rows whose source and target differ.
df_edge = df_edge.loc[df_edge['source'].ne(df_edge['target'])]

In [6]:
def df2graph(df_edge, directed=False):
    """Build a networkx graph from an edge-list DataFrame.

    Parameters
    ----------
    df_edge : pandas.DataFrame
        Edge list with a 'source' and a 'target' column; each row is one edge.
    directed : bool, default False
        If truthy, return an ``nx.DiGraph`` whose edges run source -> target;
        otherwise return an undirected ``nx.Graph``.

    Returns
    -------
    nx.Graph or nx.DiGraph
        Graph holding the edges listed in ``df_edge``. Nodes are created
        implicitly from the edge endpoints.
    """
    # .tolist() yields plain Python scalars, so node ids are not numpy types.
    sources = df_edge['source'].tolist()
    targets = df_edge['target'].tolist()

    # Pick the graph class once; adding the edges is identical for both kinds.
    G = nx.DiGraph() if directed else nx.Graph()
    G.add_edges_from(zip(sources, targets))
    return G

In [7]:
# Directed graph of the edge list (source -> target).
di_graph = df2graph(df_edge, directed=True)

In [8]:
# PageRank score per node; the (node, score) pairs seed the per-node table.
pr = nx.pagerank(di_graph)
df_node = pd.DataFrame(list(pr.items()), columns=['id', 'pagerank'])

In [9]:
# Node ids in the same order as the rows of df_node.
nodes = df_node['id'].tolist()

In [10]:
# In- and out-degree of every node, in the same order as `nodes`. The degree
# lookups count neighbours directly instead of building a list per node.
in_deg = [di_graph.in_degree(node) for node in nodes]
out_deg = [di_graph.out_degree(node) for node in nodes]

In [11]:
# Attach the degree lists as columns. They are positionally aligned with
# df_node's rows (both were derived from `nodes`), so assign them directly:
# wrapping them in a DataFrame would align on the index instead and silently
# produce NaN / truncation if the lengths ever differed.
df_node['in_deg'] = in_deg
df_node['out_deg'] = out_deg

Unnamed: 0,id,pagerank,in_deg,out_deg


In [12]:
# Preview the per-node table (id, pagerank, in_deg, out_deg).
df_node.head()

Unnamed: 0,id,pagerank,in_deg,out_deg
0,0,0.001389,31,40
1,1,0.001671,50,0
2,101,0.000948,24,20
3,103,0.00162,42,32
4,146,0.000595,13,26


In [24]:
# Show the nodes with more than 100 outgoing edges.
df_node.query('out_deg > 100')

Unnamed: 0,id,pagerank,in_deg,out_deg
6,166,0.004097,126,124
7,17,0.002107,60,105
22,283,0.003647,119,117
28,377,0.003775,87,130
32,5,0.005141,123,155
35,6,0.003233,92,108
36,64,0.004695,135,112
42,105,0.003767,114,118
44,121,0.005252,156,221
45,13,0.002408,60,171


In [13]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    Accepts a scalar or anything NumPy can treat as an array and applies the
    function element-wise.

    The value is computed as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp``.
    This is algebraically the same expression, but it does not overflow (and
    emit a RuntimeWarning) for large negative ``x`` the way a direct
    ``np.exp(-x)`` does.
    """
    return np.exp(-np.logaddexp(0, np.negative(x)))

In [14]:
# Assign a weight to every edge.
def weighted_graph(graph, df_node, alpha=0.7):
    """Return a new directed graph with a 'weight' on every edge of ``graph``.

    For an edge ``source -> target`` the weight is::

        min(alpha, ln(out_deg[source]) / ln(in_deg[target] * out_deg[source]))

    Parameters
    ----------
    graph : networkx graph
        Graph whose ``edges`` are copied; each edge is read as (source, target).
    df_node : pandas.DataFrame
        Per-node table with columns 'id', 'in_deg' and 'out_deg'. Every edge
        endpoint must appear in 'id'.
    alpha : float, default 0.7
        Upper bound applied to every weight.

    Returns
    -------
    nx.DiGraph
        New graph containing the edges of ``graph``, each with a 'weight'.

    Raises
    ------
    KeyError
        If an edge endpoint is missing from ``df_node['id']``.
    """
    # Build the degree lookups once. Filtering the DataFrame for every edge
    # scans a whole column per edge, and calling int() on the resulting
    # one-element Series is deprecated in recent pandas.
    out_degree = dict(zip(df_node['id'], df_node['out_deg']))
    in_degree = dict(zip(df_node['id'], df_node['in_deg']))

    result = nx.DiGraph()
    for source, target in graph.edges:
        source_out = int(out_degree[source])
        target_in = int(in_degree[target])
        denominator = np.log(target_in * source_out)
        if denominator == 0:
            # The degree product is 1 (both degrees are 1), so the ratio is
            # 0/0. Use the cap directly rather than computing NaN, which also
            # triggers a RuntimeWarning.
            weight = alpha
        else:
            weight = min(alpha, np.log(source_out) / denominator)
        result.add_edge(source, target, weight=weight)
    return result

In [15]:
# Weighted copy of the directed graph, using the default cap alpha.
w_graph = weighted_graph(di_graph, df_node=df_node)

  weight = min(alpha, np.log(source_out)/np.log(target_in*source_out))


In [16]:
# Dense weight matrix of w_graph: entry [i, j] holds the 'weight' of the edge
# from the i-th to the j-th node (in w_graph's node order) and 0 where there is
# no edge.
# NOTE(review): float16 keeps only about three significant decimal digits of
# each weight -- confirm this loss of precision is acceptable.
prob_matrix = nx.adjacency_matrix(w_graph).todense().astype(np.float16)

In [17]:
# Rebuild a directed graph from the matrix. Nodes become the row/column indices
# 0..n-1, so the original node ids are not carried over, and only non-zero
# entries become edges, so an edge whose weight is 0 is dropped.
# NOTE(review): keyword arguments to nx.DiGraph are stored as graph-level
# attributes, so `weight=prob_matrix` ends up in di_graph.graph['weight'] and is
# pickled together with the graph; the per-edge weights already come from the
# matrix entries. Confirm the graph-level copy is intended.
di_graph = nx.DiGraph(prob_matrix, weight=prob_matrix)

In [18]:
# Persist the rebuilt graph; the target directory must already exist.
with open(graph_path, mode='wb') as graph_file:
    pickle.dump(di_graph, graph_file)