# Microsoft Academic Graph

## Preprocess MAG

### Libraries

In [1]:
## Basic
import numpy as np
from collections import defaultdict
import operator
import pandas as pd 
import random
import time


## Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Network Processing

## networkx
import networkx as nx
from networkx.generators import random_graphs
from networkx.generators import social
from networkx.generators import classic


## smallworld
from smallworld.draw import draw_network
from smallworld import get_smallworld_graph

## snap
color_map =["grey"]

## graph sampling
from sampling import ForestFire, Metropolis_Hastings, Random_Walk, Snowball, Ties, Base_Samplers

## Preprocess Files

In [2]:
from itertools import combinations

## Parse the raw MAG dump into co-authorship links (L) and per-author
## citation totals (A).
##  - a line containing "#@" holds the comma-separated author list of a record
##  - a line containing "#citation" holds a count that is credited to every
##    author of the most recently read "#@" line
L = list()
A = defaultdict(int) # default value of int is 0

n_list = list()

## The context manager closes the file even when a malformed line raises.
## Encoding is explicit so parsing does not depend on the platform default;
## assumes the dump is UTF-8 (author names contain non-ASCII characters).
with open('data/mag/mag_raw.txt', 'r', encoding='utf-8') as f:
    for line in f:

        if "#@" in line:
            n_list = line.replace("\n", "").split("#@")[1].split(",")

            ## edges
            if len(n_list) > 1:
                ## one link per unordered pair of co-authors, in list order
                L.extend(combinations(n_list, 2))
            else:
                ## a single author is recorded as a self-link
                L.append((n_list[0], n_list[0]))

        if "#citation" in line:
            ## abs() because the count is stored as a non-negative total
            c = abs(int(line.split("#citation")[1]))

            for n in n_list:
                A[n] += c

## pre-process and sort
L = set(L)  # drop duplicate links

## author names ordered by descending citation total
A_sorted = sorted(A.items(), key=operator.itemgetter(1), reverse=True)
A_sorted = [n_a[0] for n_a in A_sorted]

In [3]:
## Sanity checks on the parsed data
print(len(L))                   # number of distinct links
print(list(A)[:30])             # first 30 author names, in insertion order
print(A["Leslie Lamport"])      # citation total of one known author
print(A_sorted[2])              # author with the third-highest total
print(len(A_sorted))            # number of authors

5366282
['José A. Blakeley', 'Yuri Breitbart', 'Hector Garcia-Molina', 'Abraham Silberschatz', 'Tom C. Reyes', 'Stavros Christodoulakis', 'Leonidas Koveos', 'Umeshwar Dayal', 'Eric N. Hanson', 'Jennifer Widom', 'Angelika Kotz Dittrich', 'Klaus R. Dittrich', 'Meichun Hsu', 'Nathan Goodman', 'Gail E. Kaiser', 'William Kelley', 'Sunit K. Gala', 'Won Kim', 'Bruce Graham', 'Alfons Kemper', 'Guido Moerkotte', 'Injun Choi', 'Mark Scheevel', 'Jorge F. Garza', 'Vincent J. Kowalski', 'David Krieger', 'Tim Andrews', 'Teresa F. Lunt', 'Weiyi Meng', 'Clement T. Yu']
38608
Anil K. Jain
1274360


### Numberize

In [4]:
## Give every author an integer id equal to its position in A_sorted and
## keep lookup tables for both directions.
Num = list(range(len(A_sorted)))
num_author = dict(enumerate(A_sorted))
author_num = {author: idx for idx, author in enumerate(A_sorted)}

In [5]:
## Links
L_num = list()
Source_num = list()
Target_num = list()

for l in L:
    L_num.append((author_num[l[0]], author_num[l[1]]))
    Source_num.append(author_num[l[0]])
    Target_num.append(author_num[l[1]])

In [6]:
## Citations
C_num = list()

for num in Num:
    c = A[num_author[num]]
    C_num.append(c)
    
## Names
Names_num = list()

for num in Num:
    name = num_author[num]
    Names_num.append(name)

### Save as CSV

In [7]:
## Assemble the node table (id, author name, citation total) and the edge
## table (source id, target id) from the parallel lists.
nodes = pd.DataFrame(
    {'Id': Num,
     'name': Names_num,
     'citations': C_num,
    })

edges = pd.DataFrame(
    {'Source': Source_num,
     'Target': Target_num,
    })

## Write both tables without a header row and without the row index.
## index=False is the documented boolean for omitting the index.
nodes.to_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/nodes_py.csv", index=False, header=False)
edges.to_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/edges_py.csv", index=False, header=False)

In [9]:
## Number of rows (authors) per distinct citation total
nodes.groupby('citations').count()

Unnamed: 0_level_0,Id,name
citations,Unnamed: 1_level_1,Unnamed: 2_level_1
0,178387,178387
1,291530,291530
2,89476,89476
3,56353,56353
4,43112,43112
5,34639,34639
6,29704,29704
7,25352,25352
8,22391,22391
9,20071,20071


## Sample Subset Fix

In [417]:
## Load the edge and node tables. read_csv is called with its default header
## handling, so the first row of each file is taken as column names.
## NOTE(review): these are edges.csv / nodes.csv, not the header-less
## edges_py.csv / nodes_py.csv written earlier -- confirm they have a header row.
df_edges = pd.read_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/edges.csv")
df_nodes  = pd.read_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/nodes.csv")

## raw numpy arrays of the two tables
edges = df_edges.values
nodes = df_nodes.values

#df_edges.columns.values
#df_nodes.columns.values

In [418]:
## Convert both 2-D arrays into plain nested Python lists (one list per row)
edges = edges.tolist()
nodes = nodes.tolist()

In [419]:
## keep only the first column of every node row
nodes = [row[0] for row in nodes]

In [420]:
## Draw 200 node ids uniformly at random and keep the edges whose two
## endpoints are both in the draw.
Sample_nodes = random.sample(nodes, 200)

## Membership is tested once per edge, so use a set (O(1) lookup) rather than
## scanning the 200-element list for every edge of the full edge table.
sample_lookup = set(Sample_nodes)

Sample_edges = [
    [n1, n2]
    for n1, n2 in edges
    if n1 in sample_lookup and n2 in sample_lookup
]

KeyboardInterrupt: 

In [None]:
#Sample_edges = random.sample(nodes, 200)
#Sample_nodes = list()

#Sample_edges_flattened = [val for sublist in Sample_edges for val in sublist]

#for num, name, c in nodes:
#    if num in Sample_edges_flattened:
        
#        Sample_nodes.append(num)
    

## Save Samples

In [88]:
## Citations
C_num = list()

for num in Sample_nodes:
    c = A[num_author[num]]
    C_num.append(c)
    
## Names
Names_num = list()

for num in Sample_nodes:
    name = num_author[num]
    Names_num.append(name)
    
    
## Links
L_num = list()
Source_num = list()
Target_num = list()

for l in Sample_edges:
    L_num.append((l[0],l[1]))
    Source_num.append(l[0])
    Target_num.append(l[1])

In [415]:
## Assemble the sampled node table (id, author name, citation total) and the
## sampled edge table (source id, target id). All lists feeding one table
## must have the same length, otherwise pandas raises ValueError.
nodes = pd.DataFrame(
    {'Id': Sample_nodes,
     'name': Names_num,
     'citations': C_num,
    })

edges = pd.DataFrame(
    {'Source': Source_num,
     'Target': Target_num,
    })

## Write both tables with a header row and without the row index.
## index=False is the documented boolean for omitting the index.
nodes.to_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/sample_nodes.csv", index=False, header=True)
edges.to_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/sample_edges.csv", index=False, header=True)

ValueError: arrays must all be same length

## Sample with Biased Random Walk

In [421]:
## Build the full graph from the header-less edge list of integer ids
g_complete = nx.read_edgelist("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/edges_py.csv",  nodetype=int, delimiter = ",")
#a_complete = nx.adjacency_matrix(g_complete)

## Node rows from the header-less node table; first column is the node id
df_nodes  = pd.read_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/nodes_py.csv", header = None)
nodes = df_nodes.values.tolist()
nodes_num = [row[0] for row in nodes]

## Make sure every id from the node table exists in the full graph. The nodes
## belong in g_complete, not in g, which is only assigned by a later sampling
## cell (undefined on a fresh kernel, a stale sample otherwise).
g_complete.add_nodes_from(nodes_num)

In [422]:
## Size check: ids read from the node table vs. edges in the full graph
print(len(nodes_num))
print(len(g_complete.edges))

1274360
4939240


In [423]:
## Map node id -> third column of its node row
node_attr_dict = {node_id: citations for node_id, _name, citations in nodes}

print(node_attr_dict[0])

## Store the mapped values on the graph as the "citations" node attribute
nx.set_node_attributes(g_complete, node_attr_dict, "citations")

120649


In [424]:
## Node count of the full graph and a peek at its first ten node ids
print(len(g_complete))
print(list(g_complete)[:10])

1274360
[107589, 1115655, 848931, 848932, 33910, 53000, 43667, 454, 1148682, 48812]


In [501]:
## Sampling configuration passed to get_graph
sampleArgs = {
    "sample": "biased_random_walk",
    "jump_bias": "random_walk_induced_graph_sampling",
    "n": 1000,
    "p": 20.0,
    "q": 100.0,
    "source_starts": 2,
    "source_returns": 4,
    "depth": 2,
}

##exact_n: forestfire, random_walk_induced_graph_sampling, random_walk_sampling_with_fly_back, adjacency, select
##approx_n: snowball, bfs, walk, jump

def get_graph(sampleArgs,g_complete,a_complete):
    """Sample a sub-graph of ``g_complete`` with the sampler named in ``sampleArgs``.

    Args:
        sampleArgs: dict whose ``"sample"`` entry selects the sampler. The
            remaining entries (``"n"``, ``"p"``, ``"q"``, ``"source_starts"``,
            ``"source_returns"``, ``"depth"``, ``"jump_bias"``) are read only
            by the branches that need them.
        g_complete: full graph, passed as first argument to the sampler class.
        a_complete: passed as second argument to the sampler class
            (presumably the adjacency matrix of ``g_complete`` -- confirm
            against the ``sampling`` package).

    Returns:
        Whatever the selected sampler method returns.

    Raises:
        ValueError: if ``sampleArgs["sample"]`` is not a supported name.
            An unknown name is rejected explicitly instead of failing later
            on an unassigned result variable.
    """
    method = sampleArgs["sample"]

    if method == "biased_random_walk":
        sampler = Base_Samplers.Base_Samplers(g_complete,a_complete)
        g = sampler.biased_random_walk(sampleArgs["n"], sampleArgs["p"], sampleArgs["q"])

    elif method == "forestfire":
        sampler = ForestFire.ForestFire(g_complete,a_complete)
        g = sampler.forestfire(sampleArgs["n"])

    elif method == "snowball":
        sampler = Snowball.Snowball(g_complete,a_complete)
        g = sampler.snowball(sampleArgs["source_starts"], sampleArgs["source_returns"])

    elif method == "random_walk_induced_graph_sampling":
        sampler = Random_Walk.Random_Walk(g_complete,a_complete)
        g = sampler.random_walk_induced_graph_sampling(sampleArgs["n"])

    elif method == "random_walk_sampling_with_fly_back":
        sampler = Random_Walk.Random_Walk(g_complete,a_complete)
        g = sampler.random_walk_sampling_with_fly_back(sampleArgs["n"], sampleArgs["p"])

    elif method == "standard_bfs":
        sampler = Base_Samplers.Base_Samplers(g_complete,a_complete)
        g = sampler.standard_bfs(sampleArgs["source_starts"], sampleArgs["depth"])

    elif method == "bfs":
        sampler = Base_Samplers.Base_Samplers(g_complete,a_complete)
        g = sampler.bfs(sampleArgs["n"])

    elif method == "walk":
        sampler = Base_Samplers.Base_Samplers(g_complete,a_complete)
        g = sampler.walk(sampleArgs["source_starts"], sampleArgs["source_returns"], sampleArgs["p"])

    elif method == "jump":
        sampler = Base_Samplers.Base_Samplers(g_complete,a_complete)
        g = sampler.jump(sampleArgs["source_starts"], sampleArgs["p"], sampleArgs["jump_bias"])

    elif method == "adjacency":
        sampler = Base_Samplers.Base_Samplers(g_complete,a_complete)
        g = sampler.adjacency(sampleArgs["n"])

    elif method == "select":
        sampler = Base_Samplers.Base_Samplers(g_complete,a_complete)
        # NOTE(review): "select" dispatches to adjacency(), same as the branch
        # above -- looks like a copy-paste; confirm whether Base_Samplers has
        # a dedicated select method before changing this.
        g = sampler.adjacency(sampleArgs["n"])

    else:
        raise ValueError("unknown sampler %r in sampleArgs['sample']" % (method,))

    return g

## Run one sampling pass, report the sample size and the wall-clock time.
## NOTE(review): a_complete is not assigned in any visible cell (its
## assignment is commented out) -- confirm it exists before running.
start_time = time.time()
g = get_graph(sampleArgs, g_complete, a_complete)
sample_size = len(g)

print("-- n_max should be >=", sample_size, "--")
elapsed = round(time.time() - start_time, 5)
print("-- function get_graph takes %s secs --" % elapsed)

## only samples of at most 200 nodes are drawn
if sample_size <= 200:
    nx.draw(g, node_color = color_map, with_labels = False)

-- n_max should be >= 597 --
-- function get_graph takes 0.2005 secs --


In [495]:
## Materialise the sampled graph's nodes and edges as plain lists
Sample_nodes = [*g.nodes()]
Sample_edges = [*g.edges()]

print(len(Sample_edges))

1016


In [502]:
## Citations
C_num = list()

for num in Sample_nodes:
    c = A[num_author[num]]
    C_num.append(c)
    
## Names
Names_num = list()

for num in Sample_nodes:
    name = num_author[num]
    Names_num.append(name)
    
    
## Links
L_num = list()
Source_num = list()
Target_num = list()

for l in Sample_edges:
    L_num.append((l[0],l[1]))
    Source_num.append(l[0])
    Target_num.append(l[1])

In [503]:
## Assemble the sampled node table (id, author name, citation total) and the
## sampled edge table (source id, target id) from the parallel lists.
nodes = pd.DataFrame(
    {'Id': Sample_nodes,
     'name': Names_num,
     'citations': C_num,
    })

edges = pd.DataFrame(
    {'Source': Source_num,
     'Target': Target_num,
    })

## Write both tables with a header row and without the row index.
## index=False is the documented boolean for omitting the index.
nodes.to_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/sample_nodes.csv", index=False, header=True)
edges.to_csv("/Users/niklasstoehr/Programming/thesis/4_real_attr/data/mag/sample_edges.csv", index=False, header=True)

### Analyze

In [345]:
#nx.degree_assortativity_coefficient(g_complete, x='out', y='in')

0.09273370760313358

In [410]:
## pearsonr is imported from the public scipy.stats namespace; the
## scipy.stats.stats module path is deprecated.
from scipy.stats import pearsonr

## (node id, degree) pairs of the full graph
degrees = list(g_complete.degree())

## Parallel lists: D holds each node's degree, C the citation total of the
## author that the node id maps to.
D = [degree for _, degree in degrees]
C = [A[num_author[num]] for num, _ in degrees]

1274360 1274360


In [414]:
## Pearson correlation between node degree (D) and citation total (C);
## the result is displayed as the cell output
pearsonr(D, C)

(0.4662064646170404, 0.0)