In [1]:
import gc
import pickle
import time

##--get graph map
def get_graph_info(gf_file):
    '''
    get graph information as a map
    key: user id, value: list of friend id
    '''
    gf_map = {}
    with open(gf_file, 'r') as cf:
        for line in cf:
            if line[0] != '#':
                node_list = line.split('\t')
                if len(node_list) == 2:
                    node_list = [int(id) for id in node_list]
                    if node_list[0] in gf_map:
                        gf_map[node_list[0]].add(node_list[1])
                    else:
                        gf_map[node_list[0]] = set([node_list[1]])
                    if node_list[1] in gf_map:
                        gf_map[node_list[1]].add(node_list[0])
                    else:
                        gf_map[node_list[1]] = set([node_list[0]])                    
                else:
                    print "wrong format line: ", line
    return gf_map

def get_comm_info(comm_file):
    '''
    get community information, two maps
    map1: key: user id, value: community id array
    map2: key: community id, value: user id array
    '''
    comm_map_usr = {}
    comm_map_comm = {}
    comm_id = 0
    with open(comm_file, 'r') as cf:
        for line in cf:
            node_list = line.split('\t')
            node_list = [int(id) for id in node_list]
            for id in node_list:
                if id in comm_map_usr:
                    comm_map_usr[id].append(comm_id)
                else:
                    comm_map_usr[id] = [comm_id]
            comm_map_comm[comm_id] = node_list[:]
            comm_id += 1
    return comm_map_usr, comm_map_comm


In [2]:
#t0=time.time()
#gf_file = "../data/com-lj.ungraph.txt"
#gf_map = get_graph_info(gf_file)
#gc.collect()
#print "Get graph map"
#gf_file = "whole_graph_map.pkl"
#with open(gf_file, 'wb') as fl:
#    pickle.dump(gf_map, fl)
#print "Dump the whole graph map to pickle"
#gc.collect()


In [3]:
import time
t0=time.time()
gf_file = "whole_graph_map.pkl"
file = open(gf_file,'rb')
gf_map = pickle.load(file)
file.close()

print "Time Elapsed:", "{0:02f}".format(time.time()-t0)

Time Elapsed: 178.807495


In [4]:
##--get community information
comm_file = '../data/com-lj.all.cmty.txt'
comm_map_usr, comm_map_comm = get_comm_info(comm_file)

usr_to_com, com_to_usr = comm_map_usr, comm_map_comm 
del gf_map

In [5]:
import snap
import numpy as np
import pandas as pd
import random
import copy
import time
random.seed(2016)
import scipy.stats
t0=time.time()

G= snap.LoadEdgeList(snap.PUNGraph, "../data/com-lj.ungraph.txt", 0, 1)
print "Time Elapsed: ", time.time()-t0, "second"
print "Number of Nodes: ", G.GetNodes()
print "Number of Edges: ", G.GetEdges()



Time Elapsed:  21.0892910957 second
Number of Nodes:  3997962
Number of Edges:  34681189


In [6]:
nodes = [node.GetId() for node in G.Nodes()]
print len(nodes)

3997962


In [7]:
edges = [edge.GetId() for edge in G.Edges()]
print len(edges)

34681189


In [8]:
usr_in_some_comms = set(usr_to_com.keys())
print "Number of users that are at least in one community: ", len(usr_in_some_comms)

Number of users that are at least in one community:  1147948


# Use sparse matrix to store the number of edges between two communities

## Results: 10^7 edges done after 13 hours, memory used reach 20GB. 
## All of the 3*10^7 edges done after 21 hours, memory used reach 25GB.

In [9]:
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix

In [10]:
ncomms = len(com_to_usr)

A = lil_matrix((ncomms, ncomms), dtype=np.int32)
t0 = time.time()

count = 0 # count the number of edges done
k=1 # print when 10,10^2, 10^3, 10^4, 10^5, 10^6, 10^7... edges done

for edge in edges:
    
    if count==10**k:
        print count, "Edges done, Time Elapsed:", "{0:.2f}".format(time.time()-t0), "secs"
        k+=1

    #if count >10**3+1:
    #    break
        
    (usr1,usr2) = edge
    
    if not usr1 in usr_in_some_comms:  # skip if usr1 or usr2 does not belong to any communities
        count+=1
        continue
    if not usr2 in usr_in_some_comms:
        count+=1
        continue
    
    for comm1 in usr_to_com[usr1]:
        for comm2 in usr_to_com[usr2]:
            comm1, comm2 = min(comm1,comm2), max(comm1,comm2)
            A[comm1,comm2]+=1
    
    count+=1


print "{0:.2f}".format(time.time()-t0), "secs"

10 Edges done, Time Elapsed: 0.00 secs
100 Edges done, Time Elapsed: 0.18 secs
1000 Edges done, Time Elapsed: 0.46 secs
10000 Edges done, Time Elapsed: 147.73 secs
100000 Edges done, Time Elapsed: 1052.12 secs
1000000 Edges done, Time Elapsed: 5160.19 secs
10000000 Edges done, Time Elapsed: 47620.48 secs
75082.22 secs


In [13]:
A

<664414x664414 sparse matrix of type '<type 'numpy.int32'>'
	with 270236207 stored elements in LInked List format>

In [14]:
# Update A to get a symmetric matrix.   (skip, will take more than 2 hrs and consume memory)

#for row in range(A.shape[0]):
#    A[row:,row] = A[row,row:].transpose()

In [15]:
t0 = time.time()
A = A.tocsr()
#convert to CSR or CSC format for fast arithmetic and matrix vector operations
A
print "{0:.2f}".format(time.time()-t0), "secs"

58.62 secs


In [16]:
def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

In [17]:
save_sparse_csr("sparse_matrix",A)

In [None]:
B=load_sparse_csr("sparse_matrix.npz")