# Constructed Graph

In [1]:
from tqdm import tqdm
import networkx as nx
# from networkx.algorithms.community.centrality import girvan_newman
import pandas as pd 
from itertools import islice
# Create a graph from text file 
# user_id, user_id


In [2]:
df = pd.read_csv('../data/ub_sample_data.csv')
# random_ids = df.user_id.sample(100).unique()
# df = df[df.user_id.isin(random_ids)]

In [3]:
from graph import UndirectedGraph
G = UndirectedGraph()

In [4]:
usr_ids = df.user_id.unique()
business_dict = {}
for u in tqdm(usr_ids):
    business_u = set(df[df.user_id == u].business_id.unique())
    business_dict[u] = business_u

100%|██████████| 3374/3374 [00:17<00:00, 189.05it/s]


In [5]:
usr_ids = df.user_id.unique()
edge_threshold = 7
for u in tqdm(usr_ids):
    for v in tqdm(usr_ids, leave=False):
        if u != v:
            business_u = business_dict[u]
            business_v = business_dict[v]
            if len(business_u.intersection(business_v)) >= edge_threshold:
                G.add_edge(u, v)
        else:
            # if u == v, business cannot be reviewed by the same user
            # G.add_node(u)
            pass

100%|██████████| 3374/3374 [00:49<00:00, 68.28it/s]


In [6]:
len(G.nodes), len(G.edges)

(222, 498)

# Caculate betweeness centrality of original graph

In [7]:
from measure import edge_betweenness_centrality 
result = edge_betweenness_centrality(G)
result = sorted(result.items(), key=lambda x: x[1], reverse=True)
def sort_by_lexicographical_order(edge):
    return tuple(sorted(edge))
result = [(sort_by_lexicographical_order(edge[0]), edge[1]) for edge in result]
result[:5]

[(('cyuDrrG5eEK-TZI867MUPA', 'l-1cva9rA8_ugLrtSdKAqA'), 0.17259793730381964),
 (('1st2ltGKJ00ZcRsev-Ieew', 'DKolrsBSwMTpTJL22dqJRQ'), 0.1625081448610863),
 (('1st2ltGKJ00ZcRsev-Ieew', 'HLY9oDcVBH9D25lU4X_V5Q'), 0.15722962781786312),
 (('1st2ltGKJ00ZcRsev-Ieew', 'Hv_q_ZnSIoZwdcoH0CyV2Q'), 0.15180791651379885),
 (('1st2ltGKJ00ZcRsev-Ieew', 'JM0GL6Dx4EuZ1mprLk5Gyg'), 0.14958675546910816)]

In [8]:
with open('edge_betweenness_centrality.txt', 'w') as f:
    for item in result:
        f.write(f'({item[0][0]},{item[0][1]}),{item[1]}')
        f.write('\n')
    f.close()
!head edge_betweenness_centrality.txt

(cyuDrrG5eEK-TZI867MUPA,l-1cva9rA8_ugLrtSdKAqA),0.17259793730381964
(1st2ltGKJ00ZcRsev-Ieew,DKolrsBSwMTpTJL22dqJRQ),0.1625081448610863
(1st2ltGKJ00ZcRsev-Ieew,HLY9oDcVBH9D25lU4X_V5Q),0.15722962781786312
(1st2ltGKJ00ZcRsev-Ieew,Hv_q_ZnSIoZwdcoH0CyV2Q),0.15180791651379885
(1st2ltGKJ00ZcRsev-Ieew,JM0GL6Dx4EuZ1mprLk5Gyg),0.14958675546910816
(HLY9oDcVBH9D25lU4X_V5Q,l-1cva9rA8_ugLrtSdKAqA),0.1359504300680771
(Hv_q_ZnSIoZwdcoH0CyV2Q,l-1cva9rA8_ugLrtSdKAqA),0.13407525172231055
(0FVcoJko1kfZCrJRfssfIA,DKolrsBSwMTpTJL22dqJRQ),0.09035984963113673
(a48HhwcmjFLApZhiax41IA,o-t-i7nbT5N_cmkCXs5oDQ),0.08026578614813909
(A-U-K9z9oraMH7eBZW1dOA,l-1cva9rA8_ugLrtSdKAqA),0.07737148913619502


# Community Detection

In [9]:
from modularity import modularity
from girvan_newman import girvan_newman

In [10]:
import time 

In [11]:
import signal 
import multiprocessing


def handler(signum, frame): 
    raise IOError("Timeout") 

G_clone = G.copy()
comp = girvan_newman(G_clone)
iter = 0
max_modularity = -1 
max_modularity_communities = None

signal.signal(signal.SIGALRM, handler) 
signal.alarm(30) 

while True:
    timeout = 60
    iter += 1
    start = time.time()
    try:
        communities = next(comp)
    except:
        print('no more communities')
        break

    modularityScore = modularity(G_clone, communities)
    if modularityScore > max_modularity:
        max_modularity = modularityScore
        max_modularity_communities = communities
    print(f'iter: {iter}, modularity: {modularityScore}, time: {time.time() - start}')

iter: 1, modularity: 0.5287172787535688, time: 0.7249999046325684
iter: 2, modularity: 0.6449835486524415, time: 0.8725414276123047
iter: 3, modularity: 0.6711625618941633, time: 0.09029436111450195
iter: 4, modularity: 0.6768358574861699, time: 0.48787713050842285
iter: 5, modularity: 0.7144562184480896, time: 0.08021020889282227
iter: 6, modularity: 0.7254076547152466, time: 0.17882871627807617
iter: 7, modularity: 0.7344962178029387, time: 0.7171242237091064
iter: 8, modularity: 0.7428569700488702, time: 0.16547012329101562
iter: 9, modularity: 0.751687472782697, time: 0.2574169635772705
iter: 10, modularity: 0.7530422896404896, time: 0.10512614250183105
iter: 11, modularity: 0.8010253866873118, time: 1.1608047485351562
iter: 12, modularity: 0.8042410606280548, time: 0.1859278678894043
iter: 13, modularity: 0.8048035515556204, time: 0.18442344665527344
iter: 14, modularity: 0.8054648312124002, time: 0.046543121337890625
iter: 15, modularity: 0.8060253060434512, time: 0.0459284782409

In [12]:
max_modularity

0.9865445718617429

In [13]:
result = sorted(max_modularity_communities, key=lambda x: len(x), reverse=False)
result[0]

{'39FT2Ui8KUXwmUt6hnwy-g'}

: 

In [1]:
len(result)

NameError: name 'result' is not defined

In [16]:
with open('girvan_newman.txt', 'w') as f:
    for item in result:
        f.write(', '.join(sorted(list(item))))
        f.write('\n')
    f.close()
!head girvan_newman.txt

39FT2Ui8KUXwmUt6hnwy-g
0FVcoJko1kfZCrJRfssfIA
JM0GL6Dx4EuZ1mprLk5Gyg
bSUS0YcvS7UelmHvCzNWBA
DKolrsBSwMTpTJL22dqJRQ
sdLns7062kz3Ur_b8wgeYw
_VTEyUzzH92X3w-IpGaXVA
qtOCfMTrozmUSHWIcohc6Q
zBi_JWB5uUdVuz3JLoAxGQ
KLB3wIYUwKDPMbijIE92vg


In [17]:
!tail girvan_newman.txt

ZZvfGGLnAkSBSUduV7KN-w
ORJnGXXkS9tQBTNyPQJF9A
SVC0CajvmYfH5uAq4JnGvg
LgFDWZTLi1w9OGi5BtKORg
4ONcRRisDZkbV1cviA7nFw
KBoIRjxSW7OWczv8OS9Bew
gH0dJQhyKUOVCKQA6sqAnw
453V8MlGr8y61PpsDAFjKQ
YVQFzWm0H72mLUh-8gzd5w
46HhzhpBfTdTSB5ceTx_Og
