In [4]:
import networkx as nx
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import fileDefinitions as fd
import utils.helper_methods as helper_methods
from networkx.algorithms import bipartite
import time


In [5]:


try:
    gph = nx.read_gexf(fd.fullyConnGraphFile)
    helper_methods.logData("graph file found at " + str())
    print(gph)
except Exception as e:
    helper_methods.logData(e)
    gph = nx.Graph()
    helper_methods.logData("new graph file generated")


graph file found at 
Graph with 11406 nodes and 1391375 edges


In [6]:

B = gph
# users = []
# issues = []
# for n in B:
#     if(n == 0):
#         users.append(n)
#     else:
#         issues.append(n)
users = [n for n, d in B.nodes(data=True) if d["bipartite"] == 0]
issues = [n for n, d in B.nodes(data=True) if d["bipartite"] == 1]


In [7]:


# create adjacency matrices for the users and issues
users_matrix = nx.bipartite.biadjacency_matrix(B, row_order=users)
issues_matrix = nx.bipartite.biadjacency_matrix(B, row_order=issues)




In [8]:
print(issues_matrix[:,[1]])

  (4731, 0)	1


In [9]:


def jaccard_coefficient(graph, node1, node2):
    neighbors1 = set(graph.neighbors(node1))
    neighbors2 = set(graph.neighbors(node2))
    intersection = neighbors1.intersection(neighbors2)
    union = neighbors1.union(neighbors2)
    if len(union) == 0:
        return 0
    return len(intersection) / len(union)

In [10]:


def jaccard_link_prediction(graph):
    predicted_edges = []
    count = 10000
    userCount = 0
    for node1 in users:
        
        userCount += 1
        if(userCount%100 == 0):
            print(userCount)
        for node2 in issues:
            
            if node1 == node2 or graph.has_edge(node1, node2):
                continue
            
            jaccard = jaccard_coefficient(graph, node1, node2)
            if(jaccard != 0):
                predicted_edges.append((node1, node2, jaccard))
                count -= 1
                if(count == 0):
                    print(node1,end="-------")
                    print(node2, end="-------")
                    print(jaccard)
                    count = 10000
                    
    return sorted(predicted_edges, key=lambda x: x[2], reverse=True)


In [11]:
print(B)
print(len(users))
print(len(issues))


Graph with 11406 nodes and 1391375 edges
3126
8280


In [13]:
import random
def TestJaccordLinkPred(g):
    B = g
    users = [n for n, d in B.nodes(data=True) if d["bipartite"] == 0]
    issues = [n for n, d in B.nodes(data=True) if d["bipartite"] == 1]
    
    ogGph = nx.Graph()

    issues_subset = random.sample(issues, 1000)

    print(B.number_of_edges())
    for i in issues_subset:
        for j in list(B.neighbors(i)):
        
            if B.nodes[j]["bipartite"] == 0:
                ogGph.add_node(i, bipartite=1)
                ogGph.add_node(j, bipartite=0)
                ogGph.add_edge(i,j)
                B.remove_edge(i,j)
    print(B.number_of_edges())
    return ogGph,B


testGraph, trainGraph = TestJaccordLinkPred(gph)

predicted = jaccard_link_prediction(trainGraph)

predicted



1391375
1390375
stuartmorgan-------PR_kwDODIpBAs5J50MO-------0.0007017543859649122
cpswan-------MDExOlB1bGxSZXF1ZXN0MjQyNDcyNzc0-------0.0006711409395973154
pavanpodila-------PR_kwDODYvhJM4xLrO7-------0.001277139208173691
apomalyn-------PR_kwDODIpBAs5Ctfxe-------0.0006131207847946045
songyanghe666-------PR_kwDOB40ng85Jhx3r-------0.0007012622720897616
login-------PR_kwDOA7lyys5KR1R8-------0.0027397260273972603
Jemair-------MDExOlB1bGxSZXF1ZXN0Mzg3OTE0OTcy-------0.001989389920424403
OpenFlutter-------PR_kwDOB40ng85EM1B5-------0.005614035087719298
liyuqian-------PR_kwDOA7lyys5IlPPw-------0.0007002801120448179
michael120893-------PR_kwDOCkCuKM41vTLU-------0.0014035087719298245
jw-koo-------MDExOlB1bGxSZXF1ZXN0MjY0MDM4MDI5-------0.0006939625260235947
VasuGajjar-------MDExOlB1bGxSZXF1ZXN0MTk5NjAwNTA=-------0.0007017543859649122
100
Mohit-Joshi-dev-------PR_kwDOCamu-c5IsfYd-------0.000700770847932726
RohanSengupta326-------MDExOlB1bGxSZXF1ZXN0NTY0NDY4NDM1-------0.0006997900629811056
mohammadr

KeyboardInterrupt: 

In [None]:
# preferential attachment 

def prefAttachment(graph):
    predicted_edges = []
    count = 10000
    userCount = 0
    for node1 in users:
        userCount += 1
        if(userCount%100 == 0):
            print(userCount)
        for node2 in issues:
            if node1 == node2 or graph.has_edge(node1, node2):
                continue
            score = nx.preferential_attachment(graph, (node1,node2))
            for u,v, p in score:
                print(f"({u}, {v}) -> {p}")
            # if(score != 0):
            #     predicted_edges.append((node1, node2, score))
            #     count -= 1
            #     if(count == 0):
            #         print(node1,end="-------")
            #         print(node2, end="-------")
            #         print(score)
                    # count = 10000
                    
    return sorted(predicted_edges, key=lambda x: x[2], reverse=True)


In [36]:
prefAttachment(B)

ValueError: too many values to unpack (expected 2)

In [None]:

def adamic_adar_coefficient(graph, node1, node2):
    neighbors1 = set(graph.neighbors(node1))
    neighbors2 = set(graph.neighbors(node2))
    common_neighbors = neighbors1.intersection(neighbors2)
    aa = sum([1 / math.log(len(set(graph.neighbors(neighbor)))) for neighbor in common_neighbors])
    return aa

def adamic_adar_link_prediction(graph):
    predicted_edges = []
    for node1 in graph.nodes():
        for node2 in graph.nodes():
            if node1 == node2 or graph.has_edge(node1, node2):
                continue
            aa = adamic_adar_coefficient(graph, node1, node2)
            predicted_edges.append((node1, node2, aa))
    return sorted(predicted_edges, key=lambda x: x[2], reverse=True)

In [None]:

# compute the Adamic-Adar similarity between all pairs of users and issues
aa_matrix = np.zeros((len(users), len(issues)))
for u, i in combinations(range(len(users)), 2):
    u_vec = users_matrix[u].toarray()[0]
    i_vec = issues_matrix[i].toarray()[0]
    common = np.where(np.logical_and(u_vec, i_vec))[0]
    aa = 0
    for c in common:
        aa += 1 / np.log(len(np.where(users_matrix[:, c].toarray())[0]))
    aa_matrix[u][i] = aa
    aa_matrix[i][u] = aa
