In [1]:
import os
from github import Github
from dotenv import load_dotenv
import networkx as nx
import csv

# Create an empty graph
graph = nx.Graph()


In [2]:
# Open the CSV file and read the data for users
with open('dataset/users.csv', 'r') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        if i == 0: # Skip the header columns
            continue

        # transform the string of repositories into an array of strings
        row[2] = [x for x in row[2].replace(' ', '').replace('[', '').replace(']', '').replace('\'', '').split(',')]

        # Add the node with data from the first column
        graph.add_node(row[0], name=row[1], repos=row[2], color="#7FB2FD")

In [3]:
# Open the CSV file and read the data for repositories
with open('dataset/repos.csv', 'r') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        if i == 0: # Skip the header columns
            continue

        # transform the string of languages into an array of strings
        row[2] = [x for x in row[2].replace(' ', '').replace('[', '').replace(']', '').replace('\'', '').split(',')]

        """ print("row", i, ": ", row)
        print("row[0] (repository ID): ", row[0])
        print("row[1] (repository name): ", row[1])
        print("row[2] (languages): ", row[2]) """

        # Add the node with data from the first column
        graph.add_node(row[0], name=row[1], languages=row[2], color="#FF8CCD")


In [4]:
# Add an edge between users and repositories
for node in graph.nodes:
    # if node starts with "U"
    if node[0] == "u":
        # for each repository in the user's list of repositories
        for repo in graph.nodes[node]['repos']:
            # add an edge between the user and the repository
            graph.add_edge(node, repo)

In [5]:
# Print the number of nodes and edges
print("Number of nodes:", graph.number_of_nodes())
print("Number of edges:", graph.number_of_edges())

# Print the neighbors of a node
# print("Neighbors of node 1:", list(G.neighbors(1)))

Number of nodes: 4432
Number of edges: 6375


In [6]:
# Draw the graph using color property to distinguish users and repositories
#nx.draw(graph, with_labels=True, node_color=[graph.nodes[node]['color'] for node in graph.nodes], node_size=1000)

In [7]:
# Calculate the degree centrality of all nodes
degree_centrality = nx.degree_centrality(graph)

# divide the degree centrality 
users_degree_centrality = { k: v for k, v in degree_centrality.items() if k.startswith('u_')}
repos_degree_centrality = { k: v for k, v in degree_centrality.items() if k.startswith('r_')}

# Print the degree centrality of first 10 users in descending order
print("\nDegree centrality of first 10 nodes in descending order:")
for node in sorted(users_degree_centrality, key=users_degree_centrality.get, reverse=True)[:10]:
    print(node, users_degree_centrality[node])

# show mean degree centrality for users
print("\nMean degree centrality for users:", sum(users_degree_centrality.values())/len(users_degree_centrality))

# Print the degree centrality of first 10 repositories in descending order
print("\nDegree centrality of first 10 nodes in descending order:")
for node in sorted(repos_degree_centrality, key=repos_degree_centrality.get, reverse=True)[:10]:
    print(node, repos_degree_centrality[node])

# show mean degree centrality for repositories
print("\nMean degree centrality for repositories:", sum(repos_degree_centrality.values())/len(repos_degree_centrality))





Degree centrality of first 10 nodes in descending order:
u_1710 0.22432859399684044
u_1387 0.22297449785601445
u_2712 0.009027307605506657
u_776 0.006093432633716994
u_1205 0.004062288422477996
u_1461 0.0038366057323403293
u_2125 0.003610923042202663
u_2180 0.003610923042202663
u_3060 0.002933874971789664
u_3222 0.002933874971789664

Mean degree centrality for users: 0.0004184779376462035

Degree centrality of first 10 nodes in descending order:
r_667 0.07086436470322727
r_963 0.05055292259083728
r_126 0.028887384337621305
r_216 0.028887384337621305
r_76 0.023696682464454975
r_2 0.02279395170390431
r_3 0.020537124802527645
r_225 0.019183028661701646
r_969 0.018731663281426315
r_115 0.01828029790115098

Mean degree centrality for repositories: 0.001447411619343704


In [8]:
# Calculate the betweenness centrality of all nodes
betweenness_centrality = nx.betweenness_centrality(graph)

# Calculate the betweenness centrality of users nodes
users_betweenness_centrality = { k: v for k, v in betweenness_centrality.items() if k.startswith('u_')}


In [9]:

# Print the betweenness centrality of each node
for node, centrality in sorted(users_betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:10]:
    print("Node", node, "has betweenness centrality", centrality)

# show mean betweenness centrality for users
print("\nMean betweenness centrality for users:", sum(users_betweenness_centrality.values())/len(users_betweenness_centrality))


Node u_1710 has betweenness centrality 0.4703289313283779
Node u_1387 has betweenness centrality 0.4398901534290559
Node u_2712 has betweenness centrality 0.009618500765823849
Node u_2125 has betweenness centrality 0.004504485559402676
Node u_2180 has betweenness centrality 0.003881150205018476
Node u_3174 has betweenness centrality 0.0030214588822351637
Node u_957 has betweenness centrality 0.0025193007937907705
Node u_1207 has betweenness centrality 0.0024694943601352936
Node u_776 has betweenness centrality 0.002030059550218193
Node u_28 has betweenness centrality 0.001999912391358061

Mean betweenness centrality for users: 0.00028614468189174115


In [10]:
# Calculate the eigenvector centrality of all nodes
eigenvector_centrality = nx.eigenvector_centrality(graph, max_iter=1000)

# Calculate the eigenvector centrality of repos nodes
repos_eigenvector_centrality = { k: v for k, v in eigenvector_centrality.items() if k.startswith('r_')}

# Calculate the eigenvector centrality of users nodes
users_eigenvector_centrality = { k: v for k, v in eigenvector_centrality.items() if k.startswith('u_')}

# Print the eigenvector centrality of repos nodes
print("\nEigenvector centrality of first 10 nodes in descending order:")
for node, centrality in sorted(repos_eigenvector_centrality.items(), key=lambda x: x[1], reverse=True)[:10]:
    print("Node", node, "has eigenvector centrality", centrality)

# Print the eigenvector centrality of users nodes
print("\nEigenvector centrality of first 10 nodes in descending order:")
for node, centrality in sorted(users_eigenvector_centrality.items(), key=lambda x: x[1], reverse=True)[:10]:
    print("Node", node, "has eigenvector centrality", centrality)


Eigenvector centrality of first 10 nodes in descending order:
Node r_667 has eigenvector centrality 0.02807241600112121
Node r_963 has eigenvector centrality 0.0252872002485827
Node r_126 has eigenvector centrality 0.02508444065051212
Node r_204 has eigenvector centrality 0.024918586221542628
Node r_969 has eigenvector centrality 0.02484594166364357
Node r_3 has eigenvector centrality 0.024744005513770605
Node r_41 has eigenvector centrality 0.024621133427994935
Node r_252 has eigenvector centrality 0.02459630822116519
Node r_0 has eigenvector centrality 0.02452737750133048
Node r_115 has eigenvector centrality 0.024420314891656622

Eigenvector centrality of first 10 nodes in descending order:
Node u_1710 has eigenvector centrality 0.49917648037912005
Node u_1387 has eigenvector centrality 0.4976560737491123
Node u_2712 has eigenvector centrality 0.021178422969835194
Node u_776 has eigenvector centrality 0.01404028576450716
Node u_1205 has eigenvector centrality 0.009409539530923217
N

In [11]:
# Calculate the clustering coefficient of repos nodes by languages of repositories
repos_clustering_coefficient = {}
for node in repos_eigenvector_centrality:
    # get the languages of the repository
    languages = graph.nodes[node]['languages']
    # for each language
    for language in languages:
        # if the language is not in the dictionary
        if language not in repos_clustering_coefficient:
            # add the language as a key and the eigenvector centrality as a value
            repos_clustering_coefficient[language] = [repos_eigenvector_centrality[node]]
        # if the language is already in the dictionary
        else:
            # append the eigenvector centrality to the list of values
            repos_clustering_coefficient[language].append(repos_eigenvector_centrality[node])

# Print the clustering coefficient of repos nodes by languages of repositories
print("\nClustering coefficient of repos nodes by languages of repositories:")
for language, centrality in sorted(repos_clustering_coefficient.items(), key=lambda x: sum(x[1])/len(x[1]), reverse=True)[:10]:
    print("Language", language, "has clustering coefficient", sum(centrality)/len(centrality))




Clustering coefficient of repos nodes by languages of repositories:
Language Logos has clustering coefficient 0.024420314891656622
Language Slim has clustering coefficient 0.024420314891656622
Language RichTextFormat has clustering coefficient 0.02439207166182593
Language CMake has clustering coefficient 0.02430512933764281
Language M4 has clustering coefficient 0.024218187013459694
Language Ada has clustering coefficient 0.024218187013459694
Language Pascal has clustering coefficient 0.024218187013459694
Language DIGITALCommandLanguage has clustering coefficient 0.024218187013459694
Language DTrace has clustering coefficient 0.024218187013459694
Language CLIPS has clustering coefficient 0.024218187013459694


In [12]:
# Calculate homophily of nodes
# Create an example graph
G = nx.Graph()
G.add_edges_from([(1, 2), (1, 3), (2, 3), (3, 4)])

# Create sets for users and repos
users = set([1, 2, 3])
repos = set([4])

# Add attributes to the nodes
for node in G.nodes():
    if node in users:
        G.nodes[node]['user'] = 'user' + str(node)
        G.nodes[node]['repos'] = ['repo' + str(node) + '_1', 'repo' + str(node) + '_2']
    if node in repos:
        G.nodes[node]['name'] = 'repo' + str(node)
        G.nodes[node]['languages'] = ['python', 'javascript']

# Compute Jaccard similarity for each edge between users
for u, v, d in G.edges(data=True):
    if G.nodes[u]['user'] is not None and G.nodes[v]['user'] is not None:
        languages_u = set(G.nodes[u]['languages'])
        languages_v = set(G.nodes[v]['languages'])
        d['similarity'] = len(languages_u.intersection(languages_v)) / len(languages_u.union(languages_v))

# Print the similarity values for each edge
print(nx.get_edge_attributes(G, 'similarity'))


KeyError: 'languages'

In [None]:
# Calculate structural equivalence of nodes
# Create an example graph
G = nx.Graph()
G.add_edges_from([(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)])

# Create sets for users and repos
users = set([1, 2, 3])
repos = set([4,5,6])

# Add edges between users and repos
G.add_edges_from([(1, 4), (1, 5), (2, 5), (2, 6), (3, 6)])

# Compute structural equivalence for each pair of users
for u in users:
    for v in users:
        if u < v:
            repos_u = set(G.neighbors(u)) & repos
            repos_v = set(G.neighbors(v)) & repos
            sim = len(repos_u.intersection(repos_v)) / len(repos_u.union(repos_v))
            print("Structural equivalence between user {} and user {}: {}".format(u, v, sim))


AttributeError: module 'networkx' has no attribute 'info'