In [1]:
from collections import Counter
from pathlib import Path

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
#from pprint import pprint
import seaborn as sns

#import graph_tool
#from graph_tool import draw
#from graph_tool import inference

In [2]:
#!pip install --user networkx
#!pip install graph_tool

In [3]:
# Read the edge list and the per-account metadata from ../data.
edges_csv = "../data/musae_git_edges.csv"
metadata_csv = "../data/musae_git_target.csv"
edges = pd.read_csv(edges_csv)
metadata = pd.read_csv(metadata_csv)

In [4]:
# Preview the first five rows of the account metadata.
metadata.head(n=5)

Unnamed: 0,id,name,ml_target
0,0,Eiryyy,0
1,1,shawflying,0
2,2,JpMCarrilho,1
3,3,SuhwanCha,0
4,4,sunilangadi2,1


In [5]:
# Preview the first five rows of the edge list.
edges.head(n=5)

Unnamed: 0,id_1,id_2
0,0,23977
1,1,34526
2,1,2370
3,1,14683
4,1,29982


Number of ML developers vs. web developers in the dataset

In [6]:
# Number of accounts for each ml_target value.
metadata.groupby(by="ml_target").size()

ml_target
0    27961
1     9739
dtype: int64

In [7]:
# Each (id_1, id_2) pair appears only once: the largest pair count is 1.
# NOTE(review): a pair stored in reversed order, (b, a) next to (a, b),
# would not be detected by this check.
pair_counts = edges.groupby(['id_1', 'id_2']).size().reset_index(name='count')
pair_counts.sort_values(by='count', ascending=False).head()

Unnamed: 0,id_1,id_2,count
0,0,23977,1
192665,19222,21321,1
192671,19222,21399,1
192670,19222,21381,1
192669,19222,21375,1


In [None]:
# Connections per account, counting the rows in which the account is id_1.
# NOTE(review): accounts that only ever appear in the id_2 column are not
# counted here - confirm whether the edge list should be read as undirected.
connections = edges.groupby('id_1')['id_2'].count()
sorted_connections = connections.sort_values(ascending=False)

median_connections = connections.median()
print('Median GitHub Friend Count:', median_connections)

# savefig does not create missing directories, so make sure the output
# folder exists before the first figure is written.
Path("plots").mkdir(parents=True, exist_ok=True)

# The histogram is restricted to 0-100 connections.
plt.hist(connections, bins=30, range=(0, 100))
plt.xlabel('Number of Followings')
plt.ylabel('Frequency')
plt.title('Histogram of Number of Connections')
plt.savefig("plots/hist_nr_connections")
plt.show()


Median GitHub Friend Count: 3.0


In [None]:
#%matplotlib notebook

In [None]:
# Boxplot with fliers shown to explore the outliers: some accounts have
# 1000 followings or more.
fig, ax = plt.subplots()
ax.boxplot(connections, showfliers=True)
ax.set_ylabel('Number of Connections')
ax.set_title('Boxplot of Number of Connections with Outliers')
fig.savefig("plots/box_nr_connections")
plt.show()

In [None]:


# Merge the datasets based on the common identifier (one row per edge)
merged_df = pd.merge(edges, metadata, left_on='id_1', right_on='id')

# Number of connections for each user account (rows in which it is id_1)
connections = merged_df.groupby('id')['id_2'].count()

# ml_target label of each account, exactly one entry per account
target_by_account = merged_df.drop_duplicates('id').set_index('id')['ml_target']

# Median number of connections for ml_target=0 and ml_target=1, using one
# observation per account. Looking the counts up once per edge row would
# repeat every account's count as many times as it has edges and pull the
# median towards the highly connected accounts.
med_connections = connections.groupby(target_by_account).median()

print(med_connections)

## Build the Network

In [None]:
G = nx.Graph()

# Add one node per account with its name and ml_target label as attributes.
# Nodes are inserted in bulk: DataFrame.iterrows() builds a Series for every
# row, which is needlessly slow for tables of this size.
G.add_nodes_from(
    (node_id, {"name": node_name, "target": ml_target})
    for node_id, node_name, ml_target in zip(
        metadata['id'].tolist(),
        metadata['name'].tolist(),
        metadata['ml_target'].tolist(),
    )
)

# Add one undirected edge per (id_1, id_2) row of the edge list.
G.add_edges_from(zip(edges['id_1'].tolist(), edges['id_2'].tolist()))

In [None]:
#nx.draw_networkx(G)

In [None]:
# Basic size statistics of the graph.
N = G.number_of_nodes()
E = G.number_of_edges()
print("Number of nodes:", N)
print("Number of edges:", E)
# Every edge contributes to the degree of two nodes, hence 2*E/N.
print ("Average degree=", 2*E/N)
print("Density",nx.density(G))

In [None]:
# Node degrees, stored as a node attribute on the graph and as a column
# of the metadata table.
degree_dct = G.degree()
degree_by_node = dict(degree_dct)
nx.set_node_attributes(G, degree_by_node, name="degree")
metadata['degree'] = metadata["id"].map(degree_by_node)
metadata.sort_values(by="degree", ascending=False).head(10)

In [None]:
# Draw the first account with degree 5 together with all direct neighbours.
node_id = metadata[metadata['degree'] == 5]['id'].tolist()[0]
neighbors = list(G.neighbors(node_id))
# Subgraph containing the node and its neighbours
subgraph = G.subgraph([node_id] + neighbors)

# Node colours from the 'target' attribute: orange for 1, light blue otherwise
node_colors = ['orange' if G.nodes[node]['target'] == 1 else 'lightblue' for node in subgraph.nodes]

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run.
pos = nx.spring_layout(subgraph, seed=42)
nx.draw_networkx(subgraph, pos, with_labels=True, node_color=node_colors, node_size=500)
# Show the plot
plt.axis('off')
plt.show()




In [None]:
# Normalised degree histogram, restricted to degrees between 0 and 100.
ax = metadata["degree"].hist(bins=20, density=True, range=(0, 100))
ax.set_xlabel("Degree")
ax.set_ylabel("Probability")
ax.grid(False)
ax.figure.savefig("plots/hist_degree")
plt.show()

In [None]:
# Degree curve of the 15 highest-degree accounts, in descending order.
sorted_m = metadata.sort_values(by="degree", ascending=False).head(15).reset_index()

fig, ax = plt.subplots()
ax.plot(sorted_m['degree'])
ax.set_xlabel('Index (sorted)')
ax.set_ylabel('Degree')
ax.set_title('Degree Line Chart')
plt.show()

In [None]:
# Clustering coefficient of every node
clust_dct = nx.clustering(G)
# Average clustering is the mean of the per-node values. Taking it from
# clust_dct gives the same number as nx.average_clustering(G) with default
# arguments, without computing the clustering of every node a second time.
clustering_coefficient = sum(clust_dct.values()) / len(clust_dct)
print("Clustering coefficient:", clustering_coefficient)
nx.set_node_attributes(G,clust_dct,name="clustering")
metadata['clustering'] = metadata.id.map(clust_dct)
metadata.sort_values("clustering",ascending=False).head(10)

In [None]:
# Eigenvector centrality, allowing up to 1000 iterations.
eigen_dct = nx.eigenvector_centrality(G, max_iter=1000)
metadata['eigenvector'] = metadata['id'].map(eigen_dct)
nx.set_node_attributes(G, eigen_dct, name="eigenvector")
metadata.sort_values(by="eigenvector", ascending=False).head(10)

In [None]:
# Draw account 31890 together with all of its direct neighbours.
node_id = 31890
neighbors = list(G.neighbors(node_id))
# Subgraph containing the node and its neighbours
subgraph = G.subgraph([node_id] + neighbors)

# Node colours from the 'target' attribute: orange for 1, light blue otherwise
node_colors = ['orange' if G.nodes[node]['target'] == 1 else 'lightblue' for node in subgraph.nodes]

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run.
pos = nx.spring_layout(subgraph, seed=42)
nx.draw_networkx(subgraph, pos, with_labels=False, node_color=node_colors, node_size=10)
# Show the plot
plt.axis('off')
plt.show()

In [None]:
# PageRank centrality, allowing up to 1000 iterations.
pr_dct = nx.pagerank(G, max_iter=1000)
metadata['pagerank'] = metadata['id'].map(pr_dct)
nx.set_node_attributes(G, pr_dct, name="pagerank")
metadata.sort_values(by="pagerank", ascending=False).head(10)

In [None]:
# Degree centrality, stored on the graph and in the metadata table.
dc_dct = nx.degree_centrality(G)
metadata['degree_centrality'] = metadata['id'].map(dc_dct)
nx.set_node_attributes(G, dc_dct, name="degree_centrality")
metadata.sort_values(by="degree_centrality", ascending=False).head(10)

In [None]:
# Betweenness centrality, stored on the graph and in the metadata table.
# NOTE(review): called with default arguments (no node sampling), so this
# cell is presumably one of the slowest in the notebook - confirm runtime.
betwn_dct = nx.betweenness_centrality(G)
metadata['betweenness'] = metadata['id'].map(betwn_dct)
nx.set_node_attributes(G, betwn_dct, name="betweenness")
metadata.sort_values(by="betweenness", ascending=False).head(10)

In [None]:
# Closeness centrality, stored on the graph and in the metadata table.
close_dct = nx.closeness_centrality(G)
metadata['closeness'] = metadata['id'].map(close_dct)
nx.set_node_attributes(G, close_dct, name="closeness")
metadata.sort_values(by="closeness", ascending=False).head(10)

In [None]:
# Pairwise relationships between the numeric features.
# 'id' is an arbitrary account identifier rather than a feature, so it is
# left out of the grid instead of being plotted against every metric.
sns.pairplot(metadata.drop(columns=["id"]));

#### Community Analysis

In [None]:
#run brew install graph-tool

#import graph_tool
#from graph_tool import draw
#from graph_tool import inference

In [None]:
# k-core decomposition
#
# nx.core_number returns, for every node, the largest k for which the node
# belongs to the k-core. A single call replaces building one k-core subgraph
# per k and removes the hard-coded upper bound of k=39, which would have
# capped the reported value for nodes in deeper cores.
# Nodes outside the 1-core now get the value 0 instead of being absent from
# the dictionary.
cores_dct = nx.core_number(G)

In [None]:
# Store the k-core value on the graph and in the metadata table.
metadata['k-core'] = metadata['id'].map(cores_dct)
nx.set_node_attributes(G, cores_dct, name="k-core")
metadata.sort_values(by="k-core", ascending=False).head(10)

In [None]:
# Export the annotated graph to a GEXF file in the working directory.
gexf_path = "result_graph.gexf"
nx.write_gexf(G, gexf_path)

In [None]:
# Distribution of the per-node clustering coefficient.
fig, ax = plt.subplots()
ax.hist(metadata["clustering"], bins=20)
ax.set_xlabel('clustering coefficient')
ax.set_title('Histogram of Clustering')
fig.savefig("plots/hist_clustering")

In [None]:
# Distribution of the eigenvector centrality.
fig, ax = plt.subplots()
ax.hist(metadata["eigenvector"], bins=100)
ax.set_xlabel('eigenvector')
ax.set_title('Histogram of Eigenvector')
fig.savefig("plots/hist_eigenvector")

In [None]:
# Distribution of the k-core value (this figure is displayed but not saved).
fig, ax = plt.subplots()
ax.hist(metadata["k-core"], bins=100)
ax.set_xlabel('k-core')
ax.set_title('Histogram of K-Core')

In [None]:
# Persist the metadata table with all computed columns, without the index.
# NOTE(review): the inputs are read from "../data/" while this writes to
# "data/" relative to the notebook - confirm the target folder is intended.
output_csv = "data/metadata_p.csv"
metadata.to_csv(output_csv, index=False)