In [162]:
import copy
import random
from itertools import chain, combinations

import altair as alt
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import nx_altair as nxa
import pandas as pd
from pyvis.network import Network
from vega_datasets import data

In [None]:
#!pip install nx_altair

#### First visualization: Topics as nodes and connections by shared repositories
Based on: https://altair-viz.github.io/gallery/airport_connections.html

In [124]:
# Load the scraped CSV exports; the first CSV column becomes the index.
# Repositories are de-duplicated by name and contributors by login.
repos = (
    pd.read_csv('../cleaned_data/repo_info_stop75.csv', index_col=0)
    .drop_duplicates(subset=['name'])
)
contributors = (
    pd.read_csv('../raw_data/10_contributor_info_all.csv', index_col=0)
    .drop_duplicates(subset=['login'])
)
repo_contrib_relation = pd.read_csv(
    '../raw_data/repo_contributor_relationship_table_all.csv',
    index_col=0,
)
repo_topic_relation = (
    pd.read_csv('../cleaned_data/topic_relationship_table_stop75.csv', index_col=0)
    .reset_index(drop=True)
)

In [125]:
# Spot-check a single repository row, looked up by its index label.
repos.loc[248775516]

name                                                       WebCompiler
stargazers_count                                                   100
forks_count                                                         23
subscribers_count                                                    6
topics               ['cross-platform', 'dotnet', 'js', 'minificati...
language                                                            C#
created_at                                        2020-03-20T14:28:20Z
updated_at                                        2022-04-11T01:40:55Z
url                                             /excubo-ag/WebCompiler
search_word                                                   compiler
Name: 248775516, dtype: object

In [None]:
# Repository ids tagged with each simplified topic
u = repo_topic_relation.groupby("topic_simple")["id"].agg(list)

# Raw topic strings that were collapsed into each simplified topic
t = repo_topic_relation.groupby("topic_simple")["topic"].agg(list)

# (number of raw topic rows, distinct raw topics) per simplified topic,
# ascending by that count. Sort on the count only: comparing whole tuples
# would fall back to `set < set` on ties, which is a subset test rather than
# a total order and therefore leaves tied entries in an arbitrary order.
t1 = sorted(((len(i), set(i)) for i in t), key=lambda pair: pair[0])

In [None]:
# Histogram of how many repositories share a topic, restricted to the
# 100 topics with the most repositories.

# (repo count, topic, repo ids) for every simplified topic, ascending by count
l = sorted((len(r), t, r) for t, r in zip(u.index, u))
plt.hist([entry[0] for entry in l[-100:]])
plt.xlabel('Number of repos')
plt.ylabel('Number of topics')
plt.show()

In [None]:
# Build the edge set for the topic graph: two topics get an edge when the
# number of repositories they share is above a threshold.

threshold = 30  # an edge needs strictly more shared repos than this
top_n = 200  # number of topics to look at

# `l` is sorted ascending by repo count, so its tail holds the largest topics.
top_topics = l[-top_n:]
topics = sorted(((le, t) for le, t, r in top_topics), reverse=True)

# Turn every repo list into a set once instead of once per comparison.
topic_repo_sets = [(t, set(r)) for le, t, r in top_topics]

edges = set()
# combinations() yields each unordered pair exactly once, so neither a
# self-pair check nor a "reverse edge already present" check is needed.
for (topic1, repos1), (topic2, repos2) in combinations(topic_repo_sets, 2):
    shared_repos = len(repos1 & repos2)
    if shared_repos > threshold:
        edges.add((topic1, topic2, shared_repos))
print('Number of edges found:', len(edges), '. Number of topics included:',top_n)

In [None]:
# Topics with more repos than the topic ranked at top_n/10 are flagged bold.
# The index is clamped so a short `topics` list cannot raise an IndexError.
if topics:
    boldLabel_lim = topics[min(round(top_n / 10), len(topics) - 1)][0]
else:
    boldLabel_lim = 0

# Keyword arguments to nx.Graph() become graph-level attributes, not nodes,
# so the graph starts empty and nodes are added explicitly below.
G = nx.Graph()
for n_repos, topic in topics:
    node_attrs = dict(name=topic, size=n_repos * 0.03, mass=n_repos * 0.005)
    if n_repos > boldLabel_lim:
        # only the largest topics carry the `bold` attribute at all
        node_attrs['bold'] = True
    G.add_node(topic, **node_attrs)

# Edge width scales with the number of shared repositories.
for topic1, topic2, shared_repos in edges:
    G.add_edge(topic1, topic2, width=shared_repos * 0.03)

In [None]:
# Hand the networkx graph to PyVis and write it to an HTML page.
net = Network(height=800, width=800, notebook=True)
net.from_nx(G)
net.toggle_physics(True)

# Plain option dict that replaces the Network's default options wholesale.
options = {
    "edges": {
        "color": {"inherit": True},
        "font": {"size": 0},
        "smooth": False,
    },
    "interaction": {"selectConnectedEdges": True},
}
net.options = options
#net.show_buttons(filter_='edges')
net.show('example.html')

In [None]:
# Step 1: Compute node positions with networkx's random layout.
# The fixed seed keeps the picture identical across kernel restarts.
pos = nx.random_layout(G, seed=42)


# Step 2: Convert graph data from NetworkX's format to the pandas DataFrames expected by Altair

pos_df = pd.DataFrame.from_records(dict(node_id=k, x=x, y=y) for k,(x,y) in pos.items())
node_df = pd.DataFrame.from_records(dict(attrs, node_id=n) for n, attrs in G.nodes.data())
# Each edge contributes two rows (one per endpoint) that share an edge_id.
edge_data = (
    dict(attrs, edge_id=i, end=end, node_id=node)
    for i, (s, t, attrs) in enumerate(G.edges.data())
    for end, node in (('source', s), ('target', t))
)
edge_df = pd.DataFrame.from_records(edge_data)


# Step 3:  Use Altair to encode the graph data as marks in a visualization
x,y = alt.X('x:Q', axis=None), alt.Y('y:Q', axis=None)
select_repo = alt.selection_single(
     # Select the nearest node (identified by its `name` field) on mouseover
     on="mouseover", nearest=True, fields=["name"], empty="none"
 )

# use a lookup to tie position data to the other graph data
node_position_lookup = {
    'lookup': 'node_id',
    'from_': alt.LookupData(data=pos_df, key='node_id', fields=['x', 'y'])
}
# The layers get their own names so the `edges` collection used to build G
# is not overwritten by a Chart object.
# NOTE(review): the nodes added to G above carry name/size/mass/bold but no
# `rank` attribute, so the colour encoding below has nothing to map - confirm
# which field was intended.
node_layer = (
    alt.Chart(node_df)
    .mark_circle(size=300, opacity=1)
    .encode(x=x, y=y, color=alt.Color('rank:N', legend=None))
    .transform_lookup(**node_position_lookup)
    .add_selection(select_repo)
)
edge_layer = (
    alt.Chart(edge_df)
    .mark_line(color='gray')
    .encode(x=x, y=y, detail='edge_id:N')  # `detail` gives one line per edge
    .transform_lookup(**node_position_lookup)
)
chart = (
    (edge_layer + node_layer)
    .properties(width=900, height=500,)
    .configure_view(strokeWidth=0)
)
chart


In [None]:
# Display the node-position table (node_id, x, y) used by the Altair chart.
pos_df

In [None]:
# Leftovers from the Altair airport-connections gallery example this
# notebook is based on. TODO: remove this cell.

# Was originally the background image for the airports
# (it used to be assigned twice in this cell; once is enough)
states = alt.topo_feature(data.us_10m.url, feature="states")

# Original example, TODO: remove
airports = data.airports.url
flights_airport = data.flights_airport.url

# Original example, TODO: remove
select_city = alt.selection_single(
    # Decides which blob is selected by the mouse hover-over.
    # Can maybe forgo this for the static graph
    on="mouseover", nearest=True, fields=["origin"], empty="none"
)

#### Second visualization: Topics as nodes and connections by shared contributors

In [None]:
# Reload the scraped CSV exports; the first CSV column becomes the index.
# Unlike the load cell of the first visualization, no de-duplication of
# repositories or contributors is applied here.
repos = pd.read_csv(
    '../cleaned_data/repo_info_stop75.csv',
    index_col=0,
)
contributors = pd.read_csv(
    '../raw_data/10_contributor_info_all.csv',
    index_col=0,
)
repo_contrib_relation = pd.read_csv(
    '../raw_data/repo_contributor_relationship_table_all.csv',
    index_col=0,
)
repo_topic_relation = (
    pd.read_csv('../cleaned_data/topic_relationship_table_stop75.csv', index_col=0)
    .reset_index(drop=True)
)

In [None]:
# Repository ids grouped under each simplified topic
u = (
    repo_topic_relation
    .groupby("topic_simple")["id"]
    .agg(list)
)

# Contributors grouped under each repository, plus a plain dict for lookups
v = (
    repo_contrib_relation
    .groupby("Repo")['Contributor']
    .agg(list)
)
repo_contrib_dict = {repo: contribs for repo, contribs in zip(v.index, v)}

In [None]:
# Map each simplified topic to the people who contributed to any repository
# carrying that topic. Entries are
# (number of distinct contributors, topic, set of contributors).
l = []
for topic, topic_repos in zip(u.index, u):
    topic_contribs = set()
    for repo in topic_repos:
        # repositories without contributor rows simply add nothing
        topic_contribs.update(repo_contrib_dict.get(repo, ()))
    # The tuple must close right after len(...): the previous version had the
    # parentheses misplaced, which made this cell a SyntaxError.
    l.append((len(topic_contribs), topic, topic_contribs))
l.sort()

In [None]:
# Build the edge set for the topic graph: two topics get an edge when the
# number of contributors they share is above a threshold.

threshold = 300  # an edge needs strictly more shared contributors than this
top_n = 200  # number of topics to look at

# `l` is sorted ascending by contributor count, so its tail holds the
# topics with the most contributors.
top_topics = l[-top_n:]
topics = [(t, le) for le, t, r in top_topics]

# Make sure every contributor collection is a set, once, up front.
topic_contrib_sets = [(t, set(c)) for le, t, c in top_topics]

edges = set()
# combinations() yields each unordered pair exactly once, so neither a
# self-pair check nor a "reverse edge already present" check is needed.
for (topic1, contribs1), (topic2, contribs2) in combinations(topic_contrib_sets, 2):
    shared_contribs = len(contribs1 & contribs2)
    if shared_contribs > threshold:
        edges.add((topic1, topic2, shared_contribs))
print('Number of edges found:', len(edges), '. Number of topics included:',top_n)

In [None]:
# Keyword arguments to nx.Graph() become graph-level attributes, not nodes,
# so the graph starts empty and nodes are added explicitly below.
G = nx.Graph()
for topic, n_contribs in topics:
    # node size scales with the topic's number of distinct contributors
    G.add_node(topic, name=topic, size=n_contribs * 0.005)

for topic1, topic2, shared_contribs in edges:
    # edge width scales with the number of shared contributors
    G.add_edge(topic1, topic2, width=shared_contribs * 0.01)

net = Network(height=800, width=800, notebook=True)

net.from_nx(G)

net.toggle_physics(True)
# Plain option dict that replaces the Network's default options wholesale.
options = {
    "edges": {
        "color": {"inherit": True},
        "font": {"size": 0},
        "smooth": False,
    },
    "interaction": {"selectConnectedEdges": True},
}
net.options = options
#net.show_buttons()
net.show('example.html')

#### Third visualization: Repositories as nodes and connections by shared contributors, limiting data to a specific searchword

In [None]:
# Search words used when scraping; each one gets a fixed node colour.
searchwords = ['3D', 'Algorithm', 'Android', 'API', 'Arduino', 'Atom', 'aws',
       'azure', 'bash', 'bootstrap', 'chrome', 'compiler',
       'crytocurrency', 'data structures', 'database',
       'data visualization', 'deep learning', 'data science',
       'deployment', 'flask', 'front end', 'git', 'google', 'iOS', 'json',
       'library', 'machine learning', 'macOS', 'mobile', 'modeling',
       'natural language processing', 'neural network',
       'operating system', 'parsing', 'software', 'server',
       'virtual reality', 'windows']

colors = ['3FEDED','2B8FD2','4A2A25','2A3601','16D64D','08195A','9BC30E','7DB78F','FD70F9','BFB7D1','AFC63B','F7DD27','104DEF','61731A','09CEA8','46D722','53E5ED','C1C45E','6D4EA3','FF74C4','B053B5','235FDC','015DEE','7D0BDF','AD86A9','0F91A9','11D2E7','6E1EA7','6D7333','68C333','B40834','D3C6C1','37F46C','A56AC4','D33DA5','0141CB','1F4F65','374D37']

# zip() would silently drop unmatched entries, so check the pairing first.
assert len(searchwords) == len(colors), "searchwords and colors must pair up one-to-one"
# Hex colour strings need a leading '#' to be recognised as colours; the
# bare digits stored in `colors` are prefixed here.
color_searchword_dict = {word: f"#{hex_code}" for word, hex_code in zip(searchwords, colors)}

# Change searchword to limit network
searchword = 'Algorithm'
# NOTE(review): repos_limit is computed but not applied anywhere in this
# cell; the commented-out filter at the bottom shows the presumably intended use.
repos_limit = list(repos[repos.search_word==searchword].index)

#get list of contributors for each repo
v = repo_contrib_relation.groupby("Repo")['Contributor'].agg(list)
repo_contrib_dict = dict(zip(v.index,v))
# (number of contributor rows, repo, contributors), ascending by that number
l = sorted((len(r), t, r) for t, r in zip(v.index, v))
#[repo_contrib_relation.Repo.isin(repos_limit)]

In [None]:
# Build the edge list for the repository graph: two repositories get an edge
# when the number of contributors they share is above a threshold.

# keep only repositories with at least 10 contributor rows
l = [i for i in l if i[0]>9]

threshold = 2  # an edge needs strictly more shared contributors than this
top_n = len(l)  # number of repositories to look at (all that remain)

candidates = l[-top_n:]
topics = [(t, le) for le, t, r in candidates]

# Turn every contributor list into a set once instead of once per comparison.
repo_contrib_sets = [(repo, set(c)) for le, repo, c in candidates]

edges = list()
edges_short = set()
# combinations() yields each unordered pair exactly once, so neither a
# self-pair check nor a "reverse edge already present" check is needed.
for (repo1, contribs1), (repo2, contribs2) in combinations(repo_contrib_sets, 2):
    shared_contribs = list(contribs1 & contribs2)
    shared_contribs_len = len(shared_contribs)
    if shared_contribs_len > threshold:
        edges_short.add((repo1, repo2))
        edges.append((repo1, repo2, shared_contribs_len, shared_contribs))
print('Number of edges found:', len(edges), '. Number of repositories included:',top_n)

In [None]:
# Sanity check: print every repository key that is not an integer id.
# The second loop variable must not be called `l`: at module level that
# would overwrite the `l` mapping list that other cells read.
for repo_key, _count in topics:
    # numpy integer scalars count as integers too
    if not isinstance(repo_key, (int, np.integer)):
        print(repo_key)

In [None]:
#edges1 = [e for e in edges]
# Shallow copy of the current edge collection
edges2 = list(edges)

In [133]:
# Keyword arguments to nx.Graph() become graph-level attributes, not nodes,
# so the graph starts empty and nodes are added explicitly below.
G = nx.Graph()
for repo_id, _n_contribs in topics:
    if repo_id in repos.index:
        # Known repository: size grows with sqrt(stars + 1), capped at 50,
        # and the colour follows the search word it was scraped under.
        G.add_node(
            repo_id,
            name=repo_id,
            size=min(50, 0.5*int(np.sqrt(repos.loc[repo_id,'stargazers_count']+1))),
            color=color_searchword_dict[repos.loc[repo_id,'search_word']],
        )
    else:
        # Repository missing from the repo table: minimal node, no colour
        G.add_node(repo_id, name=repo_id, size=1)

for repo1, repo2, _n_shared, shared_contribs in edges:
    # Edge width follows the average follower count of the shared
    # contributors that appear in the contributor table, capped at 35.
    contrib_followers = [contributors.loc[c,'followers'] for c in shared_contribs if c in contributors.index]
    if contrib_followers:
        edge_width = min(35, 0.1*int(np.sqrt(np.average(contrib_followers))))
    else:
        edge_width = 1
    G.add_edge(repo1, repo2, width=edge_width, inherit=False)
def remove_small_components(graph, n=3):
    """Drop every connected component that has fewer than ``n`` nodes.

    The graph is modified in place and is also returned.
    """
    # Collect the undersized components before touching the graph, because
    # the graph must not change while its components are being enumerated.
    undersized = [part for part in list(nx.connected_components(graph)) if len(part) < n]
    for part in undersized:
        graph.remove_nodes_from(part)
    return graph
    
# Optional clean-up: discard connected components with fewer than 10 nodes
G = remove_small_components(G, n=10)

net = Network(height=800, width=800, notebook=True)
net.from_nx(G)
net.toggle_physics(True)

# This dict is only defined here; it is not applied, because the assignment
# to net.options below is commented out in favour of the interactive buttons.
options = {
    "nodes": {"font": {"size": 0}},
    "edges": {
        "color": {"inherit": False},
        "font": {"size": 0},
        "smooth": False,
    },
    "physics": {"barnesHut": {"damping": 1}},
    "interaction": {"selectConnectedEdges": True},
}
#net.options=options

net.show_buttons()
net.show('example.html')

NameError: name 'topics' is not defined

In [134]:
def random_color_gen():
    """Return a random colour as a ``#`` followed by six uppercase hex digits."""
    hex_digits = 'ABCDEF0123456789'
    picked = []
    for _ in range(6):
        picked.append(random.choice(hex_digits))
    return "#" + ''.join(picked)

def rc_width_func(df, x):
    """Edge width derived from the follower counts of the contributors in ``x``.

    param df: table indexed by contributor, with a 'followers' column
    param x: iterable of contributors; entries missing from ``df.index`` are ignored
    return: 0.1 * floor(sqrt(mean followers)) capped at 35, or 1 when none of
            the contributors is present in ``df``
    """
    follower_counts = []
    for contributor in x:
        if contributor in df.index:
            follower_counts.append(df.loc[contributor, 'followers'])

    if not follower_counts:
        return 1

    mean_followers = np.average(follower_counts)
    return min(35, 0.1 * int(np.sqrt(mean_followers)))


In [176]:
class social_network:
    """
    The social network class is the main object of this data visualization.

    It loads the scraped CSV files, derives three node-to-members mappings
    and turns any of them into a networkx graph that PyVis can render.
    Every method that takes ``option`` expects one of:

      'topic-repo'        - topics as nodes, edges from shared repositories
      'topic-contributor' - topics as nodes, edges from shared contributors
      'repo-contributor'  - repositories as nodes, edges from shared contributors
    """

    # network option -> name of the attribute that holds its mapping list
    _MAPPING_ATTRS = {'topic-repo': 'TR',
                      'topic-contributor': 'TC',
                      'repo-contributor': 'RC'}

    def __init__(self):

        #load all scraped datasets
        self.repos = pd.read_csv('../cleaned_data/repo_info_stop75.csv', index_col=0).drop_duplicates(subset=['name'])
        self.contributors = pd.read_csv('../raw_data/10_contributor_info_all.csv', index_col=0).drop_duplicates(subset=['login'])
        self.repo_contrib_relation = pd.read_csv('../raw_data/repo_contributor_relationship_table_all.csv', index_col=0)
        self.repo_topic_relation = pd.read_csv('../cleaned_data/topic_relationship_table_stop75.csv', index_col=0).reset_index(drop=True)

        #cache of edge lists, keyed by "<option>_<threshold>_<top_n>"
        self.edgeList_dict = dict()
        self.get_mappings()

        #one random colour per search word, used for repository nodes
        self.searchword_colors = {sw: random_color_gen() for sw in self.repos.search_word.unique()}

        #default PyVis option dictionaries, one per network type
        self.tr_options = {
                  "edges":{
                      "color":{"inherit": True},
                      "font":{"size":0},
                      "smooth":False
                  }
        }
        self.rc_options = {
                  "nodes":{
                      "font":{"size":0}
                  },
                  "edges":{
                      "color":{"inherit": False},
                      "font":{"size":0},
                      "smooth":False
                  },
                  "physics":{
                      "barnesHut": {"damping": 1}
                  }
        }
        self.tc_options = {
                  "edges":{
                      "color":{"inherit": True},
                      "font":{"size":0},
                      "smooth":False
                  }
        }

        self.options = {'topic-repo':self.tr_options,
                       'topic-contributor':self.tc_options,
                       'repo-contributor':self.rc_options}

        #edge width functions are called as f(shared_count, shared_members)
        self.width_functions = {'topic-repo':lambda x,y: x*0.03,
                               'topic-contributor':lambda x,y: x*0.01,
                               'repo-contributor':lambda x,y: rc_width_func(self.contributors, y)
                               }
        #node size functions are called as f(node, member_count).
        #Membership is tested against the index directly (a hash lookup)
        #instead of against a list copy of it rebuilt for every node.
        self.size_functions = {'topic-repo':lambda x,y: y*0.03,
                               'topic-contributor':lambda x,y: y*0.005,
                               'repo-contributor':lambda x,y: min(50,0.5*int(np.sqrt(self.repos.loc[x,'stargazers_count']+1))) \
                                                              if x in self.repos.index else 1
                               }
        #NOTE(review): the mass functions are stored and can be replaced, but
        #make_network() never applies them - confirm whether nodes should get a mass.
        self.mass_functions = {'topic-repo':lambda x,y: y*0.005,
                               'topic-contributor':lambda x,y: y*0.005,
                               'repo-contributor':lambda x,y: y*0.03
                               }
        #node colour functions are called as f(node)
        self.color_functions = {'topic-repo':lambda x: {"border": "rgba(126,80,233,1)","background": "rgba(225,213,252,1)"},
                               'topic-contributor':lambda x: {"border": "rgba(126,80,233,1)","background": "rgba(225,213,252,1)"},
                               'repo-contributor':lambda x: self.searchword_colors[self.repos.loc[x,'search_word']] \
                                                            if x in self.repos.index else {"border": "black","background": "black"}
                               }

    def change_width_function(self, option, func):
        """
        This method changes the width function for edges given the type of network option.

        param option: Specifies which key in the width_function dictionary to change
                      ('topic-repo', 'topic-contributor','repo-contributor')
        param func: A function called with two arguments, the number of shared members
                    and the collection of shared members, that returns the edge width
        """
        self.width_functions[option] = func

    def change_size_function(self, option, func):
        """
        This method changes the size function for nodes given the type of network option.

        param option: Specifies which key in the size_function dictionary to change
                      ('topic-repo', 'topic-contributor','repo-contributor')
        param func: A function called with two arguments, the node and its member count,
                    that returns the node size
        """
        self.size_functions[option] = func

    def change_mass_function(self, option, func):
        """
        This method changes the mass function for nodes given the type of network option.

        param option: Specifies which key in the mass_function dictionary to change
                      ('topic-repo', 'topic-contributor','repo-contributor')
        param func: A function taking two arguments, like the default mass functions
        """
        self.mass_functions[option] = func

    def remove_small_components(self, graph, n=3):
        """
        This method removes every connected component with fewer than n nodes.

        param graph: networkx graph, modified in place
        param n(int): minimum number of nodes a component needs to be kept
        return: the same graph object
        """
        #materialise the components first, the graph is mutated afterwards
        for component in list(nx.connected_components(graph)):
            if len(component) < n:
                graph.remove_nodes_from(component)
        return graph

    def get_mappings(self):
        """
        This method creates lists for mappings for the three type of networks.
        Every list is sorted in descending order, largest entries first.

        TR (topic-repo): Maps simplified topics to list of repositories.
                         [(# of repos, topic, list of repos)]
        TC (topic-contributor): Maps simplified topics to the set of contributors to repositories with given topic tag.
                                [(# of contributors, topic, set of contributors)]
        RC (repo-contributor): Maps repositories to list of contributors.
                               [(# of contributors, repo, list of contributors)]
        """
        #get list of repos for each simplified topic
        self.tr_list = self.repo_topic_relation.groupby("topic_simple")["id"].agg(list)
        self.TR = [(len(r), t, r) for t, r in zip(self.tr_list.index, self.tr_list)]
        self.TR.sort(reverse=True)

        #get list of contributors for each repo
        self.rc_list = self.repo_contrib_relation.groupby("Repo")['Contributor'].agg(list)
        self.repo_contrib_dict = dict(zip(self.rc_list.index, self.rc_list))
        self.RC = [(len(c), r, c) for r, c in zip(self.rc_list.index, self.rc_list)]
        self.RC.sort(reverse=True)

        #get set of contributors for each simplified topic
        self.TC = list()
        for t, r in zip(self.tr_list.index, self.tr_list):
            topic_contribs = set()
            for repo in r:
                #repos without contributor rows add nothing
                topic_contribs.update(self.repo_contrib_dict.get(repo, ()))
            self.TC.append((len(topic_contribs), t, topic_contribs))
        self.TC.sort(reverse=True)

    def _select_mapping(self, option, top_n='all'):
        """
        Return the mapping list for the network option, cut to its first top_n entries.

        raises ValueError: if option is not one of the three supported network types
        """
        if option not in self._MAPPING_ATTRS:
            raise ValueError(f"Unknown network option {option!r}; "
                             f"expected one of {sorted(self._MAPPING_ATTRS)}")
        mapping = getattr(self, self._MAPPING_ATTRS[option])
        return mapping if top_n == 'all' else mapping[:top_n]

    def get_edge_list(self, option, threshold=20, top_n='all'):
        """
        This method takes a list of mappings from self.get_mappings() and creates an edge list.
        The result is cached in self.edgeList_dict under "<option>_<threshold>_<top_n>".

        param option: Specifies which network mapping to use.
                      ('topic-repo', 'topic-contributor','repo-contributor')
        param threshold (int): an edge is added only when strictly more than this many
                               repos/contributors are shared.
        param top_n (int): (Default='all') If not 'all', takes top_n nodes(repos or topics) sorted on number of (contributors or repos).
        return: list of (node1, node2, number of shared members, set of shared members)
        raises ValueError: if option is not a supported network type
        """
        key = f"{option}_{threshold}_{top_n}"
        if key not in self.edgeList_dict:
            mapping = self._select_mapping(option, top_n)
            #build each member set once instead of once per comparison
            members = [(node, set(items)) for _, node, items in mapping]

            edges = list()
            #combinations() visits every unordered pair exactly once. Node keys
            #come from a groupby index and are therefore unique, so no
            #bookkeeping of already-seen pairs is required.
            for (node1, set1), (node2, set2) in combinations(members, 2):
                shared = set1 & set2
                if node1 != node2 and len(shared) > threshold:
                    edges.append((node1, node2, len(shared), shared))

            self.edgeList_dict[key] = edges
        return self.edgeList_dict[key]

    def make_network(self, option, threshold=20, top_n='all'):
        """
        This method uses an edgelist generated with given threshold and top_n parameters, to make a networkx Graph.
        If edge list for parameters does not exist yet, first create edge list with self.get_edge_list().
        The graph is stored in self.G.

        param option: Specifies which network mapping to use.
                      ('topic-repo', 'topic-contributor','repo-contributor')
        param threshold (int): threshold value for number of shared repos/contributors for edge to be added to edge list.
        param top_n (int): (Default='all') If not 'all', takes top_n nodes(repos or topics) sorted on number of (contributors or repos).
        raises ValueError: if option is not a supported network type
        """
        edges = self.get_edge_list(option, threshold=threshold, top_n=top_n)
        mapping = self._select_mapping(option, top_n)

        size_of = self.size_functions[option]
        color_of = self.color_functions[option]
        width_of = self.width_functions[option]

        self.G = nx.Graph()
        #every node of the mapping is added, also those that end up without edges
        for count, node, _ in mapping:
            self.G.add_node(node, name=node,
                            size=size_of(node, count),
                            color=color_of(node)
                           )

        for node1, node2, shared_len, shared in edges:
            self.G.add_edge(node1, node2, width=width_of(shared_len, shared), toggle_smoothness="curvedCCW")

    def visualize_network(self, option, show_buttons=False, html='network.html',n=0,k=0, new_options=None):
        """
        This method uses the python module PyVis to visualize graph, G, built in self.make_network().
        It works on a deep copy, so self.G itself is left untouched.

        param option: Specifies which network mapping to use.
                      ('topic-repo', 'topic-contributor','repo-contributor')
        param show_buttons: (Default=False) Displays or hides certain widgets to dynamically modify the network.
                            When True, neither new_options nor the default options are applied.
        param html: the name of the html file to save network visual to
        param n(int): Remove components with fewer than n nodes (skipped when 0)
        param k(int): Restrict the graph to its k-core via nx.k_core (skipped when 0)
        param new_options(dict): Uses these options instead of default options for PyVis visualization
        return: whatever Network.show() returns, so a notebook cell can display it
        """
        G = copy.deepcopy(self.G)

        if n > 0:
            G = self.remove_small_components(G, n=n)
        if k > 0:
            G = nx.k_core(G, k=k)

        net = Network(height=800, width=800, notebook=True)
        net.from_nx(G)
        net.toggle_physics(True)
        if show_buttons:
            net.show_buttons()
        else:
            net.options = new_options if new_options else self.options[option]
        return net.show(html)
        

In [177]:
# Create the network helper; its constructor reads the CSV files and builds the mappings.
sn = social_network()

In [None]:
# Compute and cache the edge list for the repository network.
# The commented-out calls are alternative configurations for the topic networks.
#sn.get_edge_list('topic-repo', threshold=30, top_n=200)
#sn.get_edge_list('topic-contributor', threshold=200, top_n=1000)
sn.get_edge_list('repo-contributor', threshold=3, top_n=10000)

In [None]:
# Show which edge lists are cached; keys have the form "<option>_<threshold>_<top_n>".
sn.edgeList_dict.keys()

In [None]:
# Custom PyVis options passed to visualize_network() through `new_options`.
# (A stray bare name `rc_` used to follow this dict and raised a NameError
# whenever the cell was run; it has been removed.)
opt = {"edges":{
              "color":{"inherit": True},
              "font":{"size":0},
              "smooth":False
          }
}

In [171]:
# Build the repository graph (stored in sn.G) from the matching edge list.
sn.make_network('repo-contributor', threshold=3, top_n=10000)

In [175]:
# Render sn.G to rc_network.html with components under 5 nodes removed,
# restricted to the 3-core, using the custom options in `opt`.
sn.visualize_network('repo-contributor', html = 'rc_network.html', n=5, k=3, new_options=opt)

In [None]:

# Write one HTML file per k-core level, k = 0..9 (k = 0 skips the k-core step).
for k in range(10):
    sn.visualize_network('repo-contributor', html = f'rc_network_k{k}.html', n=5, k=k)
# Examples for the other network types; the option key is spelled with a
# hyphen ('topic-repo'), matching the keys of the class's option dictionaries.
#sn.visualize_network('topic-contributor', html = 'tc_network.html')
#sn.visualize_network('topic-repo', html = 'tr_network.html', n=2)