# Graph Analysis Helper functions

In [1]:
import pandas as pd
import collections
from operator import itemgetter
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

import sqlalchemy as salc
import networkx as nx
import json

# Load the JSON configuration into `config`; the path is relative to the
# directory the notebook is run from. JSON text is UTF-8, so the encoding is
# pinned here instead of depending on the platform's default encoding.
with open("../../comm_cage.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

FileNotFoundError: [Errno 2] No such file or directory: '../../comm_cage.json'

In [None]:
def get_repos(repos, engine):

    """
    Look up the repo id and repo name for each repository git URL.
    repos: iterable of repository git URLs, matched against repo.repo_git
    engine: SQLAlchemy engine used to run the queries

    Returns (repo_set, repo_name_set): two lists holding the repo ids and the
    repo names, in the same order as `repos`.
    Raises IndexError when a URL has no matching row.
    """

    # The URL is supplied as a bound parameter (:repo_git) rather than being
    # formatted into the SQL text, so quotes in the value cannot break or
    # alter the statement.
    query_text = """
                     SET SCHEMA 'augur_data';
                     SELECT
                        b.repo_id,
                        b.repo_name
                    FROM
                        repo_groups a,
                        repo b
                    WHERE
                        a.repo_group_id = b.repo_group_id AND
                        b.repo_git = :repo_git
            """

    repo_set = []
    repo_name_set = []
    for repo_git in repos:
        repo_query = salc.sql.text(query_text).bindparams(repo_git=repo_git)
        rows = engine.execute(repo_query).mappings().all()
        if not rows:
            # Same exception type the bare `[0]` lookup used to raise, but
            # with a message that names the offending URL.
            raise IndexError(f"no repository found for repo_git {repo_git!r}")

        # Only the first matching row is used.
        first_row = rows[0]
        repo_set.append(first_row['repo_id'])
        repo_name_set.append(first_row['repo_name'])
    return repo_set, repo_name_set

In [2]:
def get_issue_contributors(repo_set, engine):

    """
    Collect the issue events with action 'closed' for the given repositories.
    repo_set: iterable of repo ids; one query is run per id
    engine: SQLAlchemy engine (or connection) handed to pandas.read_sql

    Returns a DataFrame with the columns repo_id, repo_git, repo_name,
    cntrb_id, action, issue_id and created_at and a fresh 0..n-1 index.
    An empty repo_set yields an empty DataFrame with those columns.
    """

    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'action', 'issue_id', 'created_at']

    frames = []
    for repo_id in repo_set:
        # NOTE(review): repo_id is formatted directly into the SQL text, so
        # only ids that come from a trusted source should be passed in.
        repo_query = salc.sql.text(f"""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    ie.cntrb_id,
                    ie.action,
                    i.issue_id,
                    i.created_at
                    FROM
                    repo r, issues i, issue_events ie
                     WHERE
                    i.repo_id = '{repo_id}' AND
                    i.repo_id = r.repo_id AND
                    i.issue_id = ie.issue_id AND
                    ie.action='closed'
            """)
        frames.append(pd.read_sql(repo_query, con=engine))

    # Without any query result there is nothing to concatenate; return the
    # documented schema instead of failing on the column assignment.
    if not frames:
        return pd.DataFrame(columns=columns)

    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration; ignore_index renumbers the rows from 0.
    issue_contrib = pd.concat(frames, ignore_index=True)
    issue_contrib.columns = columns
    return issue_contrib

In [3]:
def get_repos_outside(engine):

    """
    Fetch every repo name together with a flag column.
    engine: SQLAlchemy engine (or connection) handed to pandas.read_sql

    The flag is true for rows matched by the REGEXP_LIKE pattern in the query
    (three open-telemetry repository URLs) and NULL for all other rows.

    Prints the resulting DataFrame and returns it.
    """

    # NOTE(review): the pattern is made of full GitHub URLs but is matched
    # against repo_name - confirm whether repo_git was the intended column.
    repo_query = salc.sql.text("""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_name,
                    (CASE WHEN REGEXP_LIKE(repo_name, 'https://github.com/open-telemetry/opentelemetry-go|https://github.com/open-telemetry/opentelemetry-specification|https://github.com/open-telemetry/opentelemetry-collector') THEN true ELSE NULL
                    END) AS flag
                    FROM repo r
            """)
    flagged_repos = pd.read_sql(repo_query, con=engine)

    print(flagged_repos)
    # Return the query result; previously an unrelated, always-empty
    # DataFrame was returned and the result was only printed.
    return flagged_repos

In [4]:
def get_pr_contributors(repo_set, engine):

    """
    Collect the pull request metadata rows for the given repositories.
    repo_set: iterable of repo ids; one query is run per id
    engine: SQLAlchemy engine (or connection) handed to pandas.read_sql

    Returns a DataFrame with the columns repo_id, repo_git, repo_name,
    cntrb_id, pull_request_id and pr_created_at and a fresh 0..n-1 index.
    An empty repo_set yields an empty DataFrame with those columns.
    """

    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'pull_request_id', 'pr_created_at']

    frames = []
    for repo_id in repo_set:
        # NOTE(review): repo_id is formatted directly into the SQL text, so
        # only ids that come from a trusted source should be passed in.
        repo_query = salc.sql.text(f"""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    prm.cntrb_id,
                    prm.pull_request_id,
                    pr.pr_created_at
                    FROM
                    repo r, pull_request_meta prm, pull_requests pr
                    WHERE
                    prm.repo_id = '{repo_id}' AND
                    prm.repo_id = r.repo_id AND
                    prm.pull_request_id = pr.pull_request_id
            """)
        frames.append(pd.read_sql(repo_query, con=engine))

    # Without any query result there is nothing to concatenate; return the
    # documented schema instead of failing on the column assignment.
    if not frames:
        return pd.DataFrame(columns=columns)

    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration; ignore_index renumbers the rows from 0.
    pr_contrib = pd.concat(frames, ignore_index=True)
    pr_contrib.columns = columns

    return pr_contrib

In [5]:
def get_commit_contributors(repo_set, engine):

    """
    Collect the commits of the given repositories together with the
    contributor whose alias email equals the commit's committer email.
    repo_set: iterable of repo ids; one query is run per id
    engine: SQLAlchemy engine (or connection) handed to pandas.read_sql

    Returns a DataFrame with the columns repo_id, repo_git, repo_name,
    cntrb_id, cmt_id and cmt_date_attempted and a fresh 0..n-1 index.
    An empty repo_set yields an empty DataFrame with those columns.
    """

    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'cmt_id', 'cmt_date_attempted']

    frames = []
    for repo_id in repo_set:
        # NOTE(review): repo_id is formatted directly into the SQL text, so
        # only ids that come from a trusted source should be passed in.
        repo_query = salc.sql.text(f"""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    ca.cntrb_id,
                    c.cmt_id,
                    c.cmt_date_attempted
                    FROM
                    repo r, commits c, contributors_aliases ca
                    WHERE
                    c.repo_id = '{repo_id}' AND
                    c.repo_id = r.repo_id and
                    c.cmt_committer_email = ca.alias_email
            """)
        frames.append(pd.read_sql(repo_query, con=engine))

    # Without any query result there is nothing to concatenate; return the
    # documented schema instead of failing on the column assignment.
    if not frames:
        return pd.DataFrame(columns=columns)

    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration; ignore_index renumbers the rows from 0.
    commit_contrib = pd.concat(frames, ignore_index=True)
    commit_contrib.columns = columns

    return commit_contrib

In [6]:
def get_prr_contributors(repo_set, engine):

    """
    Collect the pull request reviewer rows for the given repositories.
    repo_set: iterable of repo ids; one query is run per id
    engine: SQLAlchemy engine (or connection) handed to pandas.read_sql

    Returns a DataFrame with the columns repo_id, repo_git, repo_name,
    cntrb_id and pull_request_id and a fresh 0..n-1 index.
    An empty repo_set yields an empty DataFrame with those columns.
    """

    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'pull_request_id']

    frames = []
    for repo_id in repo_set:
        # NOTE(review): repo_id is formatted directly into the SQL text, so
        # only ids that come from a trusted source should be passed in.
        repo_query = salc.sql.text(f"""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    prr.cntrb_id,
                    prr.pull_request_id
                    FROM
                    repo r, pull_request_reviewers prr
                    WHERE
                    prr.repo_id = '{repo_id}' AND
                    prr.repo_id = r.repo_id
            """)
        frames.append(pd.read_sql(repo_query, con=engine))

    # Without any query result there is nothing to concatenate; return the
    # documented schema instead of failing on the column assignment.
    if not frames:
        return pd.DataFrame(columns=columns)

    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration; ignore_index renumbers the rows from 0.
    prr_contrib = pd.concat(frames, ignore_index=True)
    prr_contrib.columns = columns

    return prr_contrib

In [7]:
def created_melted_dfs(df):

    """
    Count the rows per (repo_name, cntrb_id) pair and return the counts in
    long format.
    df: DataFrame with at least the columns 'repo_name' and 'cntrb_id'

    Returns a DataFrame with the columns repo_name, cntrb_id and number that
    holds only the pairs with a non-zero count.
    """

    # Wide table: one row per repo, one column per contributor, 0 where the
    # pair never occurs.
    counts_wide = (
        df.groupby(['repo_name', 'cntrb_id'])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )

    # Back to long format; the third column ('number') carries the count.
    counts_long = counts_wide.melt(
        id_vars=['repo_name'],
        var_name='cntrb_id',
        value_name='number',
    )
    has_contributions = counts_long.iloc[:, 2] != 0

    return counts_long[has_contributions]

In [8]:
def get_page_ranks(graph, top, graduated_repos, incubating_repos, sandbox_repos, scores):

    """
    Compute the page rank of every node in a graph and pick the highest
    ranked repositories of each group.
    graph: input networkx graph
    top: number of repos to keep per group after ranking
    graduated_repos: collection of graduated repository names
    incubating_repos: collection of incubating repository names
    sandbox_repos: collection of sandbox repository names
    scores: table with a 'repo' column; a 'page_rank' column is written to it in place

    Returns (top graduated, top incubating, top sandbox, all page ranks, scores),
    where the first three are dicts ordered from highest to lowest page rank.
    """

    pageranks = nx.pagerank(
        graph,
        alpha=0.85,
        personalization=None,
        max_iter=100,
        tol=1e-06,
        nstart=None,
        weight='weight',
        dangling=None,
    )

    scores['page_rank'] = scores['repo'].map(pageranks)

    # A node is assigned to the first group that contains it; nodes that are
    # in none of the groups are left out.
    graduated_ranks, incubating_ranks, sandbox_ranks = {}, {}, {}
    for node, rank in pageranks.items():
        if node in graduated_repos:
            graduated_ranks[node] = rank
        elif node in incubating_repos:
            incubating_ranks[node] = rank
        elif node in sandbox_repos:
            sandbox_ranks[node] = rank

    def keep_top(ranks):
        # highest score first, cut off after `top` entries
        ordered = sorted(ranks.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ordered[:top])

    return (
        keep_top(graduated_ranks),
        keep_top(incubating_ranks),
        keep_top(sandbox_ranks),
        pageranks,
        scores,
    )
    

In [9]:
def get_betweenness_centrality(graph, top, graduated_repos, incubating_repos, sandbox_repos, scores):

    """
    Compute the betweenness centrality of every node in a graph and pick the
    highest scoring repositories of each group.
    graph: input networkx graph
    top: number of repos to keep per group after ranking
    graduated_repos: collection of graduated repository names
    incubating_repos: collection of incubating repository names
    sandbox_repos: collection of sandbox repository names
    scores: table with a 'repo' column; a 'betweenness_centrality' column is written to it in place

    Returns (top graduated, top incubating, top sandbox, all centrality scores, scores),
    where the first three are dicts ordered from highest to lowest score.
    """

    # Betweenness centrality measures how often a node lies on the paths
    # between other nodes of the graph. Repositories with a higher score can
    # be thought of as more influential in connecting the other repositories
    # of the network.
    bw_centrality = nx.betweenness_centrality(graph)

    scores['betweenness_centrality'] = scores['repo'].map(bw_centrality)

    # A node is assigned to the first group that contains it; nodes that are
    # in none of the groups are left out.
    graduated_scores, incubating_scores, sandbox_scores = {}, {}, {}
    for node, value in bw_centrality.items():
        if node in graduated_repos:
            graduated_scores[node] = value
        elif node in incubating_repos:
            incubating_scores[node] = value
        elif node in sandbox_repos:
            sandbox_scores[node] = value

    def keep_top(group_scores):
        # highest score first, cut off after `top` entries
        ordered = sorted(group_scores.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ordered[:top])

    return (
        keep_top(graduated_scores),
        keep_top(incubating_scores),
        keep_top(sandbox_scores),
        bw_centrality,
        scores,
    )
    

In [10]:
def get_closeness_centrality(graph, top, graduated_repos, incubating_repos, sandbox_repos, scores):

    """
    Compute the closeness centrality of every node in a graph and pick the
    highest scoring repositories of each group.
    graph: input networkx graph
    top: number of repos to keep per group after ranking
    graduated_repos: collection of graduated repository names
    incubating_repos: collection of incubating repository names
    sandbox_repos: collection of sandbox repository names
    scores: table with a 'repo' column; a 'closeness_centrality' column is written to it in place

    Returns (top graduated, top incubating, top sandbox, all centrality scores, scores),
    where the first three are dicts ordered from highest to lowest score.
    """

    c_centrality = nx.closeness_centrality(graph)

    scores['closeness_centrality'] = scores['repo'].map(c_centrality)

    # A node is assigned to the first group that contains it; nodes that are
    # in none of the groups are left out.
    graduated_scores, incubating_scores, sandbox_scores = {}, {}, {}
    for node, value in c_centrality.items():
        if node in graduated_repos:
            graduated_scores[node] = value
        elif node in incubating_repos:
            incubating_scores[node] = value
        elif node in sandbox_repos:
            sandbox_scores[node] = value

    def keep_top(group_scores):
        # highest score first, cut off after `top` entries
        ordered = sorted(group_scores.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ordered[:top])

    return (
        keep_top(graduated_scores),
        keep_top(incubating_scores),
        keep_top(sandbox_scores),
        c_centrality,
        scores,
    )
    

In [11]:
def plot_graph(graph, graduated_repos, incubating_repos, sandbox_repos, size, title, weights=None, with_labels=True, alpha=None, edge_color='k'):

    """
    Plot a networkx graph on a new 15x15 figure, colouring the nodes by group.
    graph: the networkX graph that we want to plot
    graduated_repos: collection of graduated repository names, drawn in blue
    incubating_repos: collection of incubating repository names, drawn in green
    sandbox_repos: collection of sandbox repository names, drawn in red
    (nodes that belong to none of the three groups are drawn in yellow and
    labelled 'Contributors' in the legend)
    size: can be either 'weighted', 'equal' or 'conditional'
    When size is 'weighted', each node is sized by its own entry in weights
    (times 10000); nodes without an entry get size 0
    When size is 'equal', all nodes are the same size
    When size is 'conditional', nodes which belong to weights are larger than the rest of the nodes
    title: title of the plot
    weights: dict of node -> weight for 'weighted'; any container of nodes for 'conditional'
    with_labels, alpha, edge_color: passed through to nx.draw_networkx

    Raises ValueError for an unknown size, or when weights is missing for the
    'weighted' and 'conditional' sizes.
    """
    # Validate up front so a bad call fails with a clear message before any
    # figure is created.
    if size not in ('weighted', 'conditional', 'equal'):
        raise ValueError(f"size must be 'weighted', 'equal' or 'conditional', got {size!r}")
    if size != 'equal' and weights is None:
        raise ValueError(f"weights must be provided when size is {size!r}")

    blue_patch = mpatches.Patch(color='blue', label='Graduated Repositories')
    green_patch = mpatches.Patch(color='green', label='Incubating Repositories')
    red_patch = mpatches.Patch(color='red', label='Sandbox Repositories')
    # Must match the colour used for ungrouped nodes below.
    yellow_patch = mpatches.Patch(color='yellow', label='Contributors')

    def node_color(node):
        # the first group that contains the node decides its colour
        if node in graduated_repos:
            return 'blue'
        if node in incubating_repos:
            return 'green'
        if node in sandbox_repos:
            return 'red'
        return 'yellow'

    nodes = graph.nodes()
    colors = [node_color(n) for n in nodes]

    if size == 'weighted':
        # Look the weight up per node so the sizes line up with the node
        # order of the graph, whatever the ordering (or coverage) of weights.
        node_sizes = [weights.get(n, 0) * 10000 for n in nodes]
    elif size == 'conditional':
        node_sizes = [1000 if n in weights else 50 for n in nodes]
    else:
        node_sizes = 300

    fig, ax = plt.subplots(figsize=(15,15))

    font = {"color": "k", "fontsize": 15}

    ax.set_title(title, font)
    ax.legend(handles=[yellow_patch, blue_patch, green_patch, red_patch])

    nx.draw_networkx(graph, node_color=colors, node_size=node_sizes, font_size=9, ax=ax, with_labels=with_labels, alpha=alpha, edge_color=edge_color)

In [12]:
def project_nodes_edges_contributions(df):

    """
    Build weighted edges between project repositories from their shared
    contributors, so that the repositories can be used as graph nodes.
    df: DataFrame with the columns 'cntrb_id', 'repo_name' and 'total_contributions'

    Returns (res, pair_weights):
    res: list of (repo_a, repo_b, weight) tuples with the repo names as strings
    pair_weights: defaultdict mapping (repo_a, repo_b) to the same weight

    Only repositories that follow each other directly in a contributor's list
    (in row order of df) are paired, and (repo_a, repo_b) is a different key
    than (repo_b, repo_a).
    """

    # Step 1: contributor -> [(repo, contributions), ...] in row order.
    # Rows without contributions still register the contributor but add no
    # repo, e.g.
    # {
    #   `contributor1`: [(`repo1`, 4)],
    #   `contributor2`: [(`repo2`, 1), (`repo1`, 7)]
    # }
    repos_by_contributor = {}
    for _, record in df.iterrows():
        contributed = repos_by_contributor.setdefault(record['cntrb_id'], [])
        if record['total_contributions'] > 0:
            contributed.append((record['repo_name'], record['total_contributions']))

    # Step 2: every pair of neighbouring repos in a contributor's list is a
    # "shared connection"; its weight grows by the contributor's
    # contributions to both repos. Contributors with fewer than two repos
    # produce no pair, e.g.
    # {
    #   (`repo1`, `repo2`): 12,
    #   (`repo2`, `repo4`): 3
    # }
    pair_weights = collections.defaultdict(int)
    for contributed in repos_by_contributor.values():
        neighbours = zip(contributed, contributed[1:])
        for (first_repo, first_count), (second_repo, second_count) in neighbours:
            pair_weights[(first_repo, second_repo)] += first_count + second_count

    # Step 3: flatten to (repo_a, repo_b, weight) with stringified repo names.
    res = [
        tuple(str(repo) for repo in pair) + (weight,)
        for pair, weight in pair_weights.items()
    ]

    return res, pair_weights