# Graph Analysis Helper Functions

In [1]:
import pandas as pd

import sqlalchemy as salc
import json

# Load the JSON configuration from ../../comm_cage.json (a relative path, so it
# is resolved against the current working directory) into the module-level
# ``config`` object. The file handle is closed when the ``with`` block exits.
with open("../../comm_cage.json") as config_file:
    config = json.load(config_file)

In [2]:
def get_repos(repos, engine):
    """Look up the repo id and repo name for each repository git URL.

    Args:
        repos: Iterable of ``repo_git`` values to look up in the ``repo``
            table of the ``augur_data`` schema.
        engine: SQLAlchemy engine (anything exposing a compatible
            ``execute`` method) connected to the database.

    Returns:
        A ``(repo_set, repo_name_set)`` tuple of two lists holding the
        ``repo_id`` and ``repo_name`` found for each entry of ``repos``,
        in input order.

    Raises:
        IndexError: If a ``repo_git`` value matches no row.
    """
    # The URL is sent as a bound parameter instead of being formatted into
    # the SQL text, so quotes or other special characters in it cannot alter
    # the statement. The statement itself is therefore built only once.
    repo_query = salc.sql.text("""
                     SET SCHEMA 'augur_data';
                     SELECT
                        b.repo_id,
                        b.repo_name
                    FROM
                        repo_groups a,
                        repo b
                    WHERE
                        a.repo_group_id = b.repo_group_id AND
                        b.repo_git = :repo_git
            """)

    repo_set = []
    repo_name_set = []
    for repo_git in repos:
        result = engine.execute(repo_query, {"repo_git": repo_git})
        # Only the first matching row is used; any further matches are ignored.
        row = result.mappings().first()
        if row is None:
            # Same exception type as indexing an empty result list, but with
            # a message that says which URL was not found.
            raise IndexError(f"no repo found for repo_git {repo_git!r}")
        repo_set.append(row['repo_id'])
        repo_name_set.append(row['repo_name'])
    return repo_set, repo_name_set

In [3]:
def get_issue_contributors(repo_set, engine):
    """Fetch the 'closed' issue events for every repo in ``repo_set``.

    Args:
        repo_set: Iterable of repo ids to query, one query per id.
        engine: SQLAlchemy connectable passed to ``pandas.read_sql``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``repo_id``,
        ``repo_git``, ``repo_name``, ``cntrb_id``, ``action``, ``issue_id``
        and ``created_at``. It has one row per ``issue_events`` record with
        action ``'closed'`` on an issue of the requested repos. An empty
        ``repo_set`` yields an empty DataFrame with those columns.
    """
    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'action', 'issue_id', 'created_at']

    # The repo id is a bound parameter rather than text formatted into the
    # statement, so the query is built once and the value cannot alter it.
    repo_query = salc.sql.text("""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    ie.cntrb_id,
                    ie.action,
                    i.issue_id,
                    i.created_at
                    FROM
                    repo r, issues i, issue_events ie
                     WHERE
                    i.repo_id = :repo_id AND
                    i.repo_id = r.repo_id AND
                    i.issue_id = ie.issue_id AND
                    ie.action='closed'
            """)

    # str() keeps the value in the same textual form the query compared
    # against before, and lets non-builtin integer types be bound.
    frames = [
        pd.read_sql(repo_query, con=engine, params={"repo_id": str(repo_id)})
        for repo_id in repo_set
    ]

    if not frames:
        # Nothing was queried: return the expected (empty) schema.
        return pd.DataFrame(columns=columns)

    # One concat at the end avoids re-copying the accumulated rows per repo.
    issue_contrib = pd.concat(frames, ignore_index=True)
    issue_contrib.columns = columns
    return issue_contrib

In [4]:
def get_pr_contributors(repo_set, engine):
    """Fetch pull request metadata rows for every repo in ``repo_set``.

    Args:
        repo_set: Iterable of repo ids to query, one query per id.
        engine: SQLAlchemy connectable passed to ``pandas.read_sql``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``repo_id``,
        ``repo_git``, ``repo_name``, ``cntrb_id``, ``pull_request_id`` and
        ``pr_created_at``. Rows come from ``pull_request_meta`` joined to
        ``pull_requests`` and ``repo``. An empty ``repo_set`` yields an empty
        DataFrame with those columns.
    """
    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'pull_request_id', 'pr_created_at']

    # The repo id is a bound parameter rather than text formatted into the
    # statement, so the query is built once and the value cannot alter it.
    repo_query = salc.sql.text("""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    prm.cntrb_id,
                    prm.pull_request_id,
                    pr.pr_created_at
                    FROM
                    repo r, pull_request_meta prm, pull_requests pr
                    WHERE
                    prm.repo_id = :repo_id AND
                    prm.repo_id = r.repo_id AND
                    prm.pull_request_id = pr.pull_request_id
            """)

    # str() keeps the value in the same textual form the query compared
    # against before, and lets non-builtin integer types be bound.
    frames = [
        pd.read_sql(repo_query, con=engine, params={"repo_id": str(repo_id)})
        for repo_id in repo_set
    ]

    if not frames:
        # Nothing was queried: return the expected (empty) schema.
        return pd.DataFrame(columns=columns)

    # One concat at the end avoids re-copying the accumulated rows per repo.
    pr_contrib = pd.concat(frames, ignore_index=True)
    pr_contrib.columns = columns
    return pr_contrib

In [5]:
def get_commit_contributors(repo_set, engine):
    """Fetch commits, matched to contributor aliases, for every repo in ``repo_set``.

    Args:
        repo_set: Iterable of repo ids to query, one query per id.
        engine: SQLAlchemy connectable passed to ``pandas.read_sql``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``repo_id``,
        ``repo_git``, ``repo_name``, ``cntrb_id``, ``cmt_id`` and
        ``cmt_date_attempted``. Rows come from ``commits`` joined to
        ``contributors_aliases`` on the committer email, so commits whose
        committer email has no alias entry do not appear. An empty
        ``repo_set`` yields an empty DataFrame with those columns.
    """
    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'cmt_id', 'cmt_date_attempted']

    # The repo id is a bound parameter rather than text formatted into the
    # statement, so the query is built once and the value cannot alter it.
    repo_query = salc.sql.text("""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    ca.cntrb_id,
                    c.cmt_id,
                    c.cmt_date_attempted
                    FROM
                    repo r, commits c, contributors_aliases ca
                    WHERE
                    c.repo_id = :repo_id AND
                    c.repo_id = r.repo_id and
                    c.cmt_committer_email = ca.alias_email
            """)

    # str() keeps the value in the same textual form the query compared
    # against before, and lets non-builtin integer types be bound.
    frames = [
        pd.read_sql(repo_query, con=engine, params={"repo_id": str(repo_id)})
        for repo_id in repo_set
    ]

    if not frames:
        # Nothing was queried: return the expected (empty) schema.
        return pd.DataFrame(columns=columns)

    # One concat at the end avoids re-copying the accumulated rows per repo.
    commit_contrib = pd.concat(frames, ignore_index=True)
    commit_contrib.columns = columns
    return commit_contrib

In [6]:
def get_prr_contributors(repo_set, engine):
    """Fetch pull request reviewer rows for every repo in ``repo_set``.

    Args:
        repo_set: Iterable of repo ids to query, one query per id.
        engine: SQLAlchemy connectable passed to ``pandas.read_sql``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``repo_id``,
        ``repo_git``, ``repo_name``, ``cntrb_id`` and ``pull_request_id``.
        Rows come from ``pull_request_reviewers`` joined to ``repo``. An
        empty ``repo_set`` yields an empty DataFrame with those columns.
    """
    columns = ['repo_id', 'repo_git', 'repo_name', 'cntrb_id', 'pull_request_id']

    # The repo id is a bound parameter rather than text formatted into the
    # statement, so the query is built once and the value cannot alter it.
    repo_query = salc.sql.text("""
                    SET SCHEMA 'augur_data';
                    SELECT r.repo_id,
                    r.repo_git,
                    r.repo_name,
                    prr.cntrb_id,
                    prr.pull_request_id
                    FROM
                    repo r, pull_request_reviewers prr
                    WHERE
                    prr.repo_id = :repo_id AND
                    prr.repo_id = r.repo_id
            """)

    # str() keeps the value in the same textual form the query compared
    # against before, and lets non-builtin integer types be bound.
    frames = [
        pd.read_sql(repo_query, con=engine, params={"repo_id": str(repo_id)})
        for repo_id in repo_set
    ]

    if not frames:
        # Nothing was queried: return the expected (empty) schema.
        return pd.DataFrame(columns=columns)

    # One concat at the end avoids re-copying the accumulated rows per repo.
    prr_contrib = pd.concat(frames, ignore_index=True)
    prr_contrib.columns = columns
    return prr_contrib

In [7]:
def created_melted_dfs(df):
    """Count rows per (repo_name, cntrb_id) pair and return the non-zero counts.

    Args:
        df: DataFrame containing at least the columns ``repo_name`` and
            ``cntrb_id``.

    Returns:
        A long-form DataFrame with the columns ``repo_name``, ``cntrb_id``
        and ``number``, where ``number`` is how many rows of ``df`` share
        that pair. Pairs that never occur (count 0) are removed; the index
        keeps the row positions of the melted frame and is not renumbered.
    """
    # Wide table: one row per repo, one column per contributor, 0 where the
    # pair does not occur.
    counts_wide = (
        df.groupby(['repo_name', 'cntrb_id'])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )

    # Back to long form: one row per (repo, contributor) combination.
    counts_long = counts_wide.melt(
        id_vars=['repo_name'], var_name='cntrb_id', value_name='number'
    )

    # The third column is the count; keep only combinations that occurred.
    occurred = counts_long.iloc[:, 2] != 0
    return counts_long[occurred]