In [46]:
import os
import datetime
import pytz
import pickle
import numpy as np
import pandas as pd
import networkx as nx
from dotenv import find_dotenv, load_dotenv
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from neo4j import GraphDatabase
from tqdm.notebook import tqdm

## Database Connection

In [47]:
def initialize_driver():
    """Create a Neo4j driver from the connection settings in the environment.

    A ``.env`` file located by ``find_dotenv`` is loaded first, then
    ``NEO4J_URI``, ``NEO4J_USERNAME`` and ``NEO4J_PASSWORD`` are read from the
    process environment.

    Returns
    -------
    The object returned by ``GraphDatabase.driver``, created with encryption
    switched off and a maximum connection lifetime of 3600.
    """
    load_dotenv(find_dotenv())

    # Connection settings come from the environment, never from the notebook
    credentials = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))
    options = {"encrypted": False, "max_connection_lifetime": 3600}

    return GraphDatabase.driver(os.getenv("NEO4J_URI"), auth=credentials, **options)

driver = initialize_driver()

## Observations

In [48]:
# Set periods for iteration
# The date strings below are also passed verbatim to Cypher datetime() by the
# query helpers, so keep them in ISO-8601 form (YYYY-MM-DD).
OBSERVATION_START = '2021-02-01'  # First date of observation period
DATE_START = '2021-03-01'  # Date of the first snapshot
DATE_END = '2021-10-01'  # Date of the last snapshot
PERIOD_LENGTH = relativedelta(months=1)  # Time between snapshots

def create_observations(start=None, end=None, step=None):
    """Build the chronological list of snapshot dates and print it.

    Parameters
    ----------
    start : str, optional
        Date of the first snapshot. Defaults to ``DATE_START``.
    end : str, optional
        Date of the last snapshot (inclusive). Defaults to ``DATE_END``.
    step : optional
        Offset added to a datetime to reach the next snapshot.
        Defaults to ``PERIOD_LENGTH``.

    Returns
    -------
    list of str
        Snapshot dates formatted as ``YYYY-MM-DD``. Empty when ``start`` lies
        after ``end``.

    Raises
    ------
    ValueError
        If ``step`` does not move the date forward (the loop could not end).
    """
    start = DATE_START if start is None else start
    end = DATE_END if end is None else end
    step = PERIOD_LENGTH if step is None else step

    # Compare parsed dates rather than raw strings, so the loop bound does not
    # depend on the textual format of the inputs
    current = parse(start)
    last = parse(end)

    if current + step <= current:
        raise ValueError("step must move the snapshot date forward")

    observations = []
    while current <= last:
        observations.append(current.strftime("%Y-%m-%d"))
        current = current + step

    # Numbered listing, starting at 1, for the notebook output
    for number, observation in enumerate(observations, start=1):
        print(number, observation)

    return observations

observations = create_observations()

1 2021-03-01
2 2021-04-01
3 2021-05-01
4 2021-06-01
5 2021-07-01
6 2021-08-01
7 2021-09-01
8 2021-10-01


## Sampling & Nodelists

In [49]:
# Number of developers drawn by get_developer_sample
SAMPLE_SIZE = 1000

### Developers

In [50]:
def get_packages(driver, date):
    """Fetch packages created before ``date`` whose project hosts a single package.

    Parameters
    ----------
    driver : neo4j driver
        Used to open a session on the ``main`` database.
    date : str
        Cut-off handed to Cypher ``DateTime``; only packages created strictly
        before it are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``repo_name``, ``repo_owner``, ``created`` (parsed
        to datetime) and ``observation`` (``date`` parsed to datetime and
        repeated on every row). All columns exist even when nothing matches.
    """
    columns = ['name', 'repo_name', 'repo_owner', 'created']

    with driver.session(database='main') as session:
        # The first MATCH counts packages per project over all dates; the date
        # filter is applied only after single-package projects were selected.
        query = """
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
                WITH pr, COUNT(r) AS num_pkgs
                WHERE num_pkgs = 1
                WITH pr
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr)
                WHERE pa.created < DateTime($date)
                RETURN pa.name AS name,
                       pa.repo_name AS repo_name,
                       pa.repo_owner AS repo_owner,
                       toString(pa.created) AS created
                """

        results = session.run(query, date=date).data()

    # Naming the columns keeps the schema stable for an empty result, which
    # would otherwise fail on the 'created' lookup below
    packages = pd.DataFrame(results, columns=columns)
    packages['created'] = pd.to_datetime(packages['created'])
    packages['observation'] = pd.to_datetime(date)

    return packages

In [51]:
def get_developers(driver, repository, start, end):
    """Return the distinct non-bot users active in a repository during a period.

    Activity is gathered from four sources, each with its own Cypher query:
    posted comments, issues, pull requests and commits. Users whose ``type``
    is "Bot" or whose login contains "[bot]" are excluded by the queries.

    Parameters
    ----------
    driver : neo4j driver
        Used to open a session on the ``main`` database.
    repository : str
        Repository name matched against ``r.name``.
    start, end : str
        Bounds handed to Cypher ``datetime``; ``start`` is inclusive, ``end``
        is exclusive.

    Returns
    -------
    pandas.DataFrame
        One row per distinct login in a single ``login`` column. The column
        exists even when no activity was found.
    """
    # NOTE(review): repositories are matched by name only, so equally named
    # repositories of different owners are not told apart - confirm intended.
    queries = (
        # Users who posted comments in the repository
        """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE r.name = toString($repository)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """,
        # Users linked to issues of the repository
        """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """,
        # Users linked to pull requests of the repository
        """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """,
        # Users linked to commits of the repository
        """
            MATCH (u:User)-[a]->(:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """,
    )

    records = []
    with driver.session(database='main') as session:
        for query in queries:
            result = session.run(
                query, repository=repository, start=start, end=end)
            records.extend(result.data())

    # Naming the column keeps 'login' available when every query came back
    # empty, so callers can index it unconditionally
    developers = pd.DataFrame(records, columns=['login'])
    developers.drop_duplicates(subset=['login'], inplace=True)

    return developers

In [52]:
def get_developer_sample():
    """Draw the developer sample and write the developer nodelist.

    Packages created before ``OBSERVATION_START`` are looked up, the
    developers active in their repositories between ``OBSERVATION_START`` and
    ``DATE_END`` are collected, and up to ``SAMPLE_SIZE`` distinct developers
    are drawn with a fixed random state. The node ids (``dev_<login>``) are
    written to ``../data/nodelists/developers.csv``.

    Returns
    -------
    tuple
        ``(devs_sample, nodelist_devs)``: the sampled developers as a
        DataFrame with a ``login`` column, and the list of their node ids.
    """
    packages = get_packages(driver, OBSERVATION_START)
    # dict.fromkeys drops repeated repository names while keeping their order,
    # so each repository is queried only once
    repo_names = list(dict.fromkeys(packages['repo_name'].tolist()))

    developers = []
    for repo in tqdm(repo_names):
        repo_devs = get_developers(driver, repo, OBSERVATION_START, DATE_END)
        developers.extend(repo_devs.to_dict(orient="records"))

    # Naming the column keeps 'login' available when no developer was found
    devs = pd.DataFrame(developers, columns=['login'])
    devs_unique = devs.drop_duplicates(subset=['login'])

    # sample() raises when asked for more rows than exist; take everyone then
    sample_size = min(SAMPLE_SIZE, len(devs_unique))
    devs_sample = devs_unique.sample(n=sample_size, random_state=17, ignore_index=True)

    logins_sample = devs_sample['login'].unique().tolist()
    nodelist_devs = ["_".join(["dev", name]) for name in logins_sample]

    nodelist = pd.DataFrame(nodelist_devs, columns=['id'])
    nodelist.to_csv('../data/nodelists/developers.csv', index=False)

    return devs_sample, nodelist_devs
    
sample_devs, nodelist_devs = get_developer_sample()

  0%|          | 0/8085 [00:00<?, ?it/s]

### Packages

In [53]:
def get_developer_affiliations(developer, start, end):
    """Return the distinct repositories a developer was active in during a period.

    Activity is gathered from four sources, each with its own Cypher query:
    posted comments, issues, pull requests and commits. The module-level
    ``driver`` is used to open a session on the ``main`` database.

    Parameters
    ----------
    developer : str
        Login matched against ``u.login``.
    start, end : str
        Bounds handed to Cypher ``datetime``; ``start`` is inclusive, ``end``
        is exclusive.

    Returns
    -------
    pandas.DataFrame
        One row per distinct repository name in a single ``repo_name``
        column. The column exists even when no activity was found.
    """
    queries = (
        # Repositories in which the developer posted comments
        """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE u.login = toString($developer)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """,
        # Repositories whose issues the developer is linked to
        """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """,
        # Repositories whose pull requests the developer is linked to
        """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """,
        # Repositories whose commits the developer is linked to
        """
            MATCH (u:User)-[a]->(:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """,
    )

    records = []
    with driver.session(database='main') as session:
        for query in queries:
            result = session.run(
                query, developer=developer, start=start, end=end)
            records.extend(result.data())

    # Naming the column keeps 'repo_name' available for developers without
    # any activity in the period, so callers can index it unconditionally
    repositories = pd.DataFrame(records, columns=['repo_name'])
    repositories.drop_duplicates(subset=['repo_name'], inplace=True)

    return repositories

In [54]:
def get_packages_by_repos(repos):
    """Fetch the packages developed at the given projects.

    The module-level ``driver`` is used to open a session on the ``main``
    database.

    Parameters
    ----------
    repos : iterable of str
        Values matched against ``pr.id`` of the Project nodes.
        NOTE(review): callers pass repository names here - presumably
        Project ids equal repository names; verify against the graph schema.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``repo_name``, ``repo_owner`` and ``created``.
        Unlike ``get_packages``, ``created`` is left as the string returned by
        the query. All columns exist even when nothing matches.
    """
    columns = ['name', 'repo_name', 'repo_owner', 'created']

    with driver.session(database='main') as session:
        query = """
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
                WHERE pr.id IN $repos
                RETURN pa.name AS name,
                       pa.repo_name AS repo_name,
                       pa.repo_owner AS repo_owner,
                       toString(pa.created) AS created
                """

        # list() lets callers pass any iterable as the query parameter
        results = session.run(query, repos=list(repos)).data()

    # Naming the columns keeps the schema stable for an empty result
    return pd.DataFrame(results, columns=columns)

In [55]:
def get_packages_by_dev_sample(developers):
    """Select the packages the sampled developers worked on and write the nodelist.

    Parameters
    ----------
    developers : iterable of str
        Developer logins.

    Returns
    -------
    tuple
        ``(sample_packages, nodelist_pkgs)``: the selected packages as
        returned by ``get_packages_by_repos``, and the list of their node ids
        (``pkg_<name>``). The node ids are also written to
        ``../data/nodelists/packages.csv``.
    """
    # Get repositories developers contributed to during full observation
    repositories = set()
    for dev in tqdm(developers):
        dev_repos = get_developer_affiliations(dev, OBSERVATION_START, DATE_END)
        # An empty result may come back without the column; nothing to add then
        if 'repo_name' in dev_repos.columns:
            repositories.update(dev_repos['repo_name'].tolist())

    # Keep repositories with one package behind
    packages = get_packages(driver, DATE_END)
    # Build the lookup set once rather than a fresh list per candidate
    single_package_repos = set(packages['repo_name'].tolist())
    # Sorted so the query parameter is the same on every run; key=str keeps
    # the sort from failing should a null name slip in
    selected_repos = sorted(repositories & single_package_repos, key=str)

    sample_packages = get_packages_by_repos(selected_repos)
    nodelist_pkgs = ["_".join(["pkg", name]) for name in sample_packages['name'].tolist()]

    nodelist = pd.DataFrame(nodelist_pkgs, columns=['id'])
    nodelist.to_csv('../data/nodelists/packages.csv', index=False)

    return sample_packages, nodelist_pkgs
    
sample_packages, nodelist_pkgs = get_packages_by_dev_sample(sample_devs['login'].tolist())

  0%|          | 0/1000 [00:00<?, ?it/s]

## Networks

### Dependencies

In [56]:
def get_latest_version(driver, package, date):
    """Look up the newest release of a package published before ``date``.

    Releases whose version number contains "-" are skipped by the query.

    Parameters
    ----------
    driver : neo4j driver
        Used to open a session on the ``main`` database.
    package : str
        Value matched against the ``id`` property of the Package node.
    date : str
        Cut-off handed to Cypher ``DateTime``; exclusive.

    Returns
    -------
    list of dict
        At most one record with the keys ``name``, ``version_id``,
        ``version``, ``license`` and ``version_created``.
    """
    cypher = """
                OPTIONAL MATCH (p:Package { id: $package })-[:RELEASED]->(v:Version)
                WHERE v.created < DateTime($date)
                AND NOT v.number CONTAINS "-"
                RETURN p.name AS name,
                       v.id AS version_id,
                       v.number AS version,
                       v.license AS license,
                       toString(v.created) AS version_created
                ORDER BY v.created DESC
                LIMIT 1
                """

    with driver.session(database='main') as session:
        records = session.run(cypher, package=package, date=date).data()
        return records

In [57]:
def get_dependencies(driver, version_ids):
    """Fetch the DEPENDS_ON edges of the given package versions.

    Parameters
    ----------
    driver : neo4j driver
        Used to open a session on the ``main`` database.
    version_ids : iterable
        Version ids matched against ``v.id``. Missing entries (``None`` or
        NaN) are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``source`` (the version's ``package_id``), ``target`` (the id
        of the package depended on), ``requirements`` and ``created`` (parsed
        to datetime). All columns exist even when nothing matches.
    """
    columns = ['source', 'target', 'requirements', 'created']

    # Ids coming out of a left merge hold NaN for packages without a release;
    # they cannot match any Version node, so drop them before querying
    versions = [version for version in version_ids if pd.notna(version)]

    with driver.session(database='main') as session:
        query = """
                UNWIND $versions AS version
                MATCH (v:Version { id: version })-[d:DEPENDS_ON]->(p:Package)
                RETURN v.package_id AS source,
                       p.id AS target,
                       d.requirements AS requirements,
                       toString(v.created) AS created
                """

        results = session.run(query, versions=versions).data()

    # Naming the columns keeps the schema stable for an empty result, which
    # would otherwise fail on the 'created' lookup below
    dependencies = pd.DataFrame(results, columns=columns)
    dependencies['created'] = pd.to_datetime(dependencies['created'])

    return dependencies

In [58]:
def create_dependency_networks():
    """Build and export one package dependency network per observation.

    For every observation date the packages known at that date are looked
    up, the latest release of each package is resolved, and the dependencies
    of those releases become the edges (``source -> target``) of a directed
    graph. Each graph is written as an edge list, a pickle and a GML file
    under ``../data``.

    Returns
    -------
    list of networkx.DiGraph
        One graph per entry of ``observations``, in the same order.
    """
    version_columns = ['name', 'version_id', 'version', 'license', 'version_created']
    dependency_networks = []

    with tqdm(total=len(observations)) as pbar:
        for obs in observations:
            packages = get_packages(driver, obs)

            latest_versions = []
            for package in tqdm(packages['name'].tolist(), leave=False):
                rows = get_latest_version(driver, package, obs)
                # The lookup yields no row, or a row of nulls, for a package
                # without an eligible release; neither describes a version
                if rows and rows[0].get('version_id') is not None:
                    latest_versions.append(rows[0])

            # Naming the columns keeps the merge working when no package
            # has a release yet
            versions = pd.DataFrame(latest_versions, columns=version_columns)
            versions['version_created'] = pd.to_datetime(versions['version_created'])

            packages = packages.merge(versions, how="left", on=['name'])

            # The left merge leaves NaN for packages without a release
            version_ids = packages['version_id'].dropna().tolist()
            if version_ids:
                dependencies = get_dependencies(driver, version_ids)
                # Add prefix to match nodelist
                edgelist = list(zip("pkg_" + dependencies['source'],
                                    "pkg_" + dependencies['target']))
            else:
                edgelist = []

            G = nx.DiGraph()
            G.add_nodes_from(nodelist_pkgs)
            # NOTE(review): add_edges_from also creates nodes for endpoints
            # that are not in nodelist_pkgs - confirm this is intended.
            G.add_edges_from(edgelist)

            dependency_networks.append(G)

            nx.write_edgelist(G, '../data/edgelists/dependency_network-{0}.edgelist'.format(obs), delimiter=",", data=False)
            # nx.write_gpickle was removed in NetworkX 3.0; it was a thin
            # wrapper around pickle.dump, which works on every version
            with open('../data/networks/dependency_network-{0}.pkl'.format(obs), 'wb') as handle:
                pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)
            nx.write_gml(G, '../data/networks/dependency_network-{0}.gml'.format(obs))

            pbar.update()

    return dependency_networks

dependency_networks = create_dependency_networks()

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8099 [00:00<?, ?it/s]

  0%|          | 0/8108 [00:00<?, ?it/s]

  0%|          | 0/8123 [00:00<?, ?it/s]

  0%|          | 0/8134 [00:00<?, ?it/s]

  0%|          | 0/8141 [00:00<?, ?it/s]

  0%|          | 0/8155 [00:00<?, ?it/s]

  0%|          | 0/8171 [00:00<?, ?it/s]

  0%|          | 0/8180 [00:00<?, ?it/s]

### Affiliations

In [59]:
def create_affiliation_networks():
    """Build and export one developer-package affiliation network per observation.

    For every observation the repositories each sampled developer was active
    in are collected over the window ending at that observation: the first
    window starts at ``OBSERVATION_START``, every later window at the
    previous observation. Repositories are mapped to package names, and
    edges ``dev_<login> -> pkg_<name>`` are kept only when both ends are in
    the nodelists. Each graph is written as an edge list, a pickle and a GML
    file under ``../data``.

    Returns
    -------
    list of networkx.DiGraph
        One graph per entry of ``observations``, in the same order.
        Developer nodes carry ``bipartite=0``, package nodes ``bipartite=1``.
    """
    affiliation_networks = []
    logins = sample_devs['login'].tolist()
    package_names = sample_packages[["repo_name", "name"]]

    with tqdm(total=len(observations)) as pbar:
        for i, obs in enumerate(observations):
            window_start = OBSERVATION_START if i == 0 else observations[i - 1]

            affiliations = []
            for dev in logins:
                dev_affiliations = get_developer_affiliations(dev, window_start, obs)
                dev_affiliations['login'] = dev
                affiliations.extend(dev_affiliations.to_dict(orient="records"))

            # Naming the columns keeps the merge working for a window in
            # which no sampled developer was active
            affiliations = pd.DataFrame(affiliations, columns=['repo_name', 'login'])
            # Add package names to affiliation data
            affiliations = affiliations.merge(package_names, how="left", on="repo_name")

            affiliations['source'] = "dev_" + affiliations['login']
            affiliations['target'] = "pkg_" + affiliations['name']

            # Keep edges between devs and packages that are in nodelists
            in_nodelists = (affiliations['source'].isin(nodelist_devs)
                            & affiliations['target'].isin(nodelist_pkgs))
            affiliations = affiliations[in_nodelists]

            edgelist = list(zip(affiliations['source'], affiliations['target']))

            G = nx.DiGraph()
            G.add_nodes_from(nodelist_devs, bipartite=0)
            G.add_nodes_from(nodelist_pkgs, bipartite=1)
            G.add_edges_from(edgelist)

            affiliation_networks.append(G)

            nx.write_edgelist(G, '../data/edgelists/affiliation_network-{0}.edgelist'.format(obs), delimiter=",", data=False)
            # nx.write_gpickle was removed in NetworkX 3.0; it was a thin
            # wrapper around pickle.dump, which works on every version
            with open('../data/networks/affiliation_network-{0}.pkl'.format(obs), 'wb') as handle:
                pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)
            nx.write_gml(G, '../data/networks/affiliation_network-{0}.gml'.format(obs))

            pbar.update()

    return affiliation_networks

affiliation_networks = create_affiliation_networks()

  0%|          | 0/8 [00:00<?, ?it/s]

## Dependent Network

In [60]:
# Adjacency matrices for SIENA
def create_adjacency_matrix():
    """Write one developer-by-package biadjacency matrix per observation.

    Rows follow ``nodelist_devs`` and columns follow ``nodelist_pkgs``. The
    matrix of each affiliation network is converted to a dense array and
    saved to ``../data/adjacency/net-<observation>.txt``.
    """
    total = len(observations)
    with tqdm(total=total) as progress:
        for position in range(total):
            stamp = observations[position]
            sparse = nx.bipartite.biadjacency_matrix(
                affiliation_networks[position],
                row_order=nodelist_devs,
                column_order=nodelist_pkgs,
            )
            np.savetxt('../data/adjacency/net-{0}.txt'.format(stamp), sparse.toarray(), fmt='%s')
            progress.update()
        progress.close()

create_adjacency_matrix()

  0%|          | 0/8 [00:00<?, ?it/s]

## Composition Changes

### Packages

In [61]:
def create_package_composition():
    """Write the composition-change file for the sampled packages.

    Each package gets one row ``[appearance, number of observations]``.
    ``appearance`` is the 1-based number of the earliest observation whose
    date is greater than the package's ``created`` value; it stays 0 when no
    observation date is greater. Rows are saved to
    ``../data/compositions/pkgs_comp.txt``.
    """
    n_obs = len(observations)

    roster = sample_packages[['name', 'created']].copy()
    roster['name'] = "pkg_" + roster['name']
    roster['appearance'] = 0

    # Walk from the latest observation to the earliest, so the earliest
    # qualifying observation is the one written last
    ordered = sorted(observations)
    for wave in range(n_obs, 0, -1):
        cutoff = ordered[wave - 1]
        roster.loc[roster['created'] < cutoff, 'appearance'] = wave

    intervals = np.column_stack(
        (roster['appearance'].to_numpy(dtype=int), np.full(len(roster), n_obs)))
    np.savetxt('../data/compositions/pkgs_comp.txt', intervals, fmt='%d')

create_package_composition()

### Developers

In [62]:
def create_developer_composition():
    """Write the composition-change file for the sampled developers.

    Each developer gets one row ``[appearance, number of observations]``.
    ``appearance`` starts at 1. A developer without outgoing edges in the
    first affiliation network is moved to 2, and is moved one step further
    for every following observation in which the outdegree is still zero.
    Rows are saved to ``../data/compositions/devs_comp.txt``.
    """
    # NOTE(review): a developer without outgoing edges in every network ends
    # at len(observations) + 1, which is above the end value written next to
    # it - confirm the consumer of this file accepts that.
    n_obs = len(observations)

    roster = sample_devs.copy()
    roster['name'] = "dev_" + roster['login']
    roster['appearance'] = 1

    # inactive_by_wave[k]: developers with zero outdegree in every network
    # from the first up to and including network k
    inactive_by_wave = []
    for wave in range(n_obs):
        degrees = dict(affiliation_networks[wave].out_degree(nodelist_devs))
        idle = [dev for dev, degree in degrees.items() if degree == 0]
        if wave > 0:
            carried = set(inactive_by_wave[wave - 1])
            idle = [dev for dev in idle if dev in carried]
        inactive_by_wave.append(idle)

    for appearance, idle in enumerate(inactive_by_wave, start=2):
        roster.loc[roster['name'].isin(idle), 'appearance'] = appearance

    intervals = np.column_stack(
        (roster['appearance'].to_numpy(dtype=int), np.full(len(roster), n_obs)))
    np.savetxt('../data/compositions/devs_comp.txt', intervals, fmt='%d')

create_developer_composition()

## Behavioral Variables

In [63]:
# TODO: Define and think about what makes sense

## Individual Variables

### Developers

In [64]:
# TODO

### Packages

#### Dependencies

In [39]:
def create_package_dependencies():
    """Write the per-observation in- and out-degree of every sampled package.

    Two tables are produced, each with one row per package of
    ``sample_packages`` and one column per observation:

    * ``../data/individual/pkg_upstream.txt`` holds the out-degrees,
    * ``../data/individual/pkg_downstream.txt`` holds the in-degrees,

    both taken from the matching entry of ``dependency_networks``. A package
    that is missing from a network is written as ``NA``.
    """
    def format_degree(value):
        # Spell out the missing marker instead of relying on how pandas
        # renders missing values, which differs between pandas versions
        return 'NA' if pd.isna(value) else str(int(value))

    node_ids = ("pkg_" + sample_packages['name']).tolist()

    upstream = []    # per observation: out-degree of every package
    downstream = []  # per observation: in-degree of every package

    with tqdm(total=len(observations)) as pbar:
        for i in range(len(observations)):
            network = dependency_networks[i]
            in_degree = dict(network.in_degree())
            out_degree = dict(network.out_degree())

            # dict.get yields None for packages absent from this network
            downstream.append([format_degree(in_degree.get(node)) for node in node_ids])
            upstream.append([format_degree(out_degree.get(node)) for node in node_ids])

            pbar.update()

    # Transpose so that rows are packages and columns are observations
    upstream_table = np.array(upstream, dtype=object).T
    downstream_table = np.array(downstream, dtype=object).T

    np.savetxt('../data/individual/pkg_upstream.txt', upstream_table, fmt='%s')
    np.savetxt('../data/individual/pkg_downstream.txt', downstream_table, fmt='%s')
    
create_package_dependencies()

  0%|          | 0/5 [00:00<?, ?it/s]

## Dyadic Variables