In [1]:
import os
import datetime
import pytz
import pickle
import numpy as np
import pandas as pd
import networkx as nx
from dotenv import find_dotenv, load_dotenv
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from neo4j import GraphDatabase
from tqdm.notebook import tqdm

## Configuration

In [2]:
# Number of packages drawn when sampling active packages (used by get_package_sample).
SAMPLE_SIZE = 250
# Absolute path of the parent of the current working directory.
WORK_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# "data" directory below WORK_DIR.
# NOTE(review): the cells visible here write to hard-coded '../data/...' paths and
# never reference OUTPUT -- confirm whether it is used elsewhere.
OUTPUT = os.path.join(WORK_DIR, "data")

## Database Connection

In [3]:
def initialize_driver():
    """Create a Neo4j driver from connection settings found in the environment.

    Loads the ``.env`` file located by ``find_dotenv`` and then reads
    ``NEO4J_URI``, ``NEO4J_USERNAME`` and ``NEO4J_PASSWORD`` from the
    environment.

    Returns:
        Whatever ``GraphDatabase.driver`` returns for those settings, created
        with ``encrypted=False`` and ``max_connection_lifetime=3600``.
    """
    load_dotenv(find_dotenv())

    # Connection settings come exclusively from environment variables
    credentials = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))
    connection_options = {
        "auth": credentials,
        "encrypted": False,
        "max_connection_lifetime": 3600,
    }

    return GraphDatabase.driver(os.getenv("NEO4J_URI"), **connection_options)

# Notebook-wide driver; several query helpers below read this global instead of taking a driver argument.
driver = initialize_driver()

## Observations

In [4]:
# Set periods for iteration
OBSERVATION_START = '2021-01-01'  # First date of observation period
DATE_START = '2021-02-01'  # Date of the first snapshot
DATE_END = '2021-05-01'  # Date of the last snapshot
PERIOD_LENGTH = relativedelta(months=1)  # Time between snapshots

def create_observations():
    """Build the list of snapshot dates between DATE_START and DATE_END.

    Starting at ``DATE_START``, dates are advanced by ``PERIOD_LENGTH`` until
    they pass ``DATE_END`` (which is included if it is hit exactly). Each
    date is printed with its 1-based position.

    Returns:
        list[str]: Snapshot dates formatted as ``YYYY-MM-DD``.
    """
    observations = []
    period = DATE_START

    # 'YYYY-MM-DD' strings sort in the same order as the dates they encode,
    # so a plain string comparison is enough here.
    while period <= DATE_END:
        observations.append(period)
        period = (parse(period) + PERIOD_LENGTH).strftime("%Y-%m-%d")

    for number, observation in enumerate(observations, start=1):
        print(number, observation)

    return observations

# Snapshot dates ('YYYY-MM-DD' strings) read as a global by the sampling and network cells below.
observations = create_observations()

1 2021-02-01
2 2021-03-01
3 2021-04-01
4 2021-05-01


## Sampling & Nodelists

### Packages

In [5]:
def get_packages(driver, date):
    """Fetch packages created before ``date`` whose project develops exactly one package.

    Args:
        driver: Object exposing ``session(database=...)`` as a context manager
            whose ``run(query, **params).data()`` yields a list of dicts
            (the Neo4j driver in this notebook).
        date: Cut-off passed to Cypher ``DateTime($date)``; also stored,
            parsed with ``pd.to_datetime``, in the ``observation`` column.

    Returns:
        pandas.DataFrame: Columns ``name``, ``repo_name``, ``repo_owner``,
        ``created`` (parsed to datetimes) and ``observation``. When the query
        returns no rows, an empty frame with these columns is returned
        instead of raising ``KeyError``.
    """
    columns = ['name', 'repo_name', 'repo_owner', 'created']

    query = """
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
                WITH pr, COUNT(r) AS num_pkgs
                WHERE num_pkgs = 1
                WITH pr
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr)
                WHERE pa.created < DateTime($date)
                RETURN pa.name AS name,
                       pa.repo_name AS repo_name,
                       pa.repo_owner AS repo_owner,
                       toString(pa.created) AS created
                """

    with driver.session(database='main') as session:
        results = session.run(query, date=date).data()

    # Naming the columns explicitly keeps the frame well-formed for an empty result
    packages = pd.DataFrame(results, columns=columns)
    packages['created'] = pd.to_datetime(packages['created'])
    packages['observation'] = pd.to_datetime(date)

    return packages

In [6]:
def get_repository_activity(repository, start, end):
    """Count activity recorded for one repository in the window ``[start, end)``.

    Four counts are added together: comments (filtered on the comment's
    ``created``), and user relationships to issues, pull requests and commits
    (filtered on the relationship's ``created``). Uses the notebook-level
    ``driver``.

    Args:
        repository: Repository name matched against ``r.name``.
        start: Inclusive lower bound passed to Cypher ``datetime($start)``.
        end: Exclusive upper bound passed to Cypher ``datetime($end)``.

    Returns:
        The sum of the four counts.
    """
    comment_count_query = """
            MATCH (c:Comment)-[*2]->(r:Repository)
            WHERE r.name = toString($repository)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            RETURN count(c)
            """

    issue_count_query = """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(a)
            """

    pullreq_count_query = """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(a)
            """

    commit_count_query = """
            MATCH (u:User)-[a]->(c:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(c)
            """

    count_queries = (
        comment_count_query,
        issue_count_query,
        pullreq_count_query,
        commit_count_query,
    )
    params = {"repository": repository, "start": start, "end": end}

    # All four queries share one session and the same parameters
    with driver.session(database='main') as session:
        counts = [session.run(query, **params).single().value() for query in count_queries]

    return counts[0] + counts[1] + counts[2] + counts[3]

In [7]:
def get_package_sample():
    """Sample packages that show activity in every observation period.

    Packages are fetched with ``get_packages`` as of ``OBSERVATION_START``.
    For each package and period an ``act_<date>`` column receives the count
    from ``get_repository_activity``. Packages with a non-zero count in all
    periods are eligible; ``SAMPLE_SIZE`` of them are drawn with a fixed
    random state. The prefixed node ids are written to
    ``../data/nodelists/packages.csv``.

    Returns:
        tuple: ``(sample_pkgs, nodelist_pkgs)`` -- the sampled DataFrame and
        the list of ``pkg_``-prefixed package names.
    """
    packages = get_packages(driver, OBSERVATION_START)

    # A period starts at the previous snapshot; the first one starts at OBSERVATION_START
    period_starts = [OBSERVATION_START] + list(observations[:-1])
    act_columns = ["_".join(["act", obs]) for obs in observations]

    for index, row in tqdm(packages.iterrows(), total=len(packages.index)):
        for column, period_start, period_end in zip(act_columns, period_starts, observations):
            activity = get_repository_activity(row['repo_name'], period_start, period_end)
            packages.loc[index, column] = int(activity)

    # Keep only packages whose activity count is non-zero in every period
    is_active = packages[act_columns].all(axis="columns")
    active_packages = packages[is_active].copy()

    sample_pkgs = active_packages.sample(n=SAMPLE_SIZE, random_state=17, ignore_index=True)
    nodelist_pkgs = ["_".join(["pkg", pkg_name]) for pkg_name in sample_pkgs['name'].tolist()]

    pd.DataFrame(nodelist_pkgs, columns=['id']).to_csv('../data/nodelists/packages.csv', index=False)

    return sample_pkgs, nodelist_pkgs
    
# Slow cell: get_repository_activity issues four count queries per package and observation period.
sample_pkgs, nodelist_pkgs = get_package_sample()

  0%|          | 0/8070 [00:00<?, ?it/s]

In [8]:
def get_latest_version(driver, package, date):
    """Return the most recently created version of a package before ``date``.

    Versions whose number contains ``-`` are ignored (presumably pre-release
    tags -- confirm). The query uses ``OPTIONAL MATCH`` and ``LIMIT 1``.

    NOTE: this notebook defines ``get_latest_version`` twice; keep both
    definitions in sync or remove one.

    Args:
        driver: Neo4j driver used to open a session on the ``main`` database.
        package: Value matched against ``Package.id``.
        date: Exclusive upper bound passed to Cypher ``DateTime($date)``.

    Returns:
        list[dict]: Result rows with keys ``name``, ``version_id``,
        ``version``, ``license`` and ``version_created``.
    """
    latest_version_query = """
                OPTIONAL MATCH (p:Package { id: $package })-[:RELEASED]->(v:Version)
                WHERE v.created < DateTime($date)
                AND NOT v.number CONTAINS "-"
                RETURN p.name AS name,
                       v.id AS version_id,
                       v.number AS version,
                       v.license AS license,
                       toString(v.created) AS version_created
                ORDER BY v.created DESC
                LIMIT 1
                """

    with driver.session(database='main') as session:
        result = session.run(latest_version_query, package=package, date=date)
        return result.data()

In [9]:
def get_dependencies(driver, version_ids):
    """Fetch the DEPENDS_ON edges that leave the given versions.

    NOTE: this notebook defines ``get_dependencies`` twice; keep both
    definitions in sync or remove one.

    Args:
        driver: Neo4j driver used to open a session on the ``main`` database.
        version_ids: List of values matched against ``Version.id``.

    Returns:
        pandas.DataFrame: Columns ``source`` (the version's ``package_id``),
        ``target`` (the depended-on package's ``id``), ``requirements`` and
        ``created`` (parsed to datetimes). When no dependency is found, an
        empty frame with these columns is returned instead of raising
        ``KeyError``.
    """
    columns = ['source', 'target', 'requirements', 'created']

    query = """
                UNWIND $versions AS version
                MATCH (v:Version { id: version })-[d:DEPENDS_ON]->(p:Package)
                RETURN v.package_id AS source,
                       p.id AS target,
                       d.requirements AS requirements,
                       toString(v.created) AS created
                """

    with driver.session(database='main') as session:
        results = session.run(query, versions=version_ids).data()

    # Naming the columns explicitly keeps the frame well-formed for an empty result
    dependencies = pd.DataFrame(results, columns=columns)
    dependencies['created'] = pd.to_datetime(dependencies['created'])

    return dependencies

In [10]:
def get_package_sample_with_dependencies():
    """Sample active packages and extend the sample with their dependency partners.

    The first half mirrors ``get_package_sample``: packages with a non-zero
    activity count in every observation period are eligible and
    ``SAMPLE_SIZE`` of them are drawn. The latest version of every sampled
    package is then looked up at each observation, the dependencies of those
    versions are fetched, and the final sample consists of all packages whose
    name appears as source or target of such a dependency. The prefixed node
    ids are written to ``../data/nodelists/packages.csv``.

    Returns:
        tuple: ``(sample_pkgs, nodelist_pkgs)`` -- the extended sample
        DataFrame and the list of ``pkg_``-prefixed package names.
    """
    packages = get_packages(driver, OBSERVATION_START)

    # A period starts at the previous snapshot; the first one starts at OBSERVATION_START
    period_starts = [OBSERVATION_START] + list(observations[:-1])
    act_columns = ["_".join(["act", obs]) for obs in observations]

    # Get activity for each observation period
    for index, row in tqdm(packages.iterrows(), total=len(packages.index)):
        for column, period_start, period_end in zip(act_columns, period_starts, observations):
            activity = get_repository_activity(row['repo_name'], period_start, period_end)
            packages.loc[index, column] = int(activity)

    # Filter active packages
    active_packages = packages[packages[act_columns].all(axis="columns")].copy()

    sample_pkgs = active_packages.sample(n=SAMPLE_SIZE, random_state=17, ignore_index=True)

    # Get dependencies of sample
    latest_versions = []
    for obs in observations:
        for pkg in tqdm(sample_pkgs['name'].tolist(), leave=False):
            latest_version = get_latest_version(driver, pkg, obs)
            # An empty result list has no first row. Indexing it raises
            # IndexError, which the former `except KeyError` never caught.
            if latest_version:
                latest_versions.append(latest_version[0])

    versions = pd.DataFrame.from_records(latest_versions)
    versions['version_created'] = pd.to_datetime(versions['version_created'])

    sample_pkgs = sample_pkgs.merge(versions, how="left", on=['name'])
    dependencies = get_dependencies(driver, sample_pkgs['version_id'].tolist())
    deps_sources = dependencies['source'].unique().tolist()
    deps_targets = dependencies['target'].unique().tolist()
    nodelist = list(set(deps_sources + deps_targets))

    # NOTE(review): dependency ids are compared with package names here --
    # this assumes Package.id equals Package.name; confirm against the schema.
    sample_pkgs = packages[packages['name'].isin(nodelist)].copy()

    nodelist_pkgs = ["_".join(["pkg", name]) for name in sample_pkgs['name'].tolist()]

    nodelist = pd.DataFrame(nodelist_pkgs, columns=['id'])
    nodelist.to_csv('../data/nodelists/packages.csv', index=False)

    return sample_pkgs, nodelist_pkgs

# NOTE Uncomment to include dependencies of packages in sample
# sample_pkgs, nodelist_pkgs = get_package_sample_with_dependencies()

### Developers

In [11]:
def get_developers(driver, repository, start, end):
    """List the non-bot users active in a repository during ``[start, end)``.

    Four queries collect distinct logins: users who posted comments, and
    users with a relationship to an issue, pull request or commit of the
    repository. Users whose ``type`` is ``"Bot"`` or whose login contains
    ``"[bot]"`` are excluded by the queries.

    Args:
        driver: Neo4j driver used to open a session on the ``main`` database.
        repository: Repository name matched against ``r.name``.
        start: Inclusive lower bound passed to Cypher ``datetime($start)``.
        end: Exclusive upper bound passed to Cypher ``datetime($end)``.

    Returns:
        pandas.DataFrame: One row per distinct ``login`` across the four
        result sets.
    """
    comment_authors_query = """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE r.name = toString($repository)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """

    issue_users_query = """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """

    pullreq_users_query = """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """

    commit_users_query = """
            MATCH (u:User)-[a]->(:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """

    login_queries = (
        comment_authors_query,
        issue_users_query,
        pullreq_users_query,
        commit_users_query,
    )
    params = {"repository": repository, "start": start, "end": end}

    with driver.session(database='main') as session:
        result_sets = [session.run(query, **params).data() for query in login_queries]

    # Stack the four result sets, then keep the first occurrence of each login
    frames = [pd.DataFrame.from_dict(rows) for rows in result_sets]
    stacked = pd.concat(frames, axis=0, ignore_index=True)
    developers = stacked.drop_duplicates(subset=['login'])

    return developers

In [12]:
def get_developer_activity_in_repositories(developer, repositories, start, end):
    """Count one developer's activity inside a set of repositories during ``[start, end)``.

    Adds up four counts restricted to repositories whose name is in
    ``repositories``: posted comments (filtered on the comment's
    ``created``), and relationships to issues, pull requests and commits
    (filtered on the relationship's ``created``). Uses the notebook-level
    ``driver``.

    Args:
        developer: Login matched against ``u.login``.
        repositories: List of repository names passed as ``$repositories``.
        start: Inclusive lower bound passed to Cypher ``datetime($start)``.
        end: Exclusive upper bound passed to Cypher ``datetime($end)``.

    Returns:
        The sum of the four counts.
    """
    comment_count_query = """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE u.login = toString($developer)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(c)
            """

    issue_count_query = """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(a)
            """

    pullreq_count_query = """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(a)
            """

    commit_count_query = """
            MATCH (u:User)-[a]->(c:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(c)
            """

    count_queries = (
        comment_count_query,
        issue_count_query,
        pullreq_count_query,
        commit_count_query,
    )
    params = {
        "developer": developer,
        "repositories": repositories,
        "start": start,
        "end": end,
    }

    # All four queries share one session and the same parameters
    with driver.session(database='main') as session:
        counts = [session.run(query, **params).single().value() for query in count_queries]

    return counts[0] + counts[1] + counts[2] + counts[3]

In [13]:
def get_developer_sample(min_activity=5):
    """Collect the developers of the sampled packages and keep the sufficiently active ones.

    Developers are gathered per sampled repository with ``get_developers``
    over the window ``OBSERVATION_START``..``DATE_END`` and de-duplicated by
    login. Each developer's activity inside the sampled repositories is then
    counted with ``get_developer_activity_in_repositories``; developers below
    ``min_activity`` are dropped. The prefixed node ids are written to
    ``../data/nodelists/developers.csv``.

    Args:
        min_activity: Minimum activity count a developer needs to stay in
            the sample. Defaults to 5, the value previously hard-coded.

    Returns:
        tuple: ``(devs_sample, nodelist_devs)`` -- the filtered DataFrame
        (with an ``activity`` column) and the list of ``dev_``-prefixed logins.
    """
    # Computed once and reused for both loops instead of being rebuilt per developer
    repo_names = sample_pkgs['repo_name'].tolist()

    developers = []
    for repo in tqdm(repo_names):
        repo_devs = get_developers(driver, repo, OBSERVATION_START, DATE_END)
        developers.extend(repo_devs.to_dict(orient="records"))

    devs = pd.DataFrame.from_records(developers)
    devs_sample = devs.drop_duplicates(subset=['login']).copy()

    # Get developer activities to filter later
    for index, row in tqdm(devs_sample.iterrows(), total=len(devs_sample.index)):
        activity = get_developer_activity_in_repositories(
            row['login'], repo_names, OBSERVATION_START, DATE_END)
        devs_sample.loc[index, "activity"] = int(activity)

    # Remove developers with fewer than `min_activity` participation activities
    devs_sample = devs_sample[devs_sample['activity'] >= min_activity].copy()

    logins_sample = devs_sample['login'].unique().tolist()
    nodelist_devs = ["_".join(["dev", name]) for name in logins_sample]

    nodelist = pd.DataFrame(nodelist_devs, columns=['id'])
    nodelist.to_csv('../data/nodelists/developers.csv', index=False)

    return devs_sample, nodelist_devs

# Depends on the sample_pkgs global produced by the package sampling cell above.
sample_devs, nodelist_devs = get_developer_sample()

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/9568 [00:00<?, ?it/s]

## Networks

### Dependencies

In [14]:
def get_latest_version(driver, package, date):
    """Return the most recently created version of a package before ``date``.

    Versions whose number contains ``-`` are ignored (presumably pre-release
    tags -- confirm). The query uses ``OPTIONAL MATCH`` and ``LIMIT 1``.

    NOTE: this notebook defines ``get_latest_version`` twice; keep both
    definitions in sync or remove one.

    Args:
        driver: Neo4j driver used to open a session on the ``main`` database.
        package: Value matched against ``Package.id``.
        date: Exclusive upper bound passed to Cypher ``DateTime($date)``.

    Returns:
        list[dict]: Result rows with keys ``name``, ``version_id``,
        ``version``, ``license`` and ``version_created``.
    """
    latest_version_query = """
                OPTIONAL MATCH (p:Package { id: $package })-[:RELEASED]->(v:Version)
                WHERE v.created < DateTime($date)
                AND NOT v.number CONTAINS "-"
                RETURN p.name AS name,
                       v.id AS version_id,
                       v.number AS version,
                       v.license AS license,
                       toString(v.created) AS version_created
                ORDER BY v.created DESC
                LIMIT 1
                """

    with driver.session(database='main') as session:
        result = session.run(latest_version_query, package=package, date=date)
        return result.data()

In [15]:
def get_dependencies(driver, version_ids):
    """Fetch the DEPENDS_ON edges that leave the given versions.

    NOTE: this notebook defines ``get_dependencies`` twice; keep both
    definitions in sync or remove one.

    Args:
        driver: Neo4j driver used to open a session on the ``main`` database.
        version_ids: List of values matched against ``Version.id``.

    Returns:
        pandas.DataFrame: Columns ``source`` (the version's ``package_id``),
        ``target`` (the depended-on package's ``id``), ``requirements`` and
        ``created`` (parsed to datetimes). When no dependency is found, an
        empty frame with these columns is returned instead of raising
        ``KeyError``.
    """
    columns = ['source', 'target', 'requirements', 'created']

    query = """
                UNWIND $versions AS version
                MATCH (v:Version { id: version })-[d:DEPENDS_ON]->(p:Package)
                RETURN v.package_id AS source,
                       p.id AS target,
                       d.requirements AS requirements,
                       toString(v.created) AS created
                """

    with driver.session(database='main') as session:
        results = session.run(query, versions=version_ids).data()

    # Naming the columns explicitly keeps the frame well-formed for an empty result
    dependencies = pd.DataFrame(results, columns=columns)
    dependencies['created'] = pd.to_datetime(dependencies['created'])

    return dependencies

In [16]:
def create_dependency_networks():
    """Build and persist one directed dependency network per observation.

    For every observation date the packages existing at that date are
    fetched, their latest versions are looked up, and the dependencies of
    those versions become edges between ``pkg_``-prefixed node ids. All
    sampled packages (``nodelist_pkgs``) are added as nodes, so they are
    present even without edges. Each network is written as an edge list,
    a pickle and a GML file below ``../data``.

    Returns:
        list: One ``networkx.DiGraph`` per observation, in observation order.
    """
    dependency_networks = []

    with tqdm(total=len(observations)) as pbar:
        for obs in observations:
            packages = get_packages(driver, obs)

            latest_versions = []
            for package in tqdm(packages['name'].tolist(), leave=False):
                latest_version = get_latest_version(driver, package, obs)
                # An empty result list has no first row. Indexing it raises
                # IndexError, which the former `except KeyError` never caught.
                if latest_version:
                    latest_versions.append(latest_version[0])

            versions = pd.DataFrame.from_records(latest_versions)
            versions['version_created'] = pd.to_datetime(versions['version_created'])

            packages = packages.merge(versions, how="left", on=['name'])
            dependencies = get_dependencies(driver, packages['version_id'].tolist())

            # Add prefix to match nodelist
            dependencies['source'] = "pkg_" + dependencies['source']
            dependencies['target'] = "pkg_" + dependencies['target']

            edgelist = list(zip(dependencies['source'], dependencies['target']))

            G = nx.DiGraph()
            G.add_nodes_from(nodelist_pkgs)
            G.add_edges_from(edgelist)

            dependency_networks.append(G)

            nx.write_edgelist(G, '../data/edgelists/dependency_network-{0}.edgelist'.format(obs), delimiter=",", data=False)
            # nx.write_gpickle was removed in NetworkX 3.0; pickling directly
            # produces the same file and works on both 2.x and 3.x.
            with open('../data/networks/dependency_network-{0}.pkl'.format(obs), 'wb') as handle:
                pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)
            nx.write_gml(G, '../data/networks/dependency_network-{0}.gml'.format(obs))

            pbar.update()

    return dependency_networks

# One DiGraph per observation; the networks are also written to ../data/edgelists and ../data/networks.
dependency_networks = create_dependency_networks()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/8085 [00:00<?, ?it/s]

  0%|          | 0/8099 [00:00<?, ?it/s]

  0%|          | 0/8108 [00:00<?, ?it/s]

  0%|          | 0/8123 [00:00<?, ?it/s]

### Affiliations

In [17]:
def get_developer_affiliations(developer, start, end):
    """List the repositories a developer was active in during ``[start, end)``.

    Four queries collect distinct repository names: repositories reached
    from the developer's comments, and repositories of issues, pull requests
    and commits the developer has a relationship to. Uses the notebook-level
    ``driver``.

    Args:
        developer: Login matched against ``u.login``.
        start: Inclusive lower bound passed to Cypher ``datetime($start)``.
        end: Exclusive upper bound passed to Cypher ``datetime($end)``.

    Returns:
        pandas.DataFrame: One row per distinct ``repo_name`` across the four
        result sets.
    """
    comment_repos_query = """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE u.login = toString($developer)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """

    issue_repos_query = """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """

    pullreq_repos_query = """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """

    commit_repos_query = """
            MATCH (u:User)-[a]->(:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """

    repo_queries = (
        comment_repos_query,
        issue_repos_query,
        pullreq_repos_query,
        commit_repos_query,
    )
    params = {"developer": developer, "start": start, "end": end}

    with driver.session(database='main') as session:
        result_sets = [session.run(query, **params).data() for query in repo_queries]

    # Stack the four result sets, then keep the first occurrence of each repository
    frames = [pd.DataFrame.from_dict(rows) for rows in result_sets]
    stacked = pd.concat(frames, axis=0, ignore_index=True)
    repositories = stacked.drop_duplicates(subset=['repo_name'])

    return repositories

In [18]:
def create_affiliation_networks():
    """Build and persist one developer-to-package affiliation network per observation.

    For each observation period (previous snapshot, or ``OBSERVATION_START``
    for the first period, up to the current snapshot) the repositories every
    sampled developer was active in are fetched and mapped to package names
    through ``sample_pkgs``. Only edges whose endpoints are both in the
    developer and package node lists are kept. Each network is written as an
    edge list, a pickle and a GML file below ``../data``.

    Returns:
        list: One ``networkx.DiGraph`` per observation, in observation order.
        Developer nodes carry ``bipartite=0``, package nodes ``bipartite=1``.
    """
    affiliation_networks = []

    # A period starts at the previous snapshot; the first one starts at OBSERVATION_START
    period_starts = [OBSERVATION_START] + list(observations[:-1])
    developers = sample_devs['login'].tolist()

    with tqdm(total=len(observations)) as pbar:
        for period_start, obs in zip(period_starts, observations):
            affiliations = []
            for dev in tqdm(developers, leave=False):
                dev_affiliations = get_developer_affiliations(dev, period_start, obs)
                dev_affiliations['login'] = dev
                affiliations.extend(dev_affiliations.to_dict(orient="records"))

            # Explicit columns keep the merge below working when a period has no records at all
            affiliations = pd.DataFrame(affiliations, columns=['repo_name', 'login'])
            # Add package names to affiliation data
            affiliations = affiliations.merge(sample_pkgs[["repo_name", "name"]], how="left", on="repo_name")

            affiliations['source'] = "dev_" + affiliations['login']
            affiliations['target'] = "pkg_" + affiliations['name']

            # Keep edges between devs and packages that are in nodelists
            affiliations = affiliations[affiliations['source'].isin(nodelist_devs)]
            affiliations = affiliations[affiliations['target'].isin(nodelist_pkgs)]

            edgelist = list(zip(affiliations['source'], affiliations['target']))

            G = nx.DiGraph()
            G.add_nodes_from(nodelist_devs, bipartite=0)
            G.add_nodes_from(nodelist_pkgs, bipartite=1)
            G.add_edges_from(edgelist)

            affiliation_networks.append(G)

            nx.write_edgelist(G, '../data/edgelists/affiliation_network-{0}.edgelist'.format(obs), delimiter=",", data=False)
            # nx.write_gpickle was removed in NetworkX 3.0; pickling directly
            # produces the same file and works on both 2.x and 3.x.
            with open('../data/networks/affiliation_network-{0}.pkl'.format(obs), 'wb') as handle:
                pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)
            nx.write_gml(G, '../data/networks/affiliation_network-{0}.gml'.format(obs))

            pbar.update()

    return affiliation_networks

# One DiGraph per observation; read as a global by the composition-change and adjacency cells below.
affiliation_networks = create_affiliation_networks()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

## Composition Change

In [19]:
def create_composition_change():
    """Derive the period in which each sampled developer first appears.

    A developer's ``appearance`` starts at 1. For every observation, the
    developers with zero out-degree in that period's affiliation network who
    were also listed for the previous period form that period's "not yet
    active" list; members of the list for period ``j`` (0-based) get
    ``appearance = j + 2``. One ``[appearance, len(observations)]`` row per
    developer is written to ``../data/compositions/comp_change.txt``.

    Returns:
        pandas.DataFrame: Copy of ``sample_devs`` with added ``name``
        (``dev_``-prefixed login) and ``appearance`` columns.
    """
    composition = sample_devs.copy()
    composition['name'] = "dev_" + composition['login']
    composition['appearance'] = 1

    inactive_developers = []
    for period_index, _ in enumerate(observations):
        degrees = dict(affiliation_networks[period_index].out_degree(nodelist_devs))
        without_ties = [name for name, out_degree in degrees.items() if out_degree == 0]

        if period_index > 0:
            # Only developers who were already inactive in the previous period stay listed
            previously_inactive = inactive_developers[period_index - 1]
            without_ties = [dev for dev in without_ties if dev in previously_inactive]

        inactive_developers.append(without_ties)

    for period_index, developers in enumerate(inactive_developers):
        not_yet_active = composition['name'].isin(developers)
        composition.loc[not_yet_active, 'appearance'] = int(period_index + 2)

    last_period = len(observations)
    arr = [[int(row['appearance']), last_period] for _, row in composition.iterrows()]

    comp_arr = np.array(arr)
    np.savetxt('../data/compositions/comp_change.txt', comp_arr, fmt='%d')

    return composition

# composition = create_composition_change()

In [20]:
def create_composition_change_leavers():
    """Derive, per sampled developer, the period joined and the last active period.

    ``joined`` starts at 1 and every ``active_<k>`` flag starts as True. For
    every observation, the developers with zero out-degree in that period's
    affiliation network who were also listed for the previous period form
    that period's "not yet active" list; members of the list for period
    ``j`` (0-based) get ``joined = j + 2`` and ``active_<j+1> = False``.

    The second output column is the last period whose ``active_<k>`` flag is
    True. It is now computed afresh for every row: previously the value
    leaked from the preceding row (or raised ``UnboundLocalError`` on the
    first row) when a developer had no active period. Such developers get
    ``len(observations)``, the same end value ``create_composition_change``
    writes for everyone.

    One ``[joined, left]`` row per developer is written to
    ``../data/compositions/comp_change.txt``.

    Returns:
        pandas.DataFrame: Copy of ``sample_devs`` with added ``name``,
        ``joined`` and ``active_<k>`` columns.
    """
    n_periods = len(observations)
    active_columns = ["_".join(['active', str(k + 1)]) for k in range(n_periods)]

    composition = sample_devs.copy()
    composition['name'] = "dev_" + composition['login']
    composition['joined'] = 1

    for column in active_columns:
        composition[column] = True

    inactive_developers = []
    for i in range(n_periods):
        # Identify developers without outdegree in this period
        degrees = dict(affiliation_networks[i].out_degree(nodelist_devs))
        without_ties = [name for name, out_degree in degrees.items() if out_degree == 0]

        if i > 0:
            # Only developers who were already inactive in the previous period stay listed
            previously_inactive = set(inactive_developers[i - 1])
            without_ties = [dev for dev in without_ties if dev in previously_inactive]

        inactive_developers.append(without_ties)

    for j, developers in enumerate(inactive_developers):
        not_yet_active = composition['name'].isin(developers)
        composition.loc[not_yet_active, 'joined'] = int(j + 2)
        composition.loc[not_yet_active, active_columns[j]] = False

    arr = []
    for _, row in composition.iterrows():
        active_periods = [k + 1 for k, column in enumerate(active_columns) if row[column]]
        # Fall back to the final period when the developer is never flagged active
        left = active_periods[-1] if active_periods else n_periods
        arr.append([int(row['joined']), left])

    comp_arr = np.array(arr)
    np.savetxt('../data/compositions/comp_change.txt', comp_arr, fmt='%d')

    return composition

# Writes ../data/compositions/comp_change.txt; the 'joined' column is read by create_adjacency_matrix_bipartite().
composition = create_composition_change_leavers()

## Dependent Networks

In [21]:
# Adjacency list for SIENA (Two-Mode Network)
def create_adjacency_matrix_bipartite():
    with tqdm(total=len(observations)) as pbar:

        appearances = [tuple(x) for x in composition[['login', 'joined']].to_numpy()]

        for i, obs in enumerate(observations):
            adj = nx.bipartite.biadjacency_matrix(affiliation_networks[i], row_order=nodelist_devs, column_order=nodelist_pkgs)
            am = adj.toarray()
            am = am.astype(str)
            
            # Change rows for developers not joined yet to NA
            for j, row in enumerate(am):
                if appearances[j][1] > int(i+1):
                    am[j] = 'NA'

            np.savetxt('../data/adjacency/net-{0}.txt'.format(obs), am, fmt='%s')
            pbar.update()
        pbar.close()

# Writes one ../data/adjacency/net-<observation>.txt file per observation.
create_adjacency_matrix_bipartite()

  0%|          | 0/4 [00:00<?, ?it/s]

In [22]:
# Adjacency list for SIENA (One-Mode Network)
def create_adjacency_matrix():
    with tqdm(total=len(observations)) as pbar:
        for i, obs in enumerate(observations):
            G = dependency_networks[i].subgraph(nodelist_pkgs)
            adj = nx.to_pandas_adjacency(G, nodelist=nodelist_pkgs)
            adj = adj.astype('Int8')
            adj = adj.astype(str)
            am = adj.to_numpy()
            np.savetxt('../data/adjacency/dnet-{0}.txt'.format(obs), am, fmt='%s')
            pbar.update()
        pbar.close()

# Writes one ../data/adjacency/dnet-<observation>.txt file per observation.
create_adjacency_matrix()

  0%|          | 0/4 [00:00<?, ?it/s]

## Individual Variables

### Developers

#### Activity

In [23]:
def get_developer_activity(developer, start, end):
    """Count one developer's activity across all repositories during ``[start, end)``.

    Adds up four counts: posted comments (filtered on the comment's
    ``created``), and relationships to issues, pull requests and commits
    (filtered on the relationship's ``created``). Unlike
    ``get_developer_activity_in_repositories`` there is no repository
    filter. Uses the notebook-level ``driver``.

    Args:
        developer: Login matched against ``u.login``.
        start: Inclusive lower bound passed to Cypher ``datetime($start)``.
        end: Exclusive upper bound passed to Cypher ``datetime($end)``.

    Returns:
        The sum of the four counts.
    """
    comment_count_query = """
            MATCH (u:User)-[p:POSTED]->(c:Comment)
            WHERE u.login = toString($developer)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            RETURN count(c)
            """

    issue_count_query = """
            MATCH (u:User)-[a]->(:Issue)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(a)
            """

    pullreq_count_query = """
            MATCH (u:User)-[a]->(:PullRequest)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(a)
            """

    commit_count_query = """
            MATCH (u:User)-[a]->(c:Commit)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(c)
            """

    count_queries = (
        comment_count_query,
        issue_count_query,
        pullreq_count_query,
        commit_count_query,
    )
    params = {"developer": developer, "start": start, "end": end}

    # All four queries share one session and the same parameters
    with driver.session(database='main') as session:
        counts = [session.run(query, **params).single().value() for query in count_queries]

    return counts[0] + counts[1] + counts[2] + counts[3]

In [24]:
def create_developer_activity():
    """Write each sampled developer's activity count per observation period.

    For every period (previous snapshot, or ``OBSERVATION_START`` for the
    first period, up to the current snapshot) the count from
    ``get_developer_activity`` is stored in a column named after the
    observation date. The counts are cast to ``Int64`` and then to strings,
    ``'<NA>'`` is replaced by ``'NA'``, and the result is saved as text to
    ``../data/individual/dev_activity.txt``.
    """
    activities = sample_devs.copy()

    # A period starts at the previous snapshot; the first one starts at OBSERVATION_START
    period_starts = [OBSERVATION_START] + list(observations[:-1])

    with tqdm(total=len(observations)) as pbar:
        for period_start, obs in zip(period_starts, observations):
            developer_rows = tqdm(activities.iterrows(), total=len(activities.index), leave=False)
            for index, row in developer_rows:
                activity = get_developer_activity(row['login'], period_start, obs)
                activities.loc[index, obs] = int(activity)

            pbar.update()
        pbar.close()

    # Nullable integers first, so the counts are rendered without a decimal point
    activities[observations] = activities[observations].astype("Int64").astype(str)
    activities = activities.replace(to_replace='<NA>', value='NA')
    np.savetxt('../data/individual/dev_activity.txt', activities[observations].values, fmt='%s')

# Writes ../data/individual/dev_activity.txt with one column per observation.
create_developer_activity()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

  0%|          | 0/1172 [00:00<?, ?it/s]

### Packages

#### Dependencies

In [25]:
def create_package_dependencies():
    """Export the in- and out-degree of every sampled package per observation.

    Package names from ``sample_pkgs`` are prefixed with ``pkg_`` and
    left-joined, on ``name``, with the degrees read from
    ``dependency_networks[i]`` for each observation ``i``. Packages absent from
    a network keep a missing value, which is written as ``NA``.

    Out-degrees go to ``../data/individual/pkg_upstream.txt`` and in-degrees to
    ``../data/individual/pkg_downstream.txt``, one column per observation.
    """
    dependencies = sample_pkgs[['name', 'repo_name']].copy()
    dependencies['name'] = "pkg_" + dependencies['name']

    columns_in = ["in_" + obs for obs in observations]
    columns_out = ["out_" + obs for obs in observations]

    def degree_frame(degrees, column):
        # One row per network node: node label under "name", degree under `column`.
        frame = pd.DataFrame.from_dict(dict(degrees), orient="index", columns=[column])
        return frame.reset_index().rename(columns={"index": "name"})

    with tqdm(total=len(observations)) as pbar:
        for i, _ in enumerate(observations):
            network = dependency_networks[i]
            per_direction = (
                (columns_in[i], network.in_degree()),
                (columns_out[i], network.out_degree()),
            )
            for column, degrees in per_direction:
                dependencies = dependencies.merge(
                    degree_frame(degrees, column), how="left", on="name")

            pbar.update()

    # Nullable integers keep unmatched packages as "<NA>" once stringified.
    for columns in (columns_in, columns_out):
        dependencies[columns] = dependencies[columns].astype("Int64").astype(str)

    dependencies.replace(to_replace='<NA>', value='NA', inplace=True)

    np.savetxt('../data/individual/pkg_upstream.txt', dependencies[columns_out].values, fmt='%s')
    np.savetxt('../data/individual/pkg_downstream.txt', dependencies[columns_in].values, fmt='%s')
    
create_package_dependencies()

  0%|          | 0/4 [00:00<?, ?it/s]

#### Age

In [26]:
def create_package_ages():
    """Export the age, in months, of every sampled package at each observation.

    The age is the difference between the observation date (interpreted as
    UTC) and the package's ``created`` timestamp, divided by the length of an
    average month. One column per observation is written to
    ``../data/individual/pkg_age.txt``; packages without a usable ``created``
    value are written as ``NA``.
    """
    # An average Gregorian month (30.436875 days). This is the value older
    # pandas releases substituted for ``np.timedelta64(1, 'M')``; pandas 2.x
    # rejects that ambiguous unit, so the duration is spelled out explicitly.
    average_month = pd.Timedelta(seconds=2629746)

    ages = sample_pkgs[['name', 'created']].copy()
    ages['name'] = "pkg_" + ages['name']

    for obs in tqdm(observations):
        ages[obs] = (pd.to_datetime(obs, utc=True) - ages['created']) / average_month

    # Floats stringify missing values as "nan" (not "<NA>"), so that is the
    # token that has to be mapped to the "NA" marker used by the other exports.
    age_text = ages[observations].astype(float).astype(str)
    age_text = age_text.replace(to_replace='nan', value='NA')
    np.savetxt('../data/individual/pkg_age.txt', age_text.values, fmt='%s')

create_package_ages()

  0%|          | 0/4 [00:00<?, ?it/s]

#### Release Activity

In [27]:
def get_releases(package, start, end):
    """Return how many versions of ``package`` were created in ``[start, end)``.

    Runs a parameterised Cypher query against the ``main`` database through
    the module-level ``driver``. ``start`` and ``end`` are passed to Cypher's
    ``datetime()``; the lower bound is inclusive, the upper bound exclusive.
    """
    parameters = {"package": package, "start": start, "end": end}

    with driver.session(database='main') as session:
        query = """
                MATCH (p:Package)-[:RELEASED]->(v:Version)
                WHERE p.name = toString($package)
                AND datetime($start) <= v.created < datetime($end)
                RETURN COUNT(v)
                """

        # The aggregate always yields exactly one row holding the count.
        record = session.run(query, parameters).single()
        return record.value()

In [28]:
def create_release_activity():
    """Export the number of releases of every sampled package per observation window.

    For each entry of ``observations`` the window starts at the previous
    observation date (``OBSERVATION_START`` for the first entry) and ends at
    the observation date; ``get_releases`` supplies the count. One column per
    observation is written to ``../data/individual/pkg_releases.txt``, with
    missing values spelled ``NA``.
    """
    releases = sample_pkgs[['name']].copy()
    n_packages = len(releases.index)

    with tqdm(total=len(observations)) as pbar:
        for i, obs in enumerate(observations):
            start = OBSERVATION_START if i == 0 else observations[i - 1]

            for index, row in tqdm(releases.iterrows(), total=n_packages, leave=False):
                releases.loc[index, obs] = int(get_releases(row['name'], start, obs))

            pbar.update()

    # Int64 rather than Int8: an 8-bit integer cannot represent more than 127
    # releases in a window, and Int64 matches the developer-activity export.
    release_text = releases[observations].astype("Int64").astype(str)
    release_text = release_text.replace(to_replace='<NA>', value='NA')
    np.savetxt('../data/individual/pkg_releases.txt', release_text.values, fmt='%s')

create_release_activity()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

#### Community Interest

In [29]:
def get_added_watchers(repo, start, end):
    """Return the number of distinct users who started watching ``repo`` in ``(start, end]``.

    Runs a parameterised Cypher query against the ``main`` database through
    the module-level ``driver``. The distinct users are counted inside the
    database, so only a single integer is transferred instead of one full
    ``User`` node per watcher.
    """
    with driver.session(database='main') as session:
        # NOTE(review): this window excludes `start` and includes `end`, while
        # get_releases() uses the opposite convention ([start, end)). Left
        # untouched here; confirm which boundary handling is intended.
        query = """
                MATCH (r:Repository)<-[w:WATCHES]-(u:User)
                WHERE r.name = toString($repo)
                AND datetime($start) < w.created <= datetime($end)
                RETURN count(DISTINCT u)
                """

        # An aggregate without grouping keys always yields exactly one row
        # (0 when nothing matches), so .single() is safe here.
        return session.run(query, repo=repo, start=start, end=end).single().value()

In [30]:
def create_community_interest():
    """Export the number of newly added watchers of every sampled package's repository.

    For each entry of ``observations`` the window starts at the previous
    observation date (``OBSERVATION_START`` for the first entry) and ends at
    the observation date; ``get_added_watchers`` supplies the count for the
    package's ``repo_name``. One column per observation is written, as floats,
    to ``../data/individual/pkg_community.txt``; missing values are spelled
    ``NA``.
    """
    stars = sample_pkgs[['name', 'repo_name']].copy()
    n_packages = len(stars.index)

    with tqdm(total=len(observations)) as pbar:
        for i, obs in enumerate(observations):
            start = OBSERVATION_START if i == 0 else observations[i - 1]

            for index, row in tqdm(stars.iterrows(), total=n_packages, leave=False):
                stars.loc[index, obs] = int(get_added_watchers(row['repo_name'], start, obs))

            pbar.update()

    # The float formatting of the export is kept as is. Floats stringify a
    # missing value as "nan" (never "<NA>"), so "nan" is the token to map to
    # the "NA" marker; only the exported columns are touched.
    star_text = stars[observations].astype(float).astype(str)
    star_text = star_text.replace(to_replace='nan', value='NA')

    np.savetxt('../data/individual/pkg_community.txt', star_text.values, fmt='%s')
    
create_community_interest()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

#### Licenses

In [100]:
def get_package_license(name):
    """Return the ``license`` property, converted with ``toString``, of a package.

    The package is looked up by its ``id`` property in the ``main`` database
    through the module-level ``driver``.
    """
    with driver.session(database="main") as session:
        query = """
            MATCH (p:Package)
            WHERE p.id = toString($name)
            RETURN toString(p.license)
            """
        # NOTE(review): .single() yields None when no package matches, in which
        # case .value() raises AttributeError — presumably every sampled
        # package exists under this id; confirm.
        record = session.run(query, {"name": name}).single()
        return record.value()

In [101]:
def create_licenses():
    """Classify the license of every sampled package and export the category.

    The license is taken from the package itself (``get_package_license``);
    when that is missing, the license of the first entry returned by
    ``get_latest_version`` for the last observation date is used instead.

    Categories, decided by substring match on the license text (when several
    groups match, the most restrictive one wins):

    * 1 - unrestrictive (e.g. MIT, BSD, Apache)
    * 2 - restrictive (e.g. LGPL, MPL)
    * 3 - highly restrictive (GPL, Artistic)

    Packages without a license, or with one that matches no group, are written
    as ``NA``. The result goes to ``../data/individual/pkg_license.txt``.
    """
    licenses = sample_pkgs[['name']].copy()

    resolved = []
    for _, row in tqdm(licenses.iterrows(), total=len(licenses.index)):
        pkg_license = get_package_license(row['name'])
        # A missing license may arrive as a real null (None) or as the literal
        # string "None"; both must trigger the fallback to the latest version.
        if pkg_license is None or pkg_license == "None":
            latest_version = get_latest_version(driver, row['name'], observations[-1])
            pkg_license = latest_version[0]['license'] if latest_version else None
        resolved.append(pkg_license)
    licenses['license'] = resolved

    # Define license categories
    # NOTE(review): "Public" also matches texts such as "General Public
    # License"; presumably meant for "Public Domain" — confirm against the
    # license strings actually stored.
    unrestrict = ["Apache", "BSD", "CC-BY", "MIT", "ISC", "WTFPL", "zlib",
                  "Public", "CC0", "Unlicense", "Python", "W3C"]
    restrict = ["LGPL", "lgpl", "MPL", "EUPL", "Unicode", "ODC", "SGI"]
    # The look-behind keeps "GPL"/"gpl" from matching inside "LGPL"/"lgpl";
    # without it every LGPL package would be overwritten with category 3.
    hrestrict = [r"(?<![Ll])GPL", r"(?<![Ll])gpl", "Artistic"]

    licenses['license_cat'] = pd.array([pd.NA] * len(licenses.index), dtype="Int8")
    # Applied in ascending order so that a later, more restrictive match
    # overrides an earlier one.
    for category, keywords in ((1, unrestrict), (2, restrict), (3, hrestrict)):
        matches = licenses['license'].str.contains('|'.join(keywords), na=False)
        licenses.loc[matches, 'license_cat'] = category

    # Reassign instead of an in-place replace on a column selection, which is
    # not guaranteed to write back to the frame under copy-on-write.
    license_text = licenses['license_cat'].astype(str).replace(to_replace='<NA>', value='NA')

    np.savetxt('../data/individual/pkg_license.txt', license_text.values, fmt='%s')

create_licenses()

  0%|          | 0/250 [00:00<?, ?it/s]

## Dyadic Variables

#### Developer Participated in Dependency

In [34]:
def has_association(developer, package, start, end):
    """Tell whether ``developer`` was active on the repository of ``package`` in ``[start, end)``.

    The package is resolved to the project it is developed at, and the
    project's ``id`` is matched against ``Repository.name``. Four kinds of
    activity are checked in turn: posted comments, and relationships to
    issues, pull requests and commits. For comments the ``created`` timestamp
    of the comment node is compared with the window; for the other three the
    ``created`` timestamp of the relationship.

    The checks stop at the first one that finds activity, so at most four and
    often fewer queries are sent to the database.

    Returns ``True`` if any check finds at least one matching item, else ``False``.
    """
    with driver.session(database='main') as session:
        query_comments = """
            MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
            WHERE pa.name = toString($package)
            WITH pr.id AS repo_name
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE u.login = toString($developer)
            AND r.name = repo_name
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            RETURN count(c) > 0 AS association
            """

        query_issues = """
            MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
            WHERE pa.name = toString($package)
            WITH pr.id AS repo_name
            MATCH (u:User)-[a]->(i:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND r.name = repo_name
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(i) > 0 AS association
            """

        query_pullreq = """
            MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
            WHERE pa.name = toString($package)
            WITH pr.id AS repo_name
            MATCH (u:User)-[a]->(p:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND r.name = repo_name
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(p) > 0 AS association
            """

        query_commits = """
            MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
            WHERE pa.name = toString($package)
            WITH pr.id AS repo_name
            MATCH (u:User)-[a]->(c:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND r.name = repo_name
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN count(c) > 0 AS association
            """

        parameters = dict(developer=developer, package=package, start=start, end=end)
        queries = (query_comments, query_issues, query_pullreq, query_commits)

        # The generator is consumed lazily by any(): once one query reports an
        # association the remaining ones are never sent. Each query aggregates
        # without grouping keys and therefore yields exactly one row.
        return any(
            session.run(query, **parameters).single().value()
            for query in queries
        )

In [35]:
def _associated_with_any(developer, dependencies, start, end, cache):
    """Return True if ``developer`` is associated with at least one of ``dependencies``.

    ``developer`` and the dependencies are network node labels; their first
    four characters are stripped before calling ``has_association``. Answers
    are memoised in ``cache`` under ``(developer, dependency)``.
    """
    for dependency in dependencies:
        key = (developer, dependency)
        if key not in cache:
            cache[key] = has_association(developer[4:], dependency[4:], start, end)
        if cache[key]:
            return True
    return False


def create_dependencies_associations():
    """Export, per observation, whether developers took part in their packages' dependencies.

    For every edge of each affiliation network, the package's upstream
    dependencies (targets of its out-edges in the matching dependency network)
    and downstream dependents (sources of its in-edges) are looked up. The edge
    gets ``upstream`` / ``downstream`` set to 1 if the developer has an
    association (see ``has_association``) with at least one of them during the
    window, and 0 otherwise, including when there are no dependencies. The
    window runs from the previous observation date (``OBSERVATION_START`` for
    the first observation) to the current one.

    The two attributes are written as developer-by-package matrices to
    ``../data/dyadic/dep_up_associations-<date>.txt`` and
    ``../data/dyadic/dep_down_associations-<date>.txt``.
    """
    for i, an in tqdm(enumerate(affiliation_networks), total=len(affiliation_networks)):
        G = an.copy()

        start = OBSERVATION_START if i == 0 else observations[i - 1]
        end = observations[i]
        dependency_network = dependency_networks[i]

        # has_association() sends several queries per call; the same
        # (developer, dependency) pair can come up for several edges within
        # one window, so its answer is remembered for the rest of the window.
        known_associations = {}

        for edge in tqdm(list(G.edges()), leave=False):
            # NOTE(review): assumes edge[0] is the developer node and edge[1]
            # the package node, with 4-character label prefixes — confirm
            # against how the affiliation networks are built.
            developer, package = edge[0], edge[1]

            # Get dependencies for package
            upstream = [dep[1] for dep in dependency_network.out_edges(package)]
            downstream = [dep[0] for dep in dependency_network.in_edges(package)]

            # One association is enough. The flag must not be reset by a later
            # dependency the developer is not associated with.
            G.edges[edge]['upstream'] = int(
                _associated_with_any(developer, upstream, start, end, known_associations))
            G.edges[edge]['downstream'] = int(
                _associated_with_any(developer, downstream, start, end, known_associations))

        adj_up = nx.bipartite.biadjacency_matrix(G, row_order=nodelist_devs, column_order=nodelist_pkgs, weight="upstream")
        am_up = adj_up.toarray()
        np.savetxt('../data/dyadic/dep_up_associations-{0}.txt'.format(observations[i]), am_up, fmt='%s')

        adj_down = nx.bipartite.biadjacency_matrix(G, row_order=nodelist_devs, column_order=nodelist_pkgs, weight="downstream")
        am_down = adj_down.toarray()
        np.savetxt('../data/dyadic/dep_down_associations-{0}.txt'.format(observations[i]), am_down, fmt='%s')
        

create_dependencies_associations()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/758 [00:00<?, ?it/s]

  0%|          | 0/807 [00:00<?, ?it/s]

  0%|          | 0/788 [00:00<?, ?it/s]

  0%|          | 0/758 [00:00<?, ?it/s]