# HICSS 2023: Data Transformation

This notebook performs data transformation operations by collecting data from the neo4j database.
It then transforms the data into the required format for later usage in RSiena.

The general database schema looks like this:

![Database Schema](../figures/db_schema.png "Database Schema")


## TODOs

[ ] Check Database; somehow there are PullRequests related to Projects, Issues and Pull Requests related to themselves?!

## Imports

In [1]:
import os
import datetime
import pytz
import pickle
import numpy as np
import pandas as pd
import networkx as nx
from dotenv import find_dotenv, load_dotenv
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from neo4j import GraphDatabase
from tqdm.notebook import tqdm

## Required Functions

This section definies various function for data retrieval from Neo4j database.

In [2]:
def get_packages(driver, date):
    """Queries packages from database

    Retrieves a list of packages that have a unique repository
    and have been created until the specified date.

    Args:
        driver: Neo4j database connection's driver.
        date: Date to filter packages creation.
    
    Returns:
        A pandas DataFrame containing the packages.
        Each row represents a package with the attributes
        'name', 'repo_name', 'repo_owner', and 'created'.
    """
    with driver.session(database='main') as session:
        query = """
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
                WITH pr, COUNT(r) AS num_pkgs
                WHERE num_pkgs = 1
                WITH pr
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr)
                WHERE pa.created < DateTime($date)
                RETURN pa.name AS name,
                       pa.repo_name AS repo_name,
                       pa.repo_owner AS repo_owner,
                       toString(pa.created) AS created
                """
        
        results = session.run(query, date=date).data()
    
        packages = pd.DataFrame.from_dict(results)
        packages['created'] = pd.to_datetime(packages['created'])
        packages['observation'] = pd.to_datetime(date)
    
    return packages

In [80]:
def get_packages_by_repos(driver, repos):
    with driver.session(database='main') as session:
        query = """
                MATCH (pa:Package)-[r:DEVELOPED_AT]->(pr:Project)
                WHERE pr.id IN $repos
                RETURN pa.name AS name,
                       pa.repo_name AS repo_name,
                       pa.repo_owner AS repo_owner,
                       toString(pa.created) AS created
                """
        
        results = session.run(query, repos=repos).data()
    
        return pd.DataFrame.from_dict(results)

In [3]:
def get_latest_version(driver, package, date):
    """Queries latest package version from database

    Retrieves the latest version for each package in list of
    names at observation date.

    Args:
        driver: Neo4j database connection's driver.
        package: Package name.
        date: Date of observation.
    
    Returns:
        A list of dictionaries with data.
    """
    with driver.session(database='main') as session:
        query = """
                OPTIONAL MATCH (p:Package { id: $package })-[:RELEASED]->(v:Version)
                WHERE v.created < DateTime($date)
                AND NOT v.number CONTAINS "-"
                RETURN p.name AS name,
                       v.id AS version_id,
                       v.number AS version,
                       v.license AS license,
                       toString(v.created) AS version_created
                ORDER BY v.created DESC
                LIMIT 1
                """
        return session.run(query, package=package, date=date).data()

In [4]:
def get_dependencies(driver, version_ids):
    """Queries version's dependencies from database.

    Retrieves the dependencies of for each package version
    in list of version IDs.

    Args:
        driver: Neo4j database connection's driver.
        version_ids: List of version IDs.
    
    Returns:
        A pandas DataFrame containing the all dependencies for
        each version ID. Each row represents an edge from the
        package towards its dependency with the attributes
        'source', 'target', 'type', and 'created.
    """
    with driver.session(database='main') as session:
        query = """
                UNWIND $versions AS version
                MATCH (v:Version { id: version })-[d:DEPENDS_ON]->(p:Package)
                RETURN v.package_id AS source,
                       p.id AS target,
                       d.requirements AS requirements,
                       toString(v.created) AS created
                """

        results = session.run(query, versions=version_ids).data()

        dependencies = pd.DataFrame.from_dict(results)
        dependencies['created'] = pd.to_datetime(dependencies['created'])

    return dependencies

In [5]:
def get_developers(driver, repository, start, end):
    """Queries participating developers for repositories from database.

    Retrieves the participating of for each repositories
    in list of IDs.

    Args:
        driver: Neo4j database connection's driver.
        repository: Repository IDs.
    
    Returns:
        A pandas DataFrame containing the participating users for
        a repository.
    """
    with driver.session(database='main') as session:
        query_comments = """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE r.name = toString($repository)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """
                
        query_issues = """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """

        query_pullreq = """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """

        query_commits = """
            MATCH (u:User)-[a]->(:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE r.name = toString($repository)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND u.type <> "Bot"
            AND NOT u.login CONTAINS "[bot]"
            RETURN DISTINCT u.login AS login
            """

        results_comments = session.run(
            query_comments, repository=repository, start=start, end=end).data()
        results_issues = session.run(
            query_issues, repository=repository, start=start, end=end).data()
        results_pullreq = session.run(
            query_pullreq, repository=repository, start=start, end=end).data()
        results_commits = session.run(
            query_commits, repository=repository, start=start, end=end).data()

        comments = pd.DataFrame.from_dict(results_comments)        
        issues = pd.DataFrame.from_dict(results_issues)
        pullreqs = pd.DataFrame.from_dict(results_pullreq)
        commits = pd.DataFrame.from_dict(results_commits)

        developers = pd.concat([comments, issues, pullreqs, commits], axis=0, ignore_index=True)
        developers.drop_duplicates(subset=['login'], inplace=True)

    return developers

In [6]:
def get_developer_activity(driver, developer, repositories, start, end):
    with driver.session(database='main') as session:
        query_comments = """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE u.login = toString($developer)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(c)
            """
        
        query_issues = """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(a)
            """

        query_pullreq = """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(a)
            """

        query_commits = """
            MATCH (u:User)-[a]->(c:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            AND r.name IN $repositories
            RETURN count(c)
            """

        results_comments = session.run(
            query_comments, developer=developer, repositories=repositories, start=start, end=end).single().value()
        results_issues = session.run(
            query_issues, developer=developer, repositories=repositories, start=start, end=end).single().value()
        results_pullreq = session.run(
            query_pullreq, developer=developer, repositories=repositories, start=start, end=end).single().value()
        results_commits = session.run(
            query_commits, developer=developer, repositories=repositories, start=start, end=end).single().value()

        activity_count = results_comments + results_issues + results_pullreq + results_commits

        return activity_count

In [60]:
def get_developer_targets(driver, developer, start, end):
    with driver.session(database='main') as session:
        query_comments = """
            MATCH (u:User)-[p:POSTED]->(c:Comment)-[*2]->(r:Repository)
            WHERE u.login = toString($developer)
            AND c.created >= datetime($start)
            AND c.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """
                
        query_issues = """
            MATCH (u:User)-[a]->(:Issue)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """

        query_pullreq = """
            MATCH (u:User)-[a]->(:PullRequest)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """

        query_commits = """
            MATCH (u:User)-[a]->(:Commit)-[:RELATED_TO]->(r:Repository)
            WHERE u.login = toString($developer)
            AND a.created >= datetime($start)
            AND a.created < datetime($end)
            RETURN DISTINCT r.name AS repo_name
            """

        results_comments = session.run(
            query_comments, developer=developer, start=start, end=end).data()
        results_issues = session.run(
            query_issues, developer=developer, start=start, end=end).data()
        results_pullreq = session.run(
            query_pullreq, developer=developer, start=start, end=end).data()
        results_commits = session.run(
            query_commits, developer=developer, start=start, end=end).data()

        comments = pd.DataFrame.from_dict(results_comments)        
        issues = pd.DataFrame.from_dict(results_issues)
        pullreqs = pd.DataFrame.from_dict(results_pullreq)
        commits = pd.DataFrame.from_dict(results_commits)

        repositories = pd.concat([comments, issues, pullreqs, commits], axis=0, ignore_index=True)
        repositories.drop_duplicates(subset=['repo_name'], inplace=True)

    return repositories

In [7]:
def get_added_watchers(driver, repo, start, end):
    with driver.session(database='main') as session:
        query = """
                MATCH (r:Repository)<-[w:WATCHES]-(u:User)
                WHERE r.name = toString($repo)
                AND datetime($start) < w.created <= datetime($end)
                RETURN DISTINCT u
                """

        result = session.run(query, repo=repo, start=start, end=end).value()
        result_dict = [dict(_) for _ in result]
        return len(result_dict)


## Database Connection

In [8]:
load_dotenv(find_dotenv())

# Get env variables
uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

driver = GraphDatabase.driver(uri, auth=(user, password),
                              encrypted=False,
                              max_connection_lifetime=3600)

## Observations

In [11]:
# Set periods for iteration
OBSERVATION_START = '2021-01-01'  # First date of observation period
DATE_START = '2021-02-01'  # Date of the first snapshot
DATE_END = '2022-01-01'  # Date of the last snapshot
PERIOD_LENGTH = relativedelta(months=1)  # Time between snapshots

period = DATE_START
periods = []
obs = 0
while period <= DATE_END:
    periods.append(period)
    next_period = (parse(period) + PERIOD_LENGTH).strftime("%Y-%m-%d")
    period = next_period
    obs += 1

observations = periods

[print(i+1, obs) for i, obs in enumerate(observations)];

1 2021-02-01
2 2021-03-01
3 2021-04-01
4 2021-05-01
5 2021-06-01
6 2021-07-01
7 2021-08-01
8 2021-09-01
9 2021-10-01
10 2021-11-01
11 2021-12-01
12 2022-01-01


## Data Retrieval

In [50]:
# Sampling the other way by developers
packages = get_packages(driver, observations[0])
repo_names = packages['repo_name'].tolist()
developers = []
for repo in tqdm(repo_names):
    devs = get_developers(driver, repo, OBSERVATION_START, DATE_END)
    developers.extend(devs.to_dict(orient="records"))

dev_df = pd.DataFrame.from_records(developers)
devs_sample = dev_df.drop_duplicates(subset=['login'])
devs_sample = devs_sample.sample(n=500, random_state=17, ignore_index=True)

logins = devs_sample['login'].unique().tolist()
nodelist_devs = ["_".join(["dev", name]) for name in logins]

df_nodes_devs = pd.DataFrame(nodelist_devs, columns=['id'])
df_nodes_devs.to_csv('../data/nodelists/developers.csv', index=False)

  0%|          | 0/8085 [00:00<?, ?it/s]

In [85]:
all_repos = []
for dev in tqdm(logins):
    repos = get_developer_targets(driver, dev, OBSERVATION_START, DATE_END)
    all_repos.extend(repos['repo_name'].tolist())

all_repos = list(set(all_repos))

packages = get_packages(driver, DATE_END)
selected_repos = [repo for repo in all_repos if repo in packages['repo_name'].tolist()]
selected_packages = get_packages_by_repos(driver, selected_repos)
nodelist_pkgs = ["_".join(["pkg", name]) for name in selected_packages['name'].tolist()]

  0%|          | 0/500 [00:00<?, ?it/s]

### Lists

#### Packages

In [13]:
packages = []

pkgs_sample = get_packages(driver, observations[0])
pkgs_sample = pkgs_sample.sample(n=100, random_state=17, ignore_index=True)

for obs in tqdm(observations):
    # _packages = get_packages(driver, obs)

    latest_versions = []
    for package in tqdm(pkgs_sample['name'].tolist(), leave=False):
        latest_version = get_latest_version(driver, package, obs)
        latest_versions.append(latest_version[0])

    versions = pd.DataFrame.from_records(latest_versions)
    versions['version_created'] = pd.to_datetime(versions['version_created'])

    _packages = pkgs_sample.merge(versions, how="left", on=['name'])
    _packages.to_csv(f'../data/lists/packages-{obs}.csv', index=False)
    packages.append(_packages)

# Latest observation makes nodelist to account for later joiners
full_packages = packages[(len(observations) - 1)]
nodelist_pkgs = pkgs_sample['name'].tolist()
nodelist_pkgs = ["_".join(["pkg", name]) for name in nodelist_pkgs]

df_nodes_pkgs = pd.DataFrame(nodelist_pkgs, columns=['id'])
df_nodes_pkgs.to_csv('../data/nodelists/packages.csv', index=False)


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

#### Developers

In [14]:
developers_lists = []
repo_names = pkgs_sample['repo_name'].tolist()

with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        _developers = []
        for repo in tqdm(repo_names, leave=False):
            if i == 0:
                devs = get_developers(driver, repo, OBSERVATION_START, obs)
            else:
                devs = get_developers(driver, repo, observations[i-1], observations[i])
            _developers.extend(devs.to_dict(orient="records"))
        dev_df = pd.DataFrame.from_records(_developers)
        dev_df.to_csv(f'../data/lists/developers-{obs}.csv', index=False)
        devs = dev_df['login'].unique().tolist()
        developers_lists.append(devs)

        pbar.update()
    pbar.close()

# Create list of all developers participating in the observation period
nodelist_devs = set()
for i in range(len(observations)):
    nodelist_devs.update(developers_lists[i])
nodelist_devs = list(nodelist_devs)
nodelist_devs = ["_".join(["dev", name]) for name in nodelist_devs]

df_nodes_devs = pd.DataFrame(nodelist_devs, columns=['id'])
df_nodes_devs.to_csv('../data/nodelists/developers.csv', index=False)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

### Networks

#### Dependency Network

In [15]:
# Create dependency networks for each observation
dependency_networks = []
with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        packages_full = get_packages(driver, obs)

        latest_versions = []
        for package in tqdm(packages_full['name'].tolist(), leave=False):
            latest_version = get_latest_version(driver, package, obs)
            try:
                latest_versions.append(latest_version[0])
            except KeyError:
                pass  # Package has no version

        versions = pd.DataFrame.from_records(latest_versions)
        versions['version_created'] = pd.to_datetime(versions['version_created'])

        packages_obs = packages_full.merge(versions, how="left", on=['name'])
        dependencies = get_dependencies(driver, packages_obs['version_id'].tolist())

        # Keep edges between nodes in nodelist
        # dependencies = dependencies[dependencies['target'].isin(packages_obs[i]['name'].tolist())]

        # Add prefix to match nodelist
        dependencies['source'] = "pkg_" + dependencies['source']
        dependencies['target'] = "pkg_" + dependencies['target']

        edgelist = list(zip(dependencies['source'], dependencies['target']))

        G = nx.DiGraph()
        G.add_nodes_from(nodelist_pkgs)
        G.add_edges_from(edgelist)
        dependency_networks.append(G)
        nx.write_edgelist(G, '../data/edgelists/dependency_network-{0}.edgelist'.format(obs), delimiter=",", data=False)
        nx.write_gpickle(G, '../data/networks/dependency_network-{0}.pkl'.format(obs))
        nx.write_gml(G, '../data/networks/dependency_network-{0}.gml'.format(obs))
        pbar.update()
    pbar.close()

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/8085 [00:00<?, ?it/s]

  0%|          | 0/8099 [00:00<?, ?it/s]

  0%|          | 0/8108 [00:00<?, ?it/s]

  0%|          | 0/8123 [00:00<?, ?it/s]

  0%|          | 0/8134 [00:00<?, ?it/s]

  0%|          | 0/8141 [00:00<?, ?it/s]

  0%|          | 0/8155 [00:00<?, ?it/s]

  0%|          | 0/8171 [00:00<?, ?it/s]

  0%|          | 0/8180 [00:00<?, ?it/s]

  0%|          | 0/8194 [00:00<?, ?it/s]

  0%|          | 0/8201 [00:00<?, ?it/s]

  0%|          | 0/8208 [00:00<?, ?it/s]

##### Adjacency Matrices

In [16]:
with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        adj = nx.to_pandas_adjacency(dependency_networks[i], nodelist=nodelist_pkgs)

        # Identify missing nodes at observation
        available_nodes = list(full_packages[full_packages['created'] < obs]['name'])
        available_nodes = ["_".join(["pkg", name]) for name in available_nodes]
        missing_nodes = [item for item in nodelist_pkgs if item not in available_nodes]

        # Change rows
        adj.loc[ missing_nodes , : ] = np.nan
        # Change columns
        adj.loc[ : , missing_nodes ] = np.nan

        adj = adj.astype('Int8')
        adj = adj.astype(str)
        adj.replace(to_replace='<NA>', value='NA', inplace=True)

        am = adj.to_numpy()
        np.savetxt('../data/adjacency/dnet-{0}.txt'.format(obs), am, fmt='%s')

        # With structural zeros instead of NA for Goodness-of-Fit tests
        adj.replace(to_replace='NA', value='0', inplace=True)
        am_gof = adj.to_numpy()
        np.savetxt('../data/adjacency/dnet-{0}-gof.txt'.format(i+1), am_gof, fmt='%s')

        pbar.update()
    pbar.close()

  0%|          | 0/12 [00:00<?, ?it/s]

##### Composition Change

In [193]:
# Composition changes for dependency network
composition = packages[len(observations) - 1][['name', 'created']].copy()
composition['name'] = "pkg_" + composition['name']
composition['appearance'] = 0

rev_obs = sorted(observations,  reverse=True)
for i, item in enumerate(rev_obs):
    obs = len(rev_obs) - i
    composition.loc[composition['created'] < item, 'appearance'] = int(obs)

arr = []
for i, row in composition.iterrows():
    curr = [int(row['appearance']), len(observations)]
    arr.append(curr)
    
comp_arr = np.array(arr)
np.savetxt('../data/compositions/dependency_network.txt', comp_arr, fmt='%d')

#### Affiliation Networks Sampled by Devs

In [None]:
# Create affiliation networks for each observation
affiliation_networks = []

with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        data = []
        for dev in devs_sample['login'].tolist():
            if i == 0:
                target_repos = get_developer_targets(driver, dev, OBSERVATION_START, obs)
            else:
                target_repos = get_developer_targets(driver, dev, observations[i-1], obs)
                
            target_repos['login'] = dev
            data.extend(target_repos.to_dict(orient="records"))


        df_data = pd.DataFrame.from_records(data)
        df_data['source'] = "dev_" + df_data['login']
        df_data['target'] = "pkg_" + df_data['repo_name']
        edgelist = list(zip(df_devs['source'], df_devs['target']))
        G = nx.DiGraph()
        G.add_nodes_from(nodelist_devs, bipartite=0)
        G.add_nodes_from(nodelist_pkgs, bipartite=1)
        G.add_edges_from(edgelist)
        affiliation_networks.append(G)
        nx.write_edgelist(G, '../data/edgelists/affiliation_network-{0}.edgelist'.format(obs), delimiter=",", data=False)
        nx.write_gpickle(G, '../data/networks/affiliation_network-{0}.pkl'.format(obs))
        nx.write_gml(G, '../data/networks/affiliation_network-{0}.gml'.format(obs))
        pbar.update()
    pbar.close()

#### Affiliation Networks

In [17]:
# Create affiliation networks for each observation
affiliation_networks = []

pkgs = pkgs_sample['name'].tolist()
repos = pkgs_sample['repo_name'].tolist()
packages_repos = list(zip(pkgs, repos))

with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        developers = []
        for pkg, repo in tqdm(packages_repos, leave=False):
            if i == 0:
                devs = get_developers(driver, repo, OBSERVATION_START, obs)
            else:
                devs = get_developers(driver, repo, observations[i-1], observations[i])
            
            devs['package'] = pkg
            
            developers.extend(devs.to_dict(orient="records"))

        df_devs = pd.DataFrame.from_records(developers)
        df_devs['source'] = "dev_" + df_devs['login']
        df_devs['target'] = "pkg_" + df_devs['package']
        edgelist = list(zip(df_devs['source'], df_devs['target']))

        G = nx.DiGraph()
        G.add_nodes_from(nodelist_devs, bipartite=0)
        G.add_nodes_from(nodelist_pkgs, bipartite=1)
        G.add_edges_from(edgelist)
        affiliation_networks.append(G)
        nx.write_edgelist(G, '../data/edgelists/affiliation_network-{0}.edgelist'.format(obs), delimiter=",", data=False)
        nx.write_gpickle(G, '../data/networks/affiliation_network-{0}.pkl'.format(obs))
        nx.write_gml(G, '../data/networks/affiliation_network-{0}.gml'.format(obs))
        pbar.update()
    pbar.close()

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

##### Adjacency Matrices

In [18]:
with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        adj = nx.bipartite.biadjacency_matrix(affiliation_networks[i], row_order=nodelist_devs, column_order=nodelist_pkgs)

        # Identify missing nodes at observation
        # available_packages = list(full_packages[full_packages['created'] < obs]['name'])
        # missing_packages = [item for item in nodelist_pkgs if item not in available_packages]
        # adj.loc[ : , missing_packages ] = np.nan

        # adj = adj.astype('Int8')
        # adj = adj.astype(str)
        # adj.replace(to_replace='<NA>', value='NA', inplace=True)

        am = adj.toarray()
        np.savetxt('../data/adjacency/anet-{0}.txt'.format(obs), am, fmt='%s')

        # With structural zeros instead of NA for Goodness-of-Fit tests
        # adj.replace(to_replace='NA', value='0', inplace=True)
        # am_gof = adj.to_numpy()
        # np.savetxt('../data/adjacency/anet-{0}-gof.txt'.format(obs), am_gof, fmt='%s')

        pbar.update()
    pbar.close()

  0%|          | 0/12 [00:00<?, ?it/s]

##### Composition Change

In [None]:
# Composition changes for affiliation network
composition = developers[len(observations) - 1][['name', 'created']].copy()
composition['name'] = "pkg_" + composition['name']
composition['appearance'] = 0

rev_obs = sorted(observations,  reverse=True)
for i, item in enumerate(rev_obs):
    obs = len(rev_obs) - i
    composition.loc[composition['created'] < item, 'appearance'] = int(obs)

arr = []
for i, row in composition.iterrows():
    curr = [int(row['appearance']), len(observations)]
    arr.append(curr)
    
comp_arr = np.array(arr)
np.savetxt('../data/compositions/dependency_network.txt', comp_arr, fmt='%d')

**TODOs**

- Dyadic Variables:
    - User has dependency with project
    - User participated before
    - User has collaborated with users related to project before
- Individual Variables:
    - Packages
        - License
        - Age
        - [X] Dependencies
        - [X] Dependents
        - Releases
        - [X] GitHub Stars
    - Developers
        - Tenure?
- Behavior Variable: 
    - [X] Developer Activity

### Behavior Variable

In [19]:
# Behavior probably needs to be set on the actor level (hence developers)
# Let's try general activity
repos = pkgs_sample['repo_name'].tolist()
activities = []
with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        activities_obs = []
        for dev in tqdm(nodelist_devs, leave=False):
            if i == 0:
                activity = get_developer_activity(driver, dev[4:], repos, OBSERVATION_START, obs)
            else:
                activity = get_developer_activity(driver, dev[4:], repos, observations[i-1], obs)
            activities_obs.append({"developer": dev, obs: activity})
        activities.append(activities_obs)
        pbar.update()
    pbar.close()
    
df = pd.DataFrame(nodelist_devs, columns=['developer'])

for i in range(len(activities)):
    _df = pd.DataFrame.from_records(activities[i])
    df = df.merge(_df, on="developer")
    
df.to_csv('../data/behavior/developer_activities.csv', index=False)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

  0%|          | 0/2047 [00:00<?, ?it/s]

In [220]:
# Transform to SIENA format
df_behavior = df.copy()
for i in range(len(observations)):
    for index, row in df_behavior.iterrows():
        if row[observations[i]] == 0:
            df_behavior.loc[index, observations[i]] = 0
        elif row[observations[i]] == 1:
            df_behavior.loc[index, observations[i]] = 1
        elif row[observations[i]] <= 5: 
            df_behavior.loc[index, observations[i]] = 2
        elif row[observations[i]] > 5:
            df_behavior.loc[index, observations[i]] = 3

df_behavior = df_behavior[observations].astype('Int8')
df_behavior = df_behavior[observations].astype(str)
df_behavior.replace(to_replace='<NA>', value='NA', inplace=True)
np.savetxt('../data/behavior/developer_activities.txt', df_behavior[observations].values, fmt='%s')

### Dyadic Variables

#### Dependencies

This chunk creates a matrix with dummy variables representing if the developer has contributed to a package that is a dependent or dependency of the focal package that it now contributed to.

In [35]:
for i, obs in enumerate(observations[:1]):
    net_aff = nx.read_gpickle('../data/networks/affiliation_network-{0}.pkl'.format(obs))
    net_dep = nx.read_gpickle('../data/networks/dependency_network-{0}.pkl'.format(obs))
    edgelist_aff = [list(item) for item in list(net_aff.edges())]
    edgelist_dep = [list(item) for item in list(net_dep.edges())]

In [37]:
edgelist_aff[:5]

[['dev_333one', 'pkg_multer'],
 ['dev_RexHung0302', 'pkg_multer'],
 ['dev_ekatrand', 'pkg_multer'],
 ['dev_sarahm7', 'pkg_yjs'],
 ['dev_ehmicky', 'pkg_figures']]

### Individual Variables

In [24]:
# TODO
# Packages:
#   - Dependencies / Dependents
#   - License
#   - Age
#   - GitHub Stars
#
# Developers:
#   - Tenure (Participated before)

#### Packages

##### Dependencies

In [20]:
dependencies = pkgs_sample[['name', 'repo_name']].copy()
dependencies['name'] = "pkg_" + dependencies['name']

with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        G = dependency_networks[i]
        idegree = dict(G.in_degree())
        odegree = dict(G.out_degree())
        
        df_idegree = pd.DataFrame.from_dict(idegree, orient="index", columns=["_".join(["in", obs])]).reset_index()
        df_idegree.rename(columns={"index": "name"}, inplace=True)

        df_odegree = pd.DataFrame.from_dict(odegree, orient="index", columns=["_".join(["out", obs])]).reset_index()
        df_odegree.rename(columns={"index": "name"}, inplace=True)

        dependencies = dependencies.merge(df_idegree, how="left", on="name")
        dependencies = dependencies.merge(df_odegree, how="left", on="name")

        pbar.update()
    pbar.close()
    
columns_in = ["_".join(["in", obs]) for obs in observations]
columns_out = ["_".join(["out", obs]) for obs in observations]

dependencies[columns_in] = dependencies[columns_in].astype("Int8")
dependencies[columns_in] = dependencies[columns_in].astype(str)

dependencies[columns_out] = dependencies[columns_out].astype("Int8")
dependencies[columns_out] = dependencies[columns_out].astype(str)

dependencies.replace(to_replace='<NA>', value='NA', inplace=True)

np.savetxt('../data/individual/pkg_upstream.txt', dependencies[columns_out].values, fmt='%s')
np.savetxt('../data/individual/pkg_downstream.txt', dependencies[columns_in].values, fmt='%s')

  0%|          | 0/12 [00:00<?, ?it/s]

##### GitHub Stars

In [21]:
stars = pkgs_sample[['name', 'repo_name']].copy()

with tqdm(total=len(observations)) as pbar:
    for i, obs in enumerate(observations):
        for index, row in tqdm(stars.iterrows(), total=len(stars.index), leave=False):
            if i == 0:
                watchers = get_added_watchers(driver, row['repo_name'], OBSERVATION_START, obs)
            else:
                watchers = get_added_watchers(driver, row['repo_name'], observations[i-1], obs)
                
            stars.loc[index, obs] = int(watchers)
        
        stars[f'log_{obs}'] = np.log10(stars[obs] + 1)
        pbar.update()
    pbar.close()

columns = ["_".join(['log', obs]) for obs in observations]
stars[columns] = stars[columns].astype(float)
stars[columns] = stars[columns].astype(str)
stars.replace(to_replace='<NA>', value='NA', inplace=True)
np.savetxt('../data/individual/pkg_stars.txt', stars[columns].values, fmt='%s')

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

##### Release Activity

In [183]:
# TODO

---

### Licenses

In [49]:
licenses = pd.DataFrame(nodelist, columns=['name'])

for i, obs in enumerate(observations):
    latest_versions = get_latest_version(driver, nodelist, obs)
    latest_licenses = get_licenses(driver, list(latest_versions['_id']))
    licenses = licenses.merge(latest_licenses, how='outer', on=['name'])
    
licenses.columns = ['name'] + observations

In [50]:
licenses.head()

Unnamed: 0,name,2016-01-01,2017-01-01,2018-01-01,2019-01-01
0,@atlaskit/build-utils,,,Apache,Apache
1,@atlaskit/button,,,Apache,Apache
2,@atlaskit/docs,,,Apache,Apache
3,@atlaskit/icon,,,,
4,@atlaskit/theme,,,Apache,Apache


In [51]:
licenses_codes = licenses.copy()

licenses_codes.replace(to_replace='NULL', value=np.nan, inplace=True)
licenses_codes[observations] = licenses_codes[observations].apply(lambda col:pd.Categorical(col).codes)

licenses_codes = licenses_codes.astype(str)
licenses_codes.replace(to_replace='-1', value='NA', inplace=True)
licenses_codes.replace(to_replace='4', value='5', inplace=True)
licenses_codes.replace(to_replace='3', value='4', inplace=True)
licenses_codes.replace(to_replace='2', value='3', inplace=True)
licenses_codes.replace(to_replace='1', value='2', inplace=True)
licenses_codes.replace(to_replace='0', value='1', inplace=True)


np.savetxt('../rsiena/data/licenses-codes.txt',
           licenses_codes[observations].values, fmt='%s')

In [55]:
licenses_codes.head()

Unnamed: 0,name,2016-01-01,2017-01-01,2018-01-01,2019-01-01
0,@atlaskit/build-utils,,,1.0,1.0
1,@atlaskit/button,,,1.0,1.0
2,@atlaskit/docs,,,1.0,1.0
3,@atlaskit/icon,,,,
4,@atlaskit/theme,,,1.0,1.0


In [56]:
releases = pd.DataFrame(nodelist, columns=['name'])

for i in range(len(observations)):
    release_counts = get_releases(driver, nodelist, observations[i])
    release_counts.columns = ['name', observations[i]]
    releases = releases.merge(release_counts, how='outer', on=['name'])

In [57]:
releases.describe()

Unnamed: 0,2016-01-01,2017-01-01,2018-01-01,2019-01-01
count,199.0,256.0,280.0,301.0
mean,34.376884,40.207031,48.346429,55.534884
std,47.611047,48.636548,53.227159,63.142402
min,1.0,1.0,1.0,1.0
25%,9.0,11.0,16.0,17.0
50%,19.0,23.0,30.0,36.0
75%,42.0,49.0,58.0,69.0
max,455.0,460.0,468.0,478.0


In [58]:
releases_codes = releases.copy()

for i in range(len(observations)):
    for index, row in releases_codes.iterrows():
        if row[observations[i]] <= 5:
            releases_codes.loc[index, observations[i]] = 1
        elif row[observations[i]] <= 10:
            releases_codes.loc[index, observations[i]] = 2
        elif row[observations[i]] <= 20:
            releases_codes.loc[index, observations[i]] = 3
        elif row[observations[i]] <= 30:
            releases_codes.loc[index, observations[i]] = 4
        elif row[observations[i]] <= 40:
            releases_codes.loc[index, observations[i]] = 5
        elif row[observations[i]] <= 50: 
            releases_codes.loc[index, observations[i]] = 6
        elif row[observations[i]] <= 60:
            releases_codes.loc[index, observations[i]] = 7
        elif row[observations[i]] > 60:
            releases_codes.loc[index, observations[i]] = 8

releases_codes[observations] = releases_codes[observations].astype('Int8')
releases_codes[observations] = releases_codes[observations].astype(str)
releases_codes.replace(to_replace='<NA>', value='NA', inplace=True)
np.savetxt('../rsiena/data/releases-codes.txt',
           releases_codes[observations].values, fmt='%s')


### Age

In [59]:
ages = df_created[['name', 'created']]

for obs in observations:
    ages[obs] = ages['created'].apply(
        lambda x: datetime.datetime.strptime(obs, '%Y-%m-%d').date() - x.date())
    ages[obs] = round(ages[obs].dt.days/365, 0)
    ages[obs] = ages[obs].apply(lambda x : x if x > 0 else np.nan)

ages[observations] = ages[observations].astype('Int8')
ages[observations] = ages[observations].astype(str)

ages.replace(to_replace='<NA>', value='NA', inplace=True)
np.savetxt('../rsiena/data/ages.txt', ages[observations].values, fmt='%s')