# How does the package dependency network grow over time?

In this notebook we investigate the package dependencies in the PyPI dataset and look at
how they change over time.

## Loading data and importing libraries

In [1]:
%load_ext autoreload

# Re-import modules automatically before running each cell, so edits to
# imported libraries are picked up without restarting the kernel.
%autoreload 2

# Usual imports for data handling and visualization.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime

# Seed NumPy's global random generator so runs are reproducible.
np.random.seed(0)

# Render matplotlib figures inline in the notebook.
%matplotlib inline

## Defining helper functions and defaults

In [2]:
# Helper Functions
def add_three_months(t):
    """Return the first day of the month three months after ``t``.

    Advances ``t`` one calendar month at a time with ``add_a_month``, so the
    result always falls on day 1 and keeps ``t``'s time of day.

    Args:
        t: A ``datetime``-like object supporting ``replace`` and ``timedelta``
            addition.

    Returns:
        The start of the month three months later.
    """
    # The original looped once, so it only advanced a single month.
    for _ in range(3):
        t = add_a_month(t)
    return t

def add_a_month(t):
    """Return the first day of the month following ``t`` (time of day is kept).

    Starting from day 1, adding 31 days always lands in the next month
    (no month is longer than 31 days); snapping back to day 1 then gives the
    start of that month.
    """
    return (t.replace(day=1) + datetime.timedelta(days=31)).replace(day=1)
def print_time(t):
    """Format ``t`` as a ``YYYY-MM-DDTHH:MM:SS`` string.

    Despite the name, nothing is printed; the formatted string is returned.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    formatted = t.strftime(timestamp_format)
    return formatted

!mkdir figs
figs_dir = 'figs/'

mkdir: cannot create directory ‘figs’: File exists


In [3]:
from cycler import cycler
def set_plt_rc():
    """Apply the notebook-wide matplotlib defaults.

    Sets a serif font, the text sizes used for titles, labels, ticks and
    legends, and an axes property cycle that pairs ten colours with ten
    line styles.
    """
    small, medium, bigger = 8, 10, 12

    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
               '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
               '#bcbd22', '#17becf']
    line_styles = ['-', '--', ':', '-.', '-', '--', ':', '-.', '-', '--']
    colour_and_style_cycle = cycler(color=palette) + cycler(linestyle=line_styles)

    # Applied strictly in this order: the second 'font' entry replaces the
    # size given by the first one while leaving the serif family in place.
    rc_settings = [
        ('font', {'family': 'serif', 'size': bigger}),
        ('font', {'size': small}),            # default text size
        ('axes', {'titlesize': small}),       # axes title
        ('axes', {'labelsize': medium}),      # x and y labels
        ('xtick', {'labelsize': small}),      # x tick labels
        ('ytick', {'labelsize': small}),      # y tick labels
        ('legend', {'fontsize': small}),      # legend text
        ('figure', {'titlesize': bigger}),    # figure title
        ('axes', {'prop_cycle': colour_and_style_cycle}),
    ]
    for group, options in rc_settings:
        plt.rc(group, **options)

set_plt_rc()

# Transitive Dependencies

In [None]:

# Folder holding the input CSV, relative to the notebook.
data_folder = '../data/'

requirements = pd.read_csv(data_folder + 'data_all.csv')

# Work on copies of the package ('name') and dependency ('deps') columns so
# the raw columns stay available.
derived_columns = {'package_name': 'name', 'requirement': 'deps'}
for target, source in derived_columns.items():
    requirements.loc[:, target] = requirements.loc[:, source]

# Lower-case the copies; entries that are not plain strings are passed
# through unchanged.
for target in derived_columns:
    requirements.loc[:, target] = requirements.loc[:, target].apply(
        lambda value: value.lower() if type(value) == str else value)

# Parse the 'YYYY-MM-DD' date strings into datetime objects.
requirements.loc[:, 'date'] = requirements.loc[:, 'date'].apply(
    lambda text: datetime.datetime.strptime(text, '%Y-%m-%d'))

requirements.head()

In [None]:
# Progress bars for long-running loops and pandas operations.
from tqdm.autonotebook import tqdm

# Enable the `progress_apply` method used on `requirements` later in the notebook.
tqdm.pandas()

In [None]:
# Spot check: show the rows of the latest release of one package that is
# dated on or before a cut-off date.
max_date = datetime.datetime.strptime('2015-01-01', '%Y-%m-%d')
p = '115wangpan'

is_package = requirements['name'] == p
sub = requirements.loc[is_package, :]
if max_date is not None:
    on_or_before_cutoff = sub.loc[:, 'date'] <= max_date
    sub = sub[on_or_before_cutoff]
date = sub['date'].max()
is_latest_release = sub['date'] == date
sub.loc[is_latest_release, :]

In [None]:
# Memo of resolved dependency lists, keyed by (package name, release date).
# Each value is the (all_deps, direct_deps) pair produced by get_all_deps.
dep_cache = {}
# Deepest recursion level get_all_deps will descend to before giving up on a branch.
MAX_LEVEL = 10

def get_date(p, max_date=None):
    """Return the most recent release date recorded for package ``p``.

    Args:
        p: Lower-cased package name. It is matched against the lower-cased
            ``package_name`` column, because callers pass lower-cased names
            (matching on the raw ``name`` column misses any package whose
            name contains upper-case letters).
        max_date: If given, only rows dated on or before it are considered.

    Returns:
        The maximum ``date`` among the matching rows, or the missing value
        returned by ``Series.max()`` (NaN/NaT) when no row matches.
    """
    sub = requirements.loc[requirements['package_name'] == p, :]
    if max_date is not None:
        sub = sub[sub.loc[:, 'date'] <= max_date]
    return sub.loc[:, 'date'].max()

def get_direct_deps(p, date=None):
    """Return the direct requirements of package ``p`` for the release on ``date``.

    Args:
        p: Lower-cased package name.
        date: Exact release date to look up; defaults to the latest release
            of ``p`` (see ``get_date``).

    Returns:
        A list of requirement values without duplicates, whether or not the
        answer comes from ``dep_cache``.
    """
    if date is None:
        date = get_date(p)

    if (p, date) in dep_cache:
        return dep_cache[(p, date)][1]

    matches = (requirements['package_name'] == p) & (requirements['date'] == date)
    # NOTE(review): missing requirement values (NaN) are passed through
    # unchanged and therefore count as a dependency - confirm this is intended.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(requirements.loc[matches, 'requirement']))

def get_all_deps(p, max_date=None, level=1, old_deps=None):
    """Recursively collect the direct and indirect requirements of ``p``.

    The release of ``p`` used is its latest one on or before ``max_date``;
    each requirement is in turn resolved as of that release's date.

    Args:
        p: Lower-cased package name.
        max_date: Latest release date to consider for ``p`` (``None`` = any).
        level: Current recursion depth (1 for the initial call).
        old_deps: Names already collected higher up the call chain; direct
            requirements found in it are not expanded again.

    Returns:
        ``(all_deps, direct_deps)`` as two lists without duplicates, or
        ``(-1, -1)`` when ``level`` exceeds ``MAX_LEVEL``. If a requirement
        hits the depth limit, the names gathered so far are returned and the
        result is not cached.
    """
    if old_deps is None:
        old_deps = []
    if level > MAX_LEVEL:
        return -1, -1

    date = get_date(p, max_date)

    if (p, date) in dep_cache:
        return dep_cache[(p, date)]

    direct_deps = get_direct_deps(p, date)
    all_deps = []
    for req in direct_deps:
        if req in old_deps:
            continue
        d, _ = get_all_deps(req, date, level + 1, all_deps + old_deps)
        if d == -1:
            # Depth limit reached below this package: return the partial result uncached.
            return list(set(all_deps)), list(set(direct_deps))
        all_deps += d

    all_deps += direct_deps

    result = (list(set(all_deps)), list(set(direct_deps)))
    dep_cache[(p, date)] = result
    return result

def get_transitive_deps(p, max_date=None):
    """Return the requirements of ``p`` that are reached only indirectly.

    Returns:
        The set of names in the full dependency list that are not direct
        requirements of ``p``, or ``-1`` if ``get_all_deps`` reports failure.
    """
    a, d = get_all_deps(p, max_date)
    if a == -1:
        return -1
    return set(a) - set(d)
    
# get_all_deps('matplotlib')
# get_all_deps('actionbar.babble')
# get_all_deps('zope')

# get_transitive_deps('zope')
# get_transitive_deps('actionbar.babble')

# get_direct_deps('matplotlib')

In [None]:
# tran_deps = subset.loc[:,'package_name'].progress_apply(get_transitive_deps)
# tran_deps_lens = tran_deps.apply(lambda x: len(x))
# val_counts = requirements['requirement'].value_counts()
# dir_deps_lens = subset.loc[:,'package_name'].progress_apply(lambda x: len(get_direct_deps(x)))

In [None]:
# print(tran_deps_lens.sum())
# print(dir_deps_lens.sum())
# print(tran_deps_lens.sum() / dir_deps_lens.sum())

# Automated Process

In [None]:
# Earliest and latest dates present in the dataset; they bound the time
# windows analysed below.
release_dates = requirements['date']
first_time, last_time = release_dates.min(), release_dates.max()
print('soonest start time:', first_time)
print('latest start time:', last_time)

In [None]:
def _row_transitive_deps(row):
    """Transitive dependencies of the row's package, using the row's date as cut-off."""
    return get_transitive_deps(row['package_name'], row['date'])

def _row_direct_dep_count(row):
    """Number of direct dependencies of the row's package for the release on the row's date."""
    return len(get_direct_deps(row['package_name'], row['date']))

def _row_ratio(row):
    """Transitive-to-direct dependency count ratio for one row."""
    return row['tran_deps_lens'] / row['dir_deps_lens']

# Per-row dependency metrics; the two progress_apply calls show a progress bar.
requirements.loc[:, 'tran_deps'] = requirements.progress_apply(_row_transitive_deps, axis=1)
requirements.loc[:, 'tran_deps_lens'] = requirements.loc[:, 'tran_deps'].apply(len)
requirements.loc[:, 'dir_deps_lens'] = requirements.apply(_row_direct_dep_count, axis=1)
requirements.loc[:, 'ratios'] = requirements.progress_apply(_row_ratio, axis=1)

In [None]:
# Mean transitive/direct dependency ratio over consecutive time windows.
# Every window contributes exactly one entry to each list, so the three
# lists stay the same length and can be plotted against each other.
project_counts = {'StartDate': [], 'EndDate': [], 'ratios': []}

# Windows start on the first day of the month after the earliest record and
# stop before the month containing the latest record.
start_time = add_a_month(first_time)
end_time = add_three_months(start_time)
while end_time < last_time.replace(day=1):
    # Half-open window [start_time, end_time): a record dated exactly on a
    # window boundary belongs to the window that starts there, instead of
    # being dropped from both neighbouring windows.
    idxs = (requirements['date'] >= start_time) & (requirements['date'] < end_time)
    subset = requirements.loc[idxs, :]

    ratio = subset.loc[:, 'ratios'].mean()
    print('Time:', start_time, '-', ratio)

    project_counts['StartDate'].append(start_time)
    project_counts['EndDate'].append(end_time)
    project_counts['ratios'].append(ratio)

    start_time = end_time
    end_time = add_three_months(start_time)

In [None]:
# Mean transitive-dependency ratio per time window, drawn at each window's start date.
fig, ax = plt.subplots(figsize=(10, 8))

ax.plot(project_counts['StartDate'], project_counts['ratios'])
ax.set_xlabel('Time')
ax.set_ylabel('Transitive Dependency Ratio')