# How do package dependency network grow over time?

In this notebook we plan to investigate the dependencies in the Pypi dataset and try to look at
how they change over time.

## Loading data and importing libraries

In [1]:
%load_ext autoreload

# Auto reloading causes the kernel to reload the libraries we have
%autoreload 2

# usual imports for visualization, etc.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime

# make it reproducible
np.random.seed(0)

# show plots inline
%matplotlib inline

In [2]:
# data_folder = '../data/'
data_folder = 'https://zenodo.org/record/2620607/files/'

df_dep = pd.read_csv(data_folder + 'pypi_dependencies.csv', error_bad_lines=False, warn_bad_lines=False)
df_proj = pd.read_csv(data_folder + 'pypi_projects.csv', error_bad_lines=False, warn_bad_lines=False,low_memory=False)
df_repo = pd.read_csv(data_folder + 'pypi_projects_with_repository_fields.csv', error_bad_lines=False, warn_bad_lines=False,low_memory=False)
df_ver = pd.read_csv(data_folder + 'pypi_versions.csv', error_bad_lines=False, warn_bad_lines=False,low_memory=False)

for df in [df_dep, df_proj, df_repo, df_ver]:
    time_fields = []
    for col in df.columns:
        if 'timestamp' in col:
            time_fields.append(col)
    for time_field in time_fields:
        try:
            df[time_field] = df[time_field].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        except:
            pass
        
del df

## Defining helper functions and defaults

In [3]:
# Helper Functions
def add_three_months(t):
    for i in range(3):
        t = add_a_month(t)
    return t
def add_a_month(t):
    return (t.replace(day=1) + datetime.timedelta(days=31)).replace(day=1)
def print_time(t):
    return t.strftime("%Y-%m-%dT%H:%M:%S")

!mkdir figs
figs_dir = 'figs/'

mkdir: cannot create directory ‘figs’: File exists


In [4]:
from cycler import cycler
def set_plt_rc():
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    
    font = {'family': 'serif','size': BIGGER_SIZE}
    plt.rc('font', **font)

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
        
    plt.rc('axes', prop_cycle=(cycler(color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
                                         '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
                                         '#bcbd22', '#17becf']) +
                           cycler(linestyle=['-', '--', ':', '-.','-', '--', ':', '-.','-', '--'])))
    
set_plt_rc()

# Transitive Dependencies

In [219]:
data_folder = '../data/'

requirements = pd.read_csv(data_folder + 'reqs.csv')

subset = pd.read_csv(data_folder + 'data_2018.csv')
subset.loc[:,'package_name'] = subset.loc[:,'name']
subset.loc[:,'requirement'] = subset.loc[:,'deps']

requirements.loc[:,'package_name'] = requirements.loc[:,'package_name'].apply(lambda x: x.lower() if type(x)==str else x)
requirements.loc[:,'requirement'] = requirements.loc[:,'requirement'].apply(lambda x: x.lower() if type(x)==str else x)

requirements.head()

Unnamed: 0,package_name,requirement
0,0.0.1,tensorflow
1,0.0.1,pandas
2,115wangpan,pycurl
3,115wangpan,six
4,115wangpan,humanize


In [220]:
from tqdm.autonotebook import tqdm

tqdm.pandas()

In [221]:
dep_cache = {}

def get_direct_deps(p):
    direct_deps = list(requirements.loc[requirements.loc[:, 'package_name'] == p, 'requirement'])
#     print('direct_deps', p, direct_deps)
    return direct_deps

def get_all_deps(p, level=1, old_deps=None):
    if old_deps is None:
        old_deps = []
    if level > 5:
        return -1, -1
    if p in dep_cache:
        return dep_cache[p]
    
#     print(p)
    
    direct_deps = get_direct_deps(p)
    all_deps = []
    for req in direct_deps:
        if req in old_deps:
            continue
        d, _ = get_all_deps(req, level+1, all_deps + old_deps)
        if d == -1:
            return list(set(all_deps)), list(set(direct_deps))
#             return -1, -1
        all_deps += d
    
    all_deps += direct_deps
    
    dep_cache[p] = (list(set(all_deps)), list(set(direct_deps)))
    return list(set(all_deps)), list(set(direct_deps))

def get_transitive_deps(p):
    a,d = get_all_deps(p)
    if a == -1:
        return -1
    return set(a) - set(d)
    
# get_all_deps('matplotlib')
# get_all_deps('actionbar.babble')

# get_transitive_deps('zope')
# get_transitive_deps('actionbar.babble')

In [None]:
tran_deps = subset.loc[:,'package_name'].progress_apply(get_transitive_deps)

HBox(children=(IntProgress(value=0, max=17417), HTML(value='')))

In [None]:
tran_deps_lens = tran_deps.apply(lambda x: len(x))
val_counts = requirements['requirement'].value_counts()
dir_deps_lens = subset.loc[:,'package_name'].progress_apply(lambda x: len(get_direct_deps(x)))

In [None]:
print(tran_deps_lens.sum())
print(dir_deps_lens.sum())
print(tran_deps_lens.sum() / dir_deps_lens.sum())

In [117]:
trans_deps = []

idx = requirements['package_name'].isin(subset['package_name'])

transreqs = requirements.loc[idx,:]
transreqs.loc[:,'counts'] = transreqs.apply(lambda x: 1, axis=1)

direct_deps = requirements.shape[0]

transreqs_old = requirements
transreqs_old.loc[:,'counts'] = transreqs_old.apply(lambda x: 1, axis=1)
last_count = 0
for i in range(100):
    # look at next level
#     transreqs = transreqs[transreqs_old['package_name'].isin(transreqs_old['requirement'])]
    transreqs = transreqs_old[transreqs_old['package_name'].isin(transreqs['requirement'])]
    
    val_counts = transreqs_old['requirement'].value_counts()
    transreqs.loc[:,'counts'] = transreqs['package_name'].progress_apply(lambda x: val_counts[x])

#     transreqs.loc[:,'counts'] = transreqs['package_name'].progress_apply(
#         lambda x: transreqs_old.loc[transreqs_old['requirement'] == x, 'counts'].sum()
#     )
    
#     if i > 0:
#         all_groups_counts = transreqs_old.groupby('requirement')['counts'].sum()
#         transreqs.loc[:,'counts'] = transreqs['package_name'].progress_apply(
#             lambda x: all_groups_counts[x]
#         )
#     else:
#         val_counts = transreqs_old['requirement'].value_counts()
#         transreqs.loc[:,'counts'] = transreqs['package_name'].progress_apply(lambda x: val_counts[x])
    
    new_count = transreqs.shape[0]
    
    trans_deps += [transreqs.loc[:,'counts'].sum()]
    
    if new_count == last_count:
        print('maximum depth until got to loops:', i)
        break
    last_count = new_count
    transreqs_old = transreqs
    
print(direct_deps)
print(trans_deps)
# print(trans_deps / direct_deps)

HBox(children=(IntProgress(value=0, max=14047), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7951), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5814), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4938), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4586), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4392), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4233), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4140), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4096), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4048), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3979), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3901), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3860), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3853), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3853), HTML(value='')))

maximum depth until got to loops: 14
212170
[407847, 65686, 45305, 35734, 32057, 30744, 30077, 29363, 28830, 28545, 28267, 27927, 27571, 27415, 27407]


In [128]:
direct_deps = requirements.shape[0]

trans_deps = 0

idx = requirements['package_name'] == 'matplotlib'

transreqs = requirements.loc[idx,:]
transreqs.loc[:,'counts'] = transreqs.apply(lambda x: 1, axis=1)
transreqs_old = requirements
transreqs_old.loc[:,'counts'] = transreqs_old.apply(lambda x: 1, axis=1)
last_count = 0

transreqs

i = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [129]:
transreqs = transreqs_old[transreqs_old['package_name'].isin(transreqs['requirement'])]

all_groups_counts = transreqs_old.groupby('requirement')['counts'].sum()
transreqs.loc[:,'counts'] = transreqs['package_name'].progress_apply(
    lambda x: all_groups_counts[x]
)

new_count = transreqs.shape[0]

trans_deps += transreqs.loc[:,'counts'].sum()

if new_count == last_count:
    print('maximum depth until got to loops:', i)
last_count = new_count
transreqs_old = transreqs


i += 1
transreqs

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Unnamed: 0,package_name,requirement,counts
151627,python-dateutil,six,2086


Unnamed: 0,package_name,requirement,counts
