In [76]:
import pandas
import networkx
import matplotlib
import seaborn

from packaging.requirements import Requirement, InvalidRequirement
from packaging.utils import canonicalize_name

from collections import Counter

%matplotlib inline

In [61]:
INPUT_PACKAGES = '../data/metadata.csv'
OUTPUT_GRAPH = '../data/latest-graph.gml'

df = pandas.read_csv(INPUT_PACKAGES, 
                    usecols=['info_name', 'info_requires', 'info_requires_dist', 'urls_upload_time'], 
                    parse_dates=['urls_upload_time'])

In [62]:
latests = df.dropna(subset=['urls_upload_time']).groupby('info_name').urls_upload_time.max()
data = latests.reset_index().merge(df, how='left', on=['info_name', 'urls_upload_time'])

In [63]:
packages = {}

for row in data.itertuples():
    deps = []
    # Parse requirements
    try:
        deps.extend(eval(row.info_requires))
    except Exception:
        pass
    try:
        deps.extend(eval(row.info_requires_dist))
    except Exception:
        pass
    
    # Clean requirements
    cleaned_deps = set()
    for dep in deps:
        try:
            cleaned_deps.add(canonicalize_name(Requirement(dep).name))
        except InvalidRequirement:
            pass
        
    packages[row.info_name] = cleaned_deps

In [67]:
graph = networkx.DiGraph()
graph.add_nodes_from(packages.keys())
for source, deps in packages.items():
    graph.add_edges_from((source, target) for target in deps)
networkx.write_gml(graph, OUTPUT_GRAPH)

# Overview

In [91]:
print('Number of packages', graph.order())
print('Number of dependencies', graph.size())
print('Strongly connected components', networkx.number_strongly_connected_components(graph))
print('(Weakly) connected components', networkx.number_weakly_connected_components(graph))
print('Number of non-isolated package', len([True for n, d in graph.degree_iter() if d > 0]))
print('Number of packages with dependencies', len([True for n, d in graph.out_degree_iter() if d > 0]))
print('Number of packages with reverse dependencies', len([True for n, d in graph.in_degree_iter() if d > 0]))

Number of packages 75029
Number of dependencies 17877
Strongly connected components 75015
(Weakly) connected components 67502
Number of non-isolated package 7735
Number of packages with dependencies 5198
Number of packages with reverse dependencies 3067


In [92]:
print('Weakly connected components size:', 
      Counter(len(component) for component in networkx.weakly_connected_components(graph)))
print('Strongly connected components size:', 
      Counter(len(component) for component in networkx.strongly_connected_components(graph)))

Weakly connected components size: Counter({1: 67295, 2: 167, 3: 23, 4: 7, 5: 5, 6: 2, 9: 1, 7226: 1, 31: 1})
Strongly connected components size: Counter({1: 75007, 2: 5, 3: 2, 6: 1})


In [97]:
print('Average shortest path length for weakly connected components:')
for component in networkx.weakly_connected_component_subgraphs(graph):
    try:
        print(networkx.average_shortest_path_length(component), end=' ')
    except ZeroDivisionError:
        pass

Average shortest path length for weakly connected components:
0.0010609515618819774 0.5 0.5 0.5 0.035483870967741936 0.5 0.25 0.5 0.5 0.5 0.3333333333333333 0.3333333333333333 0.3333333333333333 0.5 0.25 0.2 0.3333333333333333 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.3333333333333333 0.3333333333333333 0.3333333333333333 0.5 0.25 0.5 0.5 0.5 0.5 0.2 0.5 0.4166666666666667 0.2 0.5 0.5 0.16666666666666666 0.5 0.3333333333333333 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.3333333333333333 0.3333333333333333 0.5 0.3333333333333333 0.5 0.5 0.5 0.3333333333333333 0.2 0.5 0.5 0.5 0.5 0.5 0.5 0.3333333333333333 0.5 0.3333333333333333 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.25 0.5 0.5 0.5 0.5 0.5 0.5 0.1111111111111111 0.5 0.5 0.2 0.5 0.16666666666666666 0.5 0.5 0.5 0.5 0.3333333333333333 0.5 0.3333333333333333 0.3333333333333333 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.3333333333333333 0.5 0.5 0.5 0.3333333333333333 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.6666666666666666 0.5 0.5 0.5 0.5 0

In [99]:
print('Average clustering coefficient:', networkx.average_clustering(graph.to_undirected()))

Average clustering coefficient: 0.0049766830210980755
