In [11]:
import ast

import pandas
import tqdm
from packaging.requirements import Requirement, InvalidRequirement
from packaging.utils import canonicalize_name

In [3]:
# Relative path of the CSV file holding the package metadata.
INPUT_PACKAGES = '../data/metadata.csv'

# Load only the columns used by the cells below; the upload time is parsed
# into datetimes while reading.
data = pandas.read_csv(
    INPUT_PACKAGES,
    usecols=[
        'info_name',
        'info_version',
        'info_requires',
        'info_requires_dist',
        'urls_upload_time',
    ],
    parse_dates=['urls_upload_time'],
)

In [7]:
# One row per release: keep the name, version and upload time columns,
# give them shorter names, and index the frame by (package, version).
packages = (
    data[['info_name', 'info_version', 'urls_upload_time']]
    .rename(columns={
        'info_name': 'package',
        'info_version': 'version',
        'urls_upload_time': 'time',
    })
    .set_index(['package', 'version'])
)

In [12]:
def _parse_requirement_list(raw):
    """Return the requirement strings stored in one metadata cell.

    ``raw`` is expected to be the textual form of a Python list of strings
    (presumably how the metadata CSV was written -- confirm against the
    producer of the file). Anything else -- a missing value, malformed text,
    or a literal that is not a list/tuple -- yields an empty list, so that
    row simply contributes no dependencies.

    ``ast.literal_eval`` is used instead of ``eval`` so that text coming
    from the CSV can never execute code.
    """
    if not isinstance(raw, str):
        # Missing cells are not strings; there is nothing to parse.
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Errors documented for literal_eval on malformed input.
        return []
    if not isinstance(parsed, (list, tuple)):
        # A bare string would otherwise be split into single characters.
        return []
    # Non-string entries cannot be parsed as requirements; drop them.
    return [item for item in parsed if isinstance(item, str)]


def _dependency_name(spec):
    """Return the canonical project name of a requirement string.

    When ``spec`` is not a valid requirement it is returned unchanged, so
    the dependency is still recorded (best effort) rather than lost.
    """
    try:
        return canonicalize_name(Requirement(spec).name)
    except InvalidRequirement:
        return spec


# One (package, version, dependency) tuple per declared dependency.
rows = []

for row in tqdm.tqdm_notebook(data.itertuples(), total=len(data)):
    # Requirements may be declared in either (or both) of the two columns.
    specs = (_parse_requirement_list(row.info_requires)
             + _parse_requirement_list(row.info_requires_dist))
    rows.extend(
        (row.info_name, row.info_version, _dependency_name(spec))
        for spec in specs
    )

deps = pandas.DataFrame(rows, columns=['package', 'version', 'dependency'])




In [14]:
# Persist both tables as CSV files.
# `packages` is written with its index; `deps` is written without one.
packages.to_csv(path_or_buf='../data/packages.csv')
deps.to_csv(path_or_buf='../data/deps.csv', index=False)