In [12]:
# Required Packages
import json
import datetime

import pandas as pd
import numpy as np

import string

In [13]:
# Show the digit characters that clean() combines with '.' as its allowed set
string.digits

'0123456789'

#### Description of the extracted information:
- Package name
- `requires_python` (the Python version constraints declared by the package)
- `requires_dist` (the package's dependencies on other packages)
- Summary (similar to the Title variable for R)
- Version
- Email and `maintainer_email`
- Upload time (used as an approximation of when the package was created)
- Organization (captured from either email or `maintainer_email`)
- Downloads

In [14]:
# Function to clean version numbers: keep only digits and dots
def clean(version):
    """Return ``version`` with every character outside ``0-9`` and ``.`` dropped.

    Example: ``'1.0.0rc1'`` becomes ``'1.0.01'``.
    """
    permitted = string.digits + '.'
    return ''.join(filter(lambda ch: ch in permitted, version))

In [15]:
# Function to flatten the metadata and extract the relevant information
def flatten_metadata(m):
    """Parse one JSON metadata document and extract the fields of interest.

    Parameters
    ----------
    m : str
        JSON text with an ``info`` object and a ``releases`` mapping of
        version string -> list of file entries, each carrying ``upload_time``
        formatted as ``%Y-%m-%dT%H:%M:%S``.

    Returns
    -------
    tuple
        ``(python_versions, dependencies, project_urls, versions, created_at,
        summary, author)`` where ``versions`` is the list of release keys in
        document order and ``created_at`` is the earliest upload time of the
        numerically lowest release, formatted ``%Y-%m-%d %H:%M:%S``.
        ``created_at`` is ``None`` when there are no releases or the chosen
        release has no files. ``summary`` and ``author`` are ``None`` when
        absent from ``info``.

    Raises
    ------
    KeyError
        If ``info``, ``releases``, ``requires_python``, ``requires_dist``,
        ``project_urls`` or a file entry's ``upload_time`` is missing.
    """
    dd = json.loads(m)
    info = dd['info']
    releases = dd['releases']
    python_versions = info['requires_python']
    dependencies = info['requires_dist']
    project_urls = info['project_urls']
    versions = list(releases.keys())

    def numeric_key(version):
        # Compare versions by their integer components, e.g. '10.0' > '2.0',
        # after stripping non-numeric characters with clean().
        return [int(part) for part in clean(version).split('.')]

    if not versions:
        # No releases at all: nothing to date the package by.
        first_version = None
    else:
        try:
            # Pick the original release key with the lowest numeric version.
            # The previous code sorted a cleaned copy but then always used
            # versions[0], so the sort had no effect.
            first_version = min(versions, key=numeric_key)
        except ValueError:
            # A version that does not reduce to dot-separated integers
            # (e.g. '1.0.dev' -> '1.0.') cannot be ordered; fall back to the
            # first listed release, as before.
            first_version = versions[0]

    files = releases[first_version] if first_version is not None else []
    dates = [datetime.datetime.strptime(f['upload_time'], '%Y-%m-%dT%H:%M:%S') for f in files]
    created_at = min(dates).strftime('%Y-%m-%d %H:%M:%S') if dates else None

    # Optional fields: missing keys yield None.
    summary = info.get('summary')
    author = info.get('author')
    return python_versions, dependencies, project_urls, versions, created_at, summary, author

In [16]:
# Write a header-only CSV; the chunked processing cell appends its rows to it
cols = [
    'PackageName',
    'Metadata',
    'python_versions',
    'dependencies',
    'project_urls',
    'versions',
    'created_at',
    'summary',
    'author',
]
df = pd.DataFrame(columns=cols)

# NOTE(review): absolute, machine-specific output path
flattened_csv_path = '/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Python_data_collection/PyPi_metadata_flattened_wh.csv'
df.to_csv(flattened_csv_path, index=False)

In [17]:
# Process the raw metadata CSV in chunks and append each flattened chunk
# (without a header row) to the output CSV.
chunksize = 100000
raw_csv = "/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Python_data_collection/PyPi_metadata.csv"
flattened_csv = "/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Python_data_collection/PyPi_metadata_flattened_wh.csv"
for df in pd.read_csv(raw_csv, chunksize=chunksize):
    # One tuple per row from flatten_metadata; zip(*...) turns them into one
    # sequence per output column.
    extracted = df.Metadata.apply(flatten_metadata)
    (
        df['python_versions'],
        df['dependencies'],
        df['project_urls'],
        df['versions'],
        df['created_at'],
        df['summary'],
        df['author'],
    ) = zip(*extracted)
    df.to_csv(flattened_csv, mode='a', header=False, index=False)

In [18]:
# Check the column names of `df` (the last chunk left by the loop above)
df.columns

Index(['PackageName', 'Metadata', 'python_versions', 'dependencies',
       'project_urls', 'versions', 'created_at', 'summary', 'author'],
      dtype='object')

In [21]:
# Load only the first `chunksize` rows of the flattened CSV for inspection.
# `nrows` reads them directly; iterating a chunked reader and breaking after
# the first chunk left the reader (and its file handle) open.
chunksize = 2000
df = pd.read_csv("/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Python_data_collection/PyPi_metadata_flattened_wh.csv", nrows=chunksize)

In [22]:
# Display the rows loaded into `df` by the previous cell
df

Unnamed: 0,PackageName,Metadata,python_versions,dependencies,project_urls,versions,created_at,summary,author
0,ghicon,"{""info"":{""author"":""Ali Azam"",""author_email"":""a...",,,{'Download': 'https://github.com/agzg/ghicon/a...,"['0.0.0', '0.0.1', '4.2.0', '5.2.0', '5.2.1', ...",2022-06-26 20:16:32,GitHubesque identicon generator.,Ali Azam
1,hippo-dev,"{""info"":{""author"":""Ben Pearce"",""author_email"":...",>=3.8,,{'Homepage': 'https://idea-org.github.io/'},['0.0.1'],2022-08-26 12:44:58,,Ben Pearce
2,procedural,"{""info"":{""author"":""Christian Kongsgaard"",""auth...",">=3.6,<4.0","['requests (>=2.24.0,<3.0.0)']",{'Homepage': 'https://github.com/procedural-bu...,"['0.1.0', '0.1.1']",2020-07-15 13:51:53,A Python client for interacting with Procedura...,Christian Kongsgaard
3,teu,"{""info"":{""author"":""Pritam Soni"",""author_email""...",">=3.9,<4.0",,,['0.1.0'],2022-09-13 21:03:43,,Pritam Soni
4,reef-interface,"{""info"":{""author"":""Reef Finance"",""author_email...",">=3.6, <4","['websocket-client (~=0.58.0)', 'base58 (~=2.0...",,"['0.1.0', '0.1.1', '0.1.2', '0.1.3', '1.0.0', ...",2021-07-07 16:34:14,Library for interfacing with a Reef node,Reef Finance
...,...,...,...,...,...,...,...,...,...
1995,CircuitSAT,"{""info"":{""author"":""Oscar Riveros"",""author_emai...",,,{'Homepage': 'https://twitter.com/maxtuno'},"['0.0.0', '0.0.1', '0.0.3', '0.0.4', '0.0.5', ...",2022-06-24 07:13:37,A Circuit SAT Language that emit CNF code.,Oscar Riveros
1996,Kpa,"{""info"":{""author"":""Peter VandeHaar"",""author_em...",>=3.4,['boltons (~=20.2)'],{'Homepage': 'https://github.com/pjvandehaar/k...,"['1.0.0', '1.0.1', '1.0.10', '1.0.11', '1.0.12...",2018-06-04 02:50:59,<forthcoming>,Peter VandeHaar
1997,odoo10-addon-l10n-it-fatturapa-in-rc,"{""info"":{""author"":""Efatto.it di Sergio Corato,...",~=2.7,"['odoo10-addon-l10n-it-account-tax-kind', 'odo...",{'Homepage': 'https://github.com/OCA/l10n-ital...,"['10.0.1.1.0', '10.0.1.1.1', '10.0.2.0.0', '10...",2019-08-06 04:53:57,Modulo di collegamento tra e-fattura in acquis...,"Efatto.it di Sergio Corato, Odoo Community Ass..."
1998,lhutils,"{""info"":{""author"":"""",""author_email"":""lh <tasbo...",>=3.7,,{'Bug Tracker': 'https://github.com/tasbox/lhu...,"['0.0.3', '0.0.4', '0.0.5']",2022-07-12 10:33:55,对常用代码进行封装，防止自己重复造轮子。,
