# Merging GitHub and CRAN data

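**TODO:** Add the *real* CRAN release date. For now, the `Date` of a CRAN package is the date of the first snapshot in which it appears (`SnapshotFirstDate`).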

In [1]:
import pandas

In [6]:
# Load the two input datasets with `pandas.read_csv`; the former
# `DataFrame.from_csv` constructor was deprecated and then removed from pandas.
# With `index_col=None` no CSV column is used as the index, so there is no
# index for from_csv's implicit date parsing to act on and read_csv's defaults
# give the same frame.
#
# GitHub rows: drop the 'Unnamed: 0' column (presumably a stale index column
# written by an earlier export -- TODO confirm) and discard rows that lack a
# package name or a version.
github = (
    pandas.read_csv('../data/github-raw-150601.csv', index_col=None)
    .drop(labels='Unnamed: 0', axis=1)
    .dropna(subset=['Package', 'Version'])
)

# CRAN rows are used as-is.
cran = pandas.read_csv('../data/cran-150601.csv', index_col=None)

In [None]:
# Columns kept in the merged dataset, in the order they are written out.
fields = [
    'Package',
    'Version',
    'Source',
    'Date',
    'Author',
    'Authors',
    'License',
    'Suggests',
    'Imports',
    'Depends',
    'Owner',
    'Repository',
    'CommitDate',
    'CRANRelease',
    'SnapshotFirstDate',
    'SnapshotLastDate',
]

# Path of the CSV file that receives the merged dataset.
OUTPUT = '../data/github-cran-150601.csv'

In [None]:
def parse_dependencies(str_list, ignored=()):
    """
    Return the list of package names found in a DESCRIPTION dependency field.

    `str_list` is the raw, comma-separated value of a dependency field, for
    instance ``"R (>= 2.10), methods"``. For every entry, anything from the
    first opening parenthesis onwards (the version constraint) is dropped and
    surrounding whitespace is stripped. Empty entries and names contained in
    `ignored` are left out.

    A missing value (``None`` or NaN) yields an empty list.
    """
    # NaN never compares equal to anything (not even itself), so a `!=` test
    # cannot detect it; isnull() handles both NaN and None.
    if pandas.isnull(str_list):
        return []

    # Keep only the part before the version constraint of each entry.
    names = (dep.split('(')[0].strip() for dep in str_list.split(','))

    # Build a real list so the result does not depend on the Python version.
    return [name for name in names if name and name not in ignored]

In [None]:
# Tag every row with its origin and pick the date that represents it:
# the first snapshot date for CRAN, the commit date for GitHub.
cran['Source'] = 'cran'
cran['Date'] = cran['SnapshotFirstDate']
github['Source'] = 'github'
github['Date'] = github['CommitDate']

# Merge
packages = pandas.concat([cran, github])


# Deal with dependencies lists: each field becomes a space-separated string of
# bare package names (version constraints removed). Missing values are turned
# into '' first, so they end up as empty strings.
def dependencies_formatter(raw):
    return ' '.join(parse_dependencies(raw))


for field in ['Suggests', 'Imports', 'Depends']:
    packages[field] = packages[field].fillna(value='').apply(dependencies_formatter)

# Convert date
packages['Date'] = pandas.to_datetime(packages['Date'])

# Remove useless packages (see http://cran.r-project.org/doc/manuals/r-release/R-exts.html#Creating-R-packages)
# The mandatory ‘Package’ field gives the name of the package.
# This should contain only (ASCII) letters, numbers and dot, have at least two characters and
# start with a letter and not end in a dot.
packages = packages.dropna(subset=['Version', 'Package', 'Date'])
# The last character class excludes the dot, so that names ending in a dot are
# rejected as the rule above requires.
packages = packages[packages.Package.str.match(r'^[a-zA-Z][a-zA-Z0-9.]*[a-zA-Z0-9]$')]

# Keep the output columns only, ordered by package name. `sort_values` replaces
# `DataFrame.sort`, which has been removed from pandas.
output = packages[fields].sort_values('Package')

In [None]:
# Write the merged dataset to the OUTPUT path as a UTF-8 encoded CSV file.
output.to_csv(path_or_buf=OUTPUT, encoding='utf-8')