# R Packages Maintainer

In [83]:
import pandas
import re

from matplotlib import pyplot as plt

%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')

data = pandas.DataFrame.from_csv('../data/github-cran-150601.csv')
data['Date'] = pandas.to_datetime(data['Date'])

In [84]:
print len(data.drop_duplicates('Maintainer')), 'maintainer values'

8533 maintainer values


In [85]:
def clean_maintainer(v):
    v = v if isinstance(v, str) else ''
    words = map(str.strip, v.split(' '))
    words = map(lambda w: re.sub(r'[^@a-zA-Z]', '', w), words)
    words = filter(lambda w: len(w) > 2 and '@' not in w, words)
    words.sort()
    return ' '.join(words)

data['Maintainer_can'] = data['Maintainer'].apply(clean_maintainer)
print len(data.drop_duplicates('Maintainer_can')), 'canonical maintainer values'

6044 canonical maintainer values


In [103]:
_maint_first = data.sort('Date').drop_duplicates('Maintainer_can').set_index('Maintainer_can')[['Date', 'InGitHub', 'InCRAN']]
_maint_last = data.sort('Date', ascending=False).drop_duplicates('Maintainer_can').set_index('Maintainer_can')[['Date', 'InGitHub', 'InCRAN']]

github = data.query('InGitHub == 1').sort('Date')
cran = data.query('InCRAN == 1').sort('Date')

__f = (lambda d,rev,name: d.drop_duplicates('Maintainer_can', take_last=rev)
               .set_index('Maintainer_can')[['Date']]
               .rename(columns={'Date':name}))

maintainers = __f(github, False, 'GitHubFirstDate').join(
    [__f(github, True, 'GitHubLastDate'), __f(cran, False, 'CRANFirstDate'), __f(cran, True, 'CRANLastDate'),],
    how='outer')

In [166]:
maintainers['DaysOnGitHub'] = maintainers['GitHubLastDate'] - maintainers['GitHubFirstDate']
maintainers['DaysOnCRAN'] = maintainers['CRANLastDate'] - maintainers['CRANFirstDate']
# We consider CRAN first
maintainers['CRANFirst-->GHFirst'] = maintainers['GitHubFirstDate'] - maintainers['CRANFirstDate'] 
maintainers['CRANLast-->GitHubLast'] = maintainers['GitHubLastDate'] - maintainers['CRANLastDate']
maintainers['CRANLast-->GitHubFirst'] = maintainers['GitHubFirstDate'] - maintainers['CRANLastDate']
maintainers['CRANFirst-->GitHubLast'] = maintainers['GitHubLastDate'] - maintainers['CRANFirstDate']

fields = ['DaysOnGitHub', 'DaysOnCRAN', 'CRANFirst-->GHFirst', 'CRANLast-->GitHubLast', 'CRANLast-->GitHubFirst', 'CRANFirst-->GitHubLast']
for field in fields:
    maintainers[field] = maintainers[field].astype('timedelta64[D]')

In [167]:
from collections import OrderedDict

testdata = OrderedDict([
    ('all', maintainers), 
    ('on CRAN', maintainers.dropna(subset=['CRANFirstDate', 'CRANLastDate'])),
    ('on GH', maintainers.dropna(subset=['GitHubFirstDate', 'GitHubLastDate'])),
    ('on both', maintainers.dropna()),
    ('on both at least 1 minute', maintainers.dropna().query('GitHubFirstDate <= CRANLastDate and GitHubLastDate >= CRANFirstDate')),
    ('first on CRAN', maintainers.dropna().query('CRANFirstDate <= GitHubFirstDate')),
    ('first on GH', maintainers.dropna().query('GitHubFirstDate <= CRANFirstDate')),
    ('migrate from GH to CRAN', maintainers.dropna().query('GitHubLastDate <= CRANFirstDate')),
    ('migrate from CRAN to GH', maintainers.dropna().query('CRANLastDate <= GitHubFirstDate')),
])

for key, value in testdata.iteritems():
    print key
    print len(value), 'maintainers of', len(data[data['Maintainer_can'].isin(value.index)].drop_duplicates(subset=['Package'])), 'packages'
    for field in fields:
        print field, '| mean:', np.mean(value[field]), '| median:', np.median(value[field]), '| stddev:', np.std(value[field])
    print

all
6044 maintainers of 11442 packages
DaysOnGitHub | mean: 275.196850394 | median: nan | stddev: 452.203533009
DaysOnCRAN | mean: 817.769325658 | median: 710.0 | stddev: 1084.83214477
CRANFirst-->GHFirst | mean: 597.544827586 | median: nan | stddev: 1073.18309637
CRANLast-->GitHubLast | mean: 103.053793103 | median: nan | stddev: 395.991648993
CRANLast-->GitHubFirst | mean: -355.652413793 | median: nan | stddev: 707.211612879
CRANFirst-->GitHubLast | mean: 1056.27310345 | median: nan | stddev: 1169.11888775

on CRAN
4864 maintainers of 10125 packages
DaysOnGitHub | mean: 458.275862069 | median: nan | stddev: 581.269491571
DaysOnCRAN | mean: 817.769325658 | median: 380.5 | stddev: 1084.83214477
CRANFirst-->GHFirst | mean: 597.544827586 | median: nan | stddev: 1073.18309637
CRANLast-->GitHubLast | mean: 103.053793103 | median: nan | stddev: 395.991648993
CRANLast-->GitHubFirst | mean: -355.652413793 | median: nan | stddev: 707.211612879
CRANFirst-->GitHubLast | mean: 1056.27310345 | med