In [1]:
%%html
<style> body { font-family: "Times New Roman"} </style>

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('DATA_CRAN/CRAN_DATA.csv', sep='|', low_memory=False)
df.shape

(87912, 48)

In [4]:
df['BugReports'] = df['BugReports'].str.replace('https//','https://')
df['BugReports'] = df['BugReports'].str.replace('http//','http://')
df['BugReports'] = df['BugReports'].str.replace('\n','')

Obtenemos la última versíón de cada paquete

In [5]:
df_lv = df.dropna(subset=['DatePublication']).copy()
df_lv.set_index(["Package"], inplace=True)
df_lv.sort_values('DatePublication', ascending=False, inplace=True)

In [6]:
def last_version_package_by_year(df):
    return df.iloc[[0]]

# Con la fecha de publicación ordenada, se agrupa por paquete para obtener la última versión publicada del paquete
dfl_lv = df_lv.groupby(level=[0], group_keys=False).apply(last_version_package_by_year)
dfl_lv = dfl_lv.reset_index()
dfl_lv.shape

(14891, 48)

Url de *bugs* reportados

In [7]:
BUGS = dfl_lv[['Package','BugReports']].dropna()
BUGS.shape

(4677, 2)

In [8]:
BUGS.head()

Unnamed: 0,Package,BugReports
5,ABHgenotypeR,http://github.com/StefanReuscher/ABHgenotypeR/...
12,ACMEeqtl,https://github.com/andreyshabalin/ACMEeqtl/issues
13,ACNE,https://github.com/HenrikBengtsson/ACNE/issues
23,ADMMsigma,https://github.com/MGallow/ADMMsigma/issues
25,ADPclust,https://github.com/ethanyxu/ADPclust/issues


In [9]:
regex_ex = r'(.*://\w+[.|]\w+/)' # Extrae la cadena dentro de los caracteres []
BUGS['BugProtocol'] = BUGS['BugReports'].str.extract(regex_ex, expand=True)
BUGS.dropna(inplace=True)

Repositorios en donde se registras los Bugs

In [10]:
BUGS.groupby('BugProtocol')[['BugReports']].count()

Unnamed: 0_level_0,BugReports
BugProtocol,Unnamed: 1_level_1
http://github.com/,298
http://rfaqs.com/,3
https://bitbucket.com/,2
https://bitbucket.org/,62
https://github.com/,4115
https://gitlab.com/,44
https://goo.gl/,1
https://notabug.org/,2
https://sourceforge.net/,2
https://stackoverflow.com/,1


Extraemos el número de Bugs de los paquetes

In [349]:
from time import sleep, time
from multiprocessing import Pool, cpu_count
from bs4 import BeautifulSoup
import requests
import urllib.request
import datetime
import json
import asyncio, re
from aiohttp import ClientSession
loop = asyncio.get_event_loop()

In [398]:
MAX_SIM_CONNS = 100

async def fetch(url, pkg, protocol, session):
    async with session.get(url) as response:
         text = await response.read()
    
    data = {}
    soup = BeautifulSoup(text, "lxml")
    
    if (protocol == 'https://github.com/'):
        data['Repository'] = 'https://github.com/'
        for main in soup.findAll('main'):
            pkg = main.findAll('strong', itemprop="name")
            print(len(pkg))
            if (len(pkg) > 0):
                numBugs = main.find('span', class_='Counter')

                if numBugs != None:
                    data['Package'] = pkg[0].get_text()
                    data['Bugs'] = numBugs.get_text()
                else:
                    data['Package'] = pkg[0].get_text()
                    data['Bugs'] = 0
    
    if (protocol == 'http://github.com/'):
        
        for main in soup.findAll('main'):
            pkg = main.findAll('strong', itemprop="name")
            data['Repository'] = 'http://github.com/'
            if (len(pkg) > 0):
                numBugs = main.find('span', class_='Counter')

                if numBugs != None:
                    data['Package'] = pkg[0].get_text()
                    data['Bugs'] = numBugs.get_text()
                else:
                    data['Package'] = pkg[0].get_text()
                    data['Bugs'] = 0

    if (protocol == 'https://bitbucket.org/'):
        #data['Package'] = pkg
        numBugs = soup.find('span', class_='secondary')
        
        if numBugs != None:
            value = numBugs.get_text()
            data['Bugs'] = value.replace(r'(\d+.*(of )|\)|\()','').strip()
        else:
            data['Bugs'] = 0
    
    if (protocol == 'https://gitlab.com/'):
        #data['Package'] = pkg
        numBugs = soup.find('span', class_='issue_counter')
        if numBugs != None:
            data['Bugs'] = numBugs.get_text().replace(r'\n','').strip()
        else:
            data['Bugs'] = 0

    return data

df_Bugs = []

async def bound_fetch(sem, url, pkg, protocol, session):
    async with sem:
        res = await fetch(url, pkg, protocol, session)
        df_Bugs.append(res)
        
async def fetch_all():
    tasks = set()
    async with ClientSession() as session:
        sem = asyncio.Semaphore(MAX_SIM_CONNS)
        
        for index, row in BUGS.iterrows(): # all
        #for index, row in BUGS[BUGS['Package']=='ggplot2'].iterrows(): # all
            url_pkg = row['BugReports']
            pkg = row['Package']
            protocol = row['BugProtocol']
            
            task = asyncio.create_task(bound_fetch(sem, url_pkg, pkg, protocol, session))
            tasks.add(task)
        
            #await asyncio.sleep(1)
        return await asyncio.gather(*tasks)

if __name__ == '__main__':
    start_time = time()
    
    loop = asyncio.get_event_loop()
    loop.create_task(fetch_all())
    
    end_time = time()
    elapsed_time = end_time - start_time
    
print(f'Elapsed run time: {elapsed_time} seconds')

Elapsed run time: 2.7179718017578125e-05 seconds
1
1
1
1
1
1
1
1


In [404]:
len(df_Bugs)

1671

In [405]:
dfl_B = pd.DataFrame.from_dict(df_Bugs)
dfl_B.shape

(1717, 3)

In [406]:
dfl_B.isna().sum()

Bugs          1162
Package       1209
Repository     142
dtype: int64

In [407]:
dfl_B.head()

Unnamed: 0,Bugs,Package,Repository
0,2,bh,https://github.com/
1,0,Bclim,https://github.com/
2,1,BIGDAWG,https://github.com/
3,0,AGD,https://github.com/
4,6,CGPfunctions,https://github.com/


In [408]:
colnames = dfl_B.columns.tolist()
dfl_B.to_csv("DATA_CRAN/BUGS_REPORTS.csv", header=colnames, sep="|", index=False)

In [409]:
BUGS[BUGS['Package']=='ggplot2']

Unnamed: 0,Package,BugReports,BugProtocol
8064,ggplot2,https://github.com/tidyverse/ggplot2/issues,https://github.com/
