In [1]:
import requests
import requests_futures.sessions  
from tqdm.notebook import tqdm
import xmltodict
import lxml.html

import concurrent.futures
import random
import collections
import json

In [2]:
def parse_xml(xml_data: str):
    """Parse an XML string into plain built-in containers.

    The structure returned by ``xmltodict.parse`` is walked recursively:
    every mapping is rebuilt as a regular ``dict``, every tuple or list is
    rebuilt as a ``list``, and any other value is passed through unchanged.
    """
    def to_plain(node):
        # OrderedDict is a dict subclass, so a single check covers both.
        if isinstance(node, dict):
            plain = {}
            for name, child in node.items():
                plain[name] = to_plain(child)
            return plain
        if isinstance(node, (tuple, list)):
            return list(map(to_plain, node))
        return node

    parsed = xmltodict.parse(xml_data)
    return to_plain(parsed)

In [3]:
def parse_html(html_string: str, url: str = None):
    """Collect the links found inside ``alert alert-info`` elements of a page.

    Parameters
    ----------
    html_string : str
        Raw HTML of the page.
    url : str, optional
        URL the page was fetched from. When given, the project id is taken
        from its fifth ``/``-separated segment (index 4), with a trailing
        ``.html`` removed. The function previously read this from a global
        ``resp`` that it never received, so the value depended on whichever
        cell last assigned ``resp``.

    Returns
    -------
    dict
        ``{'project_id', 'duplicate_link', 'no_code_link'}``. A link whose
        text contains ``'enlistments'`` is stored as ``no_code_link``; any
        other link is stored as ``duplicate_link``. Entries that could not be
        determined are ``None``.
    """
    page = {'project_id': None, 'duplicate_link': None, 'no_code_link': None}

    if url is not None:
        segments = url.split('/')
        if len(segments) > 4:
            slug = segments[4]
            if slug.endswith('.html'):
                slug = slug[:-len('.html')]
            page['project_id'] = slug or None

    # An empty document cannot be parsed; report "no links" instead of raising.
    if not html_string or not html_string.strip():
        return page

    document = lxml.html.fromstring(html_string)

    # A dict is used as an ordered set so that, when several links compete for
    # the same slot, the last one in document order wins on every run (a plain
    # set of strings iterates in a run-dependent order).
    # NOTE(review): find_class is documented for a single class name; passing
    # 'alert alert-info' presumably only matches when both classes appear
    # adjacent in this order -- confirm against the real pages.
    links = {}
    for alert_info in document.find_class('alert alert-info'):
        for link in alert_info.iterlinks():
            links[link[2]] = None  # iterlinks item: index 2 is the link text

    for link in links:
        if 'enlistments' in link:
            page['no_code_link'] = link
        else:
            page['duplicate_link'] = link

    return page

In [4]:
def load_from_futures(futures, key : str):
    """Collect the ``response -> result -> key`` records from finished requests.

    Parameters
    ----------
    futures : iterable
        Futures whose ``result()`` is a response object exposing
        ``status_code``, ``url`` and ``text``.
    key : str
        Name of the entry to read below ``response -> result`` in the parsed
        XML.

    Returns
    -------
    list of dict
        Every record found, each annotated in place with a ``'project_id'``
        entry taken from the fifth ``/``-separated segment (index 4) of the
        response URL.

    Raises
    ------
    AssertionError
        If a response has a status code other than 200 or 404.

    Responses that lack the expected structure are skipped. This includes a
    missing key and an empty ``result`` element, which parses to ``None`` and
    previously raised ``TypeError`` past the ``KeyError`` handler.
    """
    results = []
    for future in tqdm(futures):
        resp = future.result()
        assert resp.status_code in [200, 404], 'status code {} found'.format(resp.status_code)

        project_id = resp.url.split('/')[4]

        # Walk response -> result -> key, tolerating any missing/empty level.
        node = parse_xml(resp.text)
        for step in ('response', 'result', key):
            node = node.get(step) if isinstance(node, dict) else None

        # A single child element is parsed as a dict, repeated ones as a list.
        if isinstance(node, dict):
            records = [node]
        elif isinstance(node, list):
            records = node
        else:
            continue

        for record in records:
            if isinstance(record, dict):
                record['project_id'] = project_id
                results.append(record)
    return results

In [5]:
# Load the API keys from the local JSON file; the request cells below pick
# one per call with random.choice(api_keys).
with open('./api_keys.json', 'r') as key_file:
    api_keys = json.load(key_file)

In [6]:
# Shared retry policy: at most 10 retries with a backoff factor of 1,
# triggered by the listed HTTP status codes.
retries = requests.packages.urllib3.util.retry.Retry(
    status_forcelist=[401, 429, 500, 502, 503, 504],
    backoff_factor=1,
    total=10,
)
# Adapter that every session below mounts for "https://" URLs.
http_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
# Passed as timeout= to every session.get call (10 minutes in seconds).
timeout = 600

In [7]:
# Queue one request per listing page (pages 1..49999), each with a randomly
# chosen API key, then drive a progress bar as the requests complete.
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    pages_futures = []
    for page in range(1, 50000):
        query = {'page': page, 'api_key': random.choice(api_keys)}
        pages_futures.append(
            session.get('https://www.openhub.net/projects.xml', params=query, timeout=timeout)
        )

    completed = concurrent.futures.as_completed(pages_futures)
    # done() only reports status; the loop exists to advance the progress bar.
    for future in tqdm(completed, total=len(pages_futures)):
        future.done()

  0%|          | 0/49999 [00:00<?, ?it/s]

In [None]:
# Gather the distinct project ids from every fetched listing page.
project_id_set = set()
for future in tqdm(pages_futures):
    resp = future.result()
    assert resp.status_code == 200, 'status code {} found'.format(resp.status_code)
    xml_response = xmltodict.parse(resp.text)['response']

    # An empty <result/> element parses to None, so checking that the key
    # exists is not enough; skip pages that carry no project data.
    result = xml_response.get('result') if isinstance(xml_response, dict) else None
    projects = result.get('project') if isinstance(result, dict) else None
    if projects is None:
        continue

    # A page holding a single <project> is parsed as one dict, not a list;
    # iterating that dict would yield its keys instead of projects.
    if isinstance(projects, dict):
        projects = [projects]

    for project in projects:
        project_id_set.add(project['id'])

# Sorted so the resulting list is identical from run to run.
project_ids = sorted(project_id_set)

  0%|          | 0/49999 [00:00<?, ?it/s]

In [10]:
# On-disk copy of project_ids. The commented-out lines write the list that is
# currently in memory; the active lines replace project_ids with the stored one.
# with open('./../data/project_ids.json', 'w') as f:
#     json.dump(project_ids, f)
with open('./../data/project_ids.json', 'r') as ids_file:
    project_ids = json.load(ids_file)

In [13]:
# Queue one activity_facts request per project id, each with a randomly
# chosen API key, then drive a progress bar as the requests complete.
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    activity_facts_futures = []
    for project_id in project_ids:
        endpoint = 'https://www.openhub.net/projects/{}/analyses/latest/activity_facts.xml'.format(project_id)
        query = {'api_key': random.choice(api_keys)}
        activity_facts_futures.append(session.get(endpoint, params=query, timeout=timeout))

    completed = concurrent.futures.as_completed(activity_facts_futures)
    # done() only reports status; the loop exists to advance the progress bar.
    for future in tqdm(completed, total=len(activity_facts_futures)):
        future.done()

  0%|          | 0/352916 [00:00<?, ?it/s]

In [None]:
# Pull every 'activity_fact' record out of the finished responses.
activity_facts = load_from_futures(futures=activity_facts_futures, key='activity_fact')

  0%|          | 0/352916 [00:00<?, ?it/s]

In [None]:
# Persist the collected activity facts as JSON.
with open('./../data/activity_facts1.json', 'w') as out_file:
    json.dump(activity_facts, fp=out_file)

---

In [None]:
# Queue one latest-analysis request per project id, each with a randomly
# chosen API key, then drive a progress bar as the requests complete.
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    analyses_futures = []
    for project_id in project_ids:
        endpoint = 'https://www.openhub.net/projects/{}/analyses/latest.xml'.format(project_id)
        query = {'api_key': random.choice(api_keys)}
        analyses_futures.append(session.get(endpoint, params=query, timeout=timeout))

    completed = concurrent.futures.as_completed(analyses_futures)
    # done() only reports status; the loop exists to advance the progress bar.
    for future in tqdm(completed, total=len(analyses_futures)):
        future.done()

In [None]:
# Pull every 'analysis' record out of the finished responses.
analyses = load_from_futures(futures=analyses_futures, key='analysis')

In [None]:
# Persist the collected analyses as JSON.
with open('./../data/analyses1.json', 'w') as out_file:
    json.dump(analyses, fp=out_file)

In [None]:
# Queue one HTML page request per project id (no API key is sent here), then
# drive a progress bar as the requests complete.
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    html_futures = []
    for project_id in project_ids:
        page_url = 'https://www.openhub.net/p/{}.html'.format(project_id)
        html_futures.append(session.get(page_url, timeout=timeout))

    completed = concurrent.futures.as_completed(html_futures)
    # done() only reports status; the loop exists to advance the progress bar.
    for future in tqdm(completed, total=len(html_futures)):
        future.done()

  0%|          | 0/352916 [00:00<?, ?it/s]

In [None]:
# Parse every fetched project page. The original line called `load_html`,
# which is not defined in this notebook; the HTML parser defined above is
# `parse_html`.
html_pages = []
for future in tqdm(html_futures):
    # Kept as a module-level name on purpose: it always refers to the response
    # being parsed, never to one left over from an earlier cell.
    resp = future.result()
    page = parse_html(resp.text)

    # Set the project id from this response's own URL (fifth '/'-separated
    # segment, without a trailing '.html') so that it always matches the page.
    segments = resp.url.split('/')
    slug = segments[4] if len(segments) > 4 else ''
    if slug.endswith('.html'):
        slug = slug[:-len('.html')]
    page['project_id'] = slug or None

    html_pages.append(page)

In [None]:
with open('./../data/html_pages.json', 'w') as f:
    json.dump(html_pages, f)