In [78]:
import requests
import requests_futures.sessions  
from tqdm.notebook import tqdm
import xmltodict
import lxml.html

import concurrent.futures
import random
import collections
import json
from datetime import datetime
import hashlib

In [77]:
def now():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string."""
    current = datetime.now()
    return '{:%Y-%m-%d %H:%M:%S}'.format(current)

Due to the [Black Duck Academic Use Agreement](https://web.archive.org/web/20170619090829/https://blog.openhub.net/academic-use-agreement), we are required to anonymize or remove the project IDs and names from the dataset.

In [266]:
def anonymize(data: list):
    """Return anonymized shallow copies of the given records.

    For every record (a dict):
      * ``project_id`` is replaced by the MD5 hex digest of its UTF-8 encoding
        (the key is required),
      * ``url_name`` and ``original_project_name`` are hashed the same way when
        they are present and not ``None``; missing or ``None`` values are left
        as they are,
      * the ``url``, ``languages`` and ``factoids`` keys are removed if present.

    The input list and its dicts are not modified.

    NOTE(review): the digests are unsalted MD5, so short identifiers could be
    recovered by brute force -- confirm this satisfies the anonymization
    requirement.

    Args:
        data: list of dicts, each containing a string ``project_id``.

    Returns:
        A new list with one anonymized dict per input record, in input order.

    Raises:
        KeyError: if a record has no ``project_id``.
    """
    def md5(s: str):
        return hashlib.md5(s.encode('utf-8')).hexdigest()

    anonymized_data = []
    for d in data:
        d = d.copy()  # shallow copy so the caller's record stays untouched
        d['project_id'] = md5(d['project_id'])

        # Optional name fields: hash them only when an actual value is present
        # (an empty XML element or a page without a link yields None).
        for k in ['url_name', 'original_project_name']:
            if d.get(k) is not None:
                d[k] = md5(d[k])

        # Drop fields that are removed from the published data set.
        for k in ['url', 'languages', 'factoids']:
            d.pop(k, None)
        anonymized_data.append(d)

    return anonymized_data

In [30]:
def parse_xml(xml_data: str):
    """Parse an XML string with xmltodict and return plain nested dicts/lists.

    Mappings (``dict`` / ``OrderedDict``) become plain ``dict`` objects,
    tuples and lists become ``list`` objects, and every other value is
    returned unchanged.
    """
    def to_plain(node):
        if isinstance(node, (tuple, list)):
            return [to_plain(item) for item in node]
        if isinstance(node, (collections.OrderedDict, dict)):
            return {name: to_plain(child) for name, child in node.items()}
        return node

    parsed = xmltodict.parse(xml_data)
    return to_plain(parsed)

In [32]:
def load_from_futures(futures, key : str):
    """Collect the ``key`` records from a sequence of completed API requests.

    Each future must resolve to a response object exposing ``status_code``,
    ``url`` and ``text``. The response body is parsed as XML and the value at
    ``response -> result -> key`` is extracted. A single record (dict) is
    treated as a one-element list. Every record is tagged with the
    ``project_id`` taken from the fifth '/'-separated component of the
    response URL.

    Responses that carry no such records (no ``response``/``result``/``key``
    entry, or an empty element that parses to ``None``) are skipped.

    Args:
        futures: iterable of futures resolving to responses.
        key: name of the element below ``result`` to collect.

    Returns:
        A flat list of record dicts, each with an added ``project_id``.

    Raises:
        AssertionError: if a response status code is neither 200 nor 404.
    """
    results = []
    for future in tqdm(futures):
        resp = future.result()
        assert resp.status_code in [200, 404], 'status code {} found'.format(resp.status_code)

        project_id = resp.url.split('/')[4]

        # Walk response -> result -> key defensively: any level may be missing
        # or be an empty element (parsed as None) when there is no data.
        response = parse_xml(resp.text).get('response')
        result = response.get('result') if isinstance(response, dict) else None
        records = result.get(key) if isinstance(result, dict) else None
        if records is None:
            continue

        # A single child element is parsed as a dict instead of a list.
        if isinstance(records, dict):
            records = [records]
        for record in records:
            record['project_id'] = project_id
            results.append(record)
    return results

In [33]:
# Read the API keys from a local JSON file; one key is picked at random
# (random.choice) for each API request made further down.
with open('./api_keys.json', 'r') as keys_file:
    api_keys = json.loads(keys_file.read())

In [34]:
# Shared HTTP settings for every crawl below.
# Retry policy: at most 10 retries, triggered by the listed HTTP status codes;
# backoff_factor controls the delay between attempts.
retries = requests.packages.urllib3.util.retry.Retry(
    status_forcelist=[401, 429, 500, 502, 503, 504],
    backoff_factor=1,
    total=10,
)
# Adapter that applies the retry policy once mounted on a session.
http_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
# Per-request timeout passed to session.get (10 minutes, in seconds).
timeout = 600

---

# Get project IDs

In [None]:
# Request the pages of the project listing concurrently with 8 workers.
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    # Pages 1..49999 are requested, each with a randomly chosen API key.
    # NOTE(review): the upper bound is hard-coded -- presumably chosen to exceed
    # the number of listing pages; confirm.
    pages_futures = [session.get('https://www.openhub.net/projects.xml', params={'page': page, 'api_key': random.choice(api_keys)}, timeout=timeout) for page in range(1, 50000)]

    # Iterate over as_completed only to drive the progress bar; the responses
    # themselves are read from pages_futures afterwards.
    for _ in tqdm(concurrent.futures.as_completed(pages_futures), total=len(pages_futures)):
        pass

  0%|          | 0/49999 [00:00<?, ?it/s]

In [226]:
# Build the list of projects (ID and URL name) from the listing responses.
projects = []

for future in tqdm(pages_futures):
    resp = future.result()
    assert resp.status_code==200
    xml_response = xmltodict.parse(resp.text)['response']
    # A page may have no result, an empty result (parsed as None), or no
    # project entries at all; treat all of these as "no projects on this page".
    page_projects = (xml_response.get('result') or {}).get('project') or []
    # A page holding exactly one project is parsed as a dict, not a list.
    if isinstance(page_projects, dict):
        page_projects = [page_projects]
    for project in page_projects:
        projects.append({'project_id': project['id'], 'url_name': project['url_name']})

# De-duplicate by project ID (the last occurrence of an ID wins).
projects = list({v['project_id']: v for v in projects}.values())
project_ids = [p['project_id'] for p in projects]

  0%|          | 0/49999 [00:00<?, ?it/s]

In [271]:
# Replace the project IDs and URL names with their MD5 hashes (see anonymize).
projects_anonymized = anonymize(projects)

In [279]:
# Write the anonymized project list to a timestamped JSON file. The raw
# `projects` list contains the real project IDs and URL names, so it is the
# anonymized copy that is persisted, as in the other sections of this notebook.
with open('./../data/projects/{}.json'.format(now()), 'w') as f:
    json.dump(projects_anonymized, f)

In [260]:
# Save the raw project IDs for the crawls below. The file is written to the
# same path that the "Get all activity facts" section loads it from, so a
# fresh run reads exactly what was written here.
with open('./../data/project_ids/project_ids.json', 'w') as f:
    json.dump(project_ids, f)

---

# Get all activity facts

In [37]:
# Load the previously saved project IDs from disk.
with open('./../data/project_ids/project_ids.json', 'r') as ids_file:
    project_ids = json.loads(ids_file.read())

In [38]:
# Request the latest activity facts of every project concurrently (8 workers).
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    activity_facts_url = 'https://www.openhub.net/projects/{}/analyses/latest/activity_facts.xml'
    activity_facts_futures = [
        session.get(
            activity_facts_url.format(project_id),
            params={'api_key': random.choice(api_keys)},
            timeout=timeout,
        )
        for project_id in project_ids
    ]

    # Consume as_completed purely to show progress while the requests finish.
    activity_facts_progress = tqdm(
        concurrent.futures.as_completed(activity_facts_futures),
        total=len(activity_facts_futures),
    )
    for _ in activity_facts_progress:
        pass

  0%|          | 0/355111 [00:00<?, ?it/s]

In [286]:
# Extract every 'activity_fact' record from the responses; each record is
# tagged with the project ID taken from its request URL.
activity_facts = load_from_futures(activity_facts_futures, 'activity_fact')

  0%|          | 0/355111 [00:00<?, ?it/s]

In [287]:
# Hash the project IDs before the records are written to disk (see anonymize).
activity_facts_anonymized = anonymize(activity_facts)

In [288]:
# Write the anonymized activity facts to a timestamped JSON file.
activity_facts_path = './../data/activity_facts/{}.json'.format(now())
with open(activity_facts_path, 'w') as out_file:
    json.dump(activity_facts_anonymized, out_file)

---

# Get analyses

In [73]:
# Request the latest analysis of every project concurrently (8 workers).
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    analyses_url = 'https://www.openhub.net/projects/{}/analyses/latest.xml'
    analyses_futures = [
        session.get(
            analyses_url.format(project_id),
            params={'api_key': random.choice(api_keys)},
            timeout=timeout,
        )
        for project_id in project_ids
    ]

    # Consume as_completed purely to show progress while the requests finish.
    analyses_progress = tqdm(
        concurrent.futures.as_completed(analyses_futures),
        total=len(analyses_futures),
    )
    for _ in analyses_progress:
        pass

  0%|          | 0/355111 [00:00<?, ?it/s]

In [289]:
# Extract every 'analysis' record from the responses; each record is tagged
# with the project ID taken from its request URL.
analyses = load_from_futures(analyses_futures, 'analysis')

  0%|          | 0/355111 [00:00<?, ?it/s]

In [290]:
# Hash the project IDs and drop the 'url', 'languages' and 'factoids' fields
# before the records are written to disk (see anonymize).
analyses_anonymized = anonymize(analyses)

In [291]:
# Write the anonymized analyses to a timestamped JSON file.
analyses_path = './../data/analyses/{}.json'.format(now())
with open(analyses_path, 'w') as out_file:
    json.dump(analyses_anonymized, out_file)

---

# Get HTML pages

In [49]:
# Request the public HTML page of every project concurrently (8 workers).
# These requests are sent without an API key.
with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    html_url = 'https://www.openhub.net/p/{}.html'
    html_futures = [
        session.get(html_url.format(project_id), timeout=timeout)
        for project_id in project_ids
    ]

    # Consume as_completed purely to show progress while the requests finish.
    html_progress = tqdm(
        concurrent.futures.as_completed(html_futures),
        total=len(html_futures),
    )
    for _ in html_progress:
        pass

  0%|          | 0/355111 [00:00<?, ?it/s]

In [252]:
def parse_html(resp):
    """Extract link-based information from a project's HTML page.

    The project ID is the fifth '/'-separated component of ``resp.url`` with
    its last five characters (the '.html' suffix of the requested URL)
    removed. All links found inside elements matched by
    ``find_class('alert alert-info')`` are collected:

      * a link containing 'enlistments' sets ``code_available`` to False,
      * any other link sets ``original_project_name`` to its last path
        component (if several such links exist, the one iterated last wins).

    Returns:
        dict with ``project_id``, ``original_project_name`` (``None`` when no
        such link exists) and ``code_available`` (True unless an
        'enlistments' link is present).
    """
    page_source = resp.text
    project_id = resp.url.split('/')[4][:-5]

    document = lxml.html.fromstring(page_source)
    links = set()
    for alert_box in document.find_class('alert alert-info'):
        # iterlinks() yields tuples whose third item is the link target.
        links.update({entry[2] for entry in alert_box.iterlinks()})

    code_available = True
    original_project_name = None
    for link in links:
        if 'enlistments' not in link:
            original_project_name = link.split('/')[-1]
        else:
            code_available = False

    return {
        'project_id': project_id,
        'original_project_name': original_project_name,
        'code_available': code_available,
    }

In [292]:
# Parse each fetched project page, in request order, with a progress bar.
html_responses = (future.result() for future in tqdm(html_futures))
html_pages = [parse_html(resp) for resp in html_responses]

  0%|          | 0/355111 [00:00<?, ?it/s]

In [293]:
# Hash the project IDs and original project names before the records are
# written to disk (see anonymize).
html_pages_anonymized = anonymize(html_pages)

In [294]:
# Write the anonymized HTML page records to a timestamped JSON file.
html_pages_path = './../data/html_pages/{}.json'.format(now())
with open(html_pages_path, 'w') as out_file:
    json.dump(html_pages_anonymized, out_file)