In [59]:
from bs4 import BeautifulSoup
import requests
from fastai.core import parallel, partial
from collections import Counter

## Write Functions To Get Data

In [2]:
def find_max_issue_num(owner, repo):
    """
    Find the maximum issue number associated with a repo.

    Fetches the repo's GitHub issue list page and reads the number shown
    on the first issue card, which is taken to be the highest one.

    Parameters
    ----------
    owner : str
        GitHub user or organization that owns the repo.
    repo : str
        Name of the repository.

    Returns
    -------
    int
        the highest issue number associated with this repo.

    Raises
    ------
    requests.HTTPError
        If the issue list page answers with an error status.
    ValueError
        If no issue number can be found or parsed on the page.
    """
    url = f'https://github.com/{owner}/{repo}/issues'
    r = requests.get(url)
    # raise_for_status() does nothing on a successful response,
    # so it can be called unconditionally
    r.raise_for_status()
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(r.content, 'html.parser')
    # get grey text under issue preview cards
    # NOTE(review): the default issue list presumably shows open issues only,
    # so a newer closed issue would not be seen here -- confirm
    issue_meta = soup.find('span', class_="opened-by")
    if issue_meta is None:
        raise ValueError(f'Could not find any issue listed on {url}.')
    # parse the first issue number visible, which is also the highest issue number
    first_line = issue_meta.text.strip().split('\n')[0]
    try:
        # drop the leading '#' before converting
        return int(first_line[1:])
    except ValueError:
        raise ValueError(
            f'Could not parse an issue number from {first_line!r} on {url}.'
        ) from None

In [3]:
def verify_issue(owner, repo, num):
    """
    Check whether owner/repo/issues/num resolves to an issue page.

    A HEAD request is sent to the issue URL and the status code of the
    response is compared against 200.

    Returns
    -------
    bool
        True when the status code is 200, False for any other code.

    Pull requests are issues as well, but their issue URL is answered
    with a 302 redirect, so this function reports False for them.
    """
    issue_url = f'https://github.com/{owner}/{repo}/issues/{num}'
    response = requests.head(issue_url)
    return response.status_code == 200

In [4]:
def get_issue_text(num, idx, owner, repo, skip_issue=True):
    """
    Get the title, body and labels of a single GitHub issue.

    Parameters
    ----------
    num : int
        Issue number to fetch.
    idx : int
        Unused. Presumably the positional index that `parallel` passes
        along with each item -- confirm against the caller.
    owner : str
        GitHub user or organization that owns the repo.
    repo : str
        Name of the repository.
    skip_issue : bool
        When True, return None if `num` is not an issue; when False,
        raise instead.

    Returns
    -------
    dict or None
        {'title': str, 'url': str, 'body': str, 'labels': list of str},
        or None when the number is not an issue (and `skip_issue` is True),
        when the title, body or label section cannot be found on the page,
        or when the issue carries no labels.

    Raises
    ------
    Exception
        If `num` is not an issue and `skip_issue` is False.
    """
    url = f'https://github.com/{owner}/{repo}/issues/{num}'
    if not verify_issue(owner, repo, num):
        if skip_issue:
            return None
        raise Exception(f'{url} is not an issue.')

    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    title_find = soup.find("span", class_="js-issue-title")
    body_find = soup.find("td", class_="js-comment-body")
    label_find = soup.find(class_='js-issue-labels')

    # a missing label section is treated like a missing title or body
    # rather than crashing on label_find.get_text()
    if title_find is None or body_find is None or label_find is None:
        return None

    title = title_find.get_text().strip()
    body = body_find.get_text().strip()
    # one label per line; drop padding and blank lines so the same label
    # is always spelled the same way
    labels = [line.strip()
              for line in label_find.get_text().split('\n')
              if line.strip()]

    # 'None yet' is the placeholder text shown for an unlabelled issue
    if not labels or labels[0] == 'None yet':
        return None

    return {'title': title,
            'url': url,
            'body': body,
            'labels': labels}

In [81]:
def get_all_issue_text(owner, repo, inf_wrapper, workers=32, min_freq=25):
    """
    Fetch every labelled issue of a repo and attach model features to it.

    Issue numbers 1..max are fetched with `get_issue_text` via `parallel`.
    Labels that occur fewer than `min_freq` times across the fetched
    issues are removed, issues left with no label are discarded, and each
    remaining issue gets a 'features' entry computed by `inf_wrapper`.

    Parameters
    ----------
    owner : str
        GitHub user or organization that owns the repo.
    repo : str
        Name of the repository.
    inf_wrapper : object
        Must provide `process_dict(issue)`, whose result is indexed with
        'text', and `get_pooled_features(text)`.
    workers : int
        Passed to `parallel` as `max_workers`.
    min_freq : int
        Minimum number of issues a label must appear on to be kept.

    Returns
    -------
    list of dict
        The issue dicts from `get_issue_text`, with 'labels' reduced to
        the frequent labels and a 'features' entry added.
    """
    max_num = find_max_issue_num(owner, repo)

    get = partial(get_issue_text, owner=owner, repo=repo, skip_issue=True)
    # fastai's `parallel` appears to return None instead of a list when
    # every call produced None; fall back to an empty list so the code
    # below still works for a repo without any labelled issue
    issues = parallel(get, list(range(1, max_num+1)), max_workers=workers) or []

    # filter out issues with problems (get_issue_text returned None)
    filtered_issues = [issue for issue in issues if issue]

    label_counts = Counter()
    for issue in filtered_issues:
        label_counts.update(issue['labels'])

    # a set keeps the membership test in the loop below constant-time
    frequent_labels = {label for label, count in label_counts.items()
                       if count >= min_freq}

    print(f'Retrieved {len(filtered_issues)} issues.')

    # only retain labels that are frequent enough, and only issues that
    # still have at least one label afterwards
    final_issues = []
    # NOTE(review): tqdm_notebook is not imported in the visible part of
    # this notebook; it must already be in the kernel namespace -- confirm
    for issue in tqdm_notebook(filtered_issues):
        lbls = [lbl for lbl in issue['labels'] if lbl in frequent_labels]
        if lbls:
            issue['labels'] = lbls
            text = inf_wrapper.process_dict(issue)['text']
            issue['features'] = inf_wrapper.get_pooled_features(text)
            final_issues.append(issue)

    print(f'{len(final_issues)} issues remaining after minimum frequency filter of {min_freq}.')

    return final_issues

## Get The Data

In [74]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from inference import InferenceWrapper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
from pathlib import Path
from urllib import request as request_url

def pass_through(x):
    """Return `x` unchanged."""
    # NOTE(review): nothing in the visible notebook calls this; presumably it
    # has to exist in this namespace so the pickled model can be loaded -- confirm
    return x

# Location of the trained model. The download step is disabled (commented
# out), so nothing in this cell fetches the file.
# NOTE(review): presumably ./model_files/model.pkl must already exist before
# InferenceWrapper is constructed -- confirm
# model_url = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
path = Path('./model_files')
# full_path is only used by the disabled download block below
full_path = path/'model.pkl'

# Disabled: download the model on first use. Re-enabling it also requires
# uncommenting model_url above.
# if not full_path.exists():
#     print('Loading model.')
#     path.mkdir(exist_ok=True)
#     request_url.urlretrieve(model_url, path/'model.pkl')
# Wrapper passed to get_all_issue_text as `inf_wrapper`
inference_wrapper = InferenceWrapper(model_path=path, model_file_name='model.pkl')

In [None]:
# Fetch the issues of kubeflow/examples and compute features for those whose
# labels pass the default min_freq filter. Issues one HEAD request and up to
# one GET request per issue number.
test = get_all_issue_text(owner='kubeflow', repo='examples', inf_wrapper=inference_wrapper)

Retrieved 206 issues.


HBox(children=(IntProgress(value=0, max=206), HTML(value='')))