In [None]:
# CSV export URL for one project; the single %s slot takes the project
# identifier and the doubled %% keep the URL-encoded check mark literal.
url_tmpl = "https://issue.pbsbiotech.com/projects/%s/issues.csv?utf8=%%E2%%9C%%93&columns=all"

# Identifiers of the projects whose issue lists are exported.
_p_urls = [
    "pbscustomer",
    "pbsdisposables",
    "pbsinstruments",
    "magic-metals",
    "manufacturing",
    "pbssoftware",
    "swtesting",
    "system-qualification-testing",
]

project_urls = [url_tmpl % (identifier,) for identifier in _p_urls]
project_urls

In [444]:
""" Issuetracker API 

* TODO: Create IssueList class (?)
* Parse Gantt HTML for class 'issue-subject' using style:width to determine hierarchy
* Consider method of lazy evaluation of issue field generation by
calling back to API to download project issues CSV, and update all issues
in project. 
* Implement issue caching

Issue():
    * Add programmatic logging of all fields seen, ever.
    * Map fields seen to types and conversion functions

"""

import json
import re
import urllib
import urllib.parse
from collections import OrderedDict

import dateutil.parser
import lxml
import lxml.etree
import pyquery
import requests

# Shorthand used throughout to join a path onto the tracker's base URL.
uj = urllib.parse.urljoin
# Optional digits, a space, then an optional "subproject" and optional
# trailing "s" (only referenced from commented-out code in the visible source).
_sp_re = re.compile(r"(\d*?) (subproject)?(s{0,1})")
# Splits text of the form "<name> #<digits>" into the name and the digits.
_name2id_re = re.compile(r"(.*?)\s*?#(\d*)$")

class IssuetrackerAPI():
    """Client for the issue tracker's web pages and its .xml/.json endpoints.

    CSV, PDF and Gantt downloads rely on the cookie session established by
    :meth:`login`; the ``.xml`` and ``.json`` requests send HTTP basic auth
    with every call.
    """

    _login_url = "/login"
    _proj_issues_url = "/projects/%s/issues"
    _issues_url = "/issues"
    _proj_url = "/projects"

    def __init__(self, base_url, username=None, pw=None):
        """Store connection settings; no network traffic happens here.

        :param base_url: full URL, or a bare host name (``https`` is assumed).
        :param username: tracker login name (required).
        :param pw: tracker password (required).
        :raises ValueError: if either credential is missing.
        """
        r = urllib.parse.urlparse(base_url)
        if not r.scheme and not r.netloc:
            # A bare host name parses entirely into ``path``; promote it to
            # the network location and default to https.
            base_url = urllib.parse.urlunparse(("https", r.path, "", r.params, r.query, r.fragment))
        self._base_url = base_url
        self._sess = requests.Session()
        self._headers = {}

        if username is None or pw is None:
            raise ValueError("Must have valid username and password.")

        self._username = username
        self._password = pw
        self._auth = (username, pw)

        self._cache = {}

    @property
    def issues(self):
        """Not implemented yet."""
        # ``NotImplemented`` is a comparison sentinel, not an exception class;
        # raising it produces a confusing TypeError.
        raise NotImplementedError

    @property
    def projects(self):
        """Mapping of project name to :class:`Project`, downloaded on first use."""
        pj = self._cache.get("projects")
        if not pj:
            pj = self.download_projects()
            self._cache['projects'] = pj
        return pj

    def login(self):
        """Log in through the HTML login form and return the POST response.

        Every named input of ``#login-form`` (hidden fields included) is
        echoed back alongside the credentials.

        :raises ValueError: if the response lacks the ``#loggedas`` element.
        """
        r1 = self._sess.get(self._base_url)
        r1.raise_for_status()
        q = pyquery.PyQuery(r1.content)
        data = {}
        for td in q("#login-form :input"):
            at = td.attrib
            if 'name' in at:
                data[at['name']] = at.get('value', "")

        data['username'] = self._username
        data['password'] = self._password

        body = urllib.parse.urlencode(data)
        r2 = self._sess.post(uj(self._base_url, self._login_url), body)
        r2.raise_for_status()
        if not pyquery.PyQuery(r2.content)("#loggedas"):
            raise ValueError("Invalid Username or Password")
        return r2

    def download_project_issues_csv(self, project, utf8=True, columns='all'):
        """Download a project's issue CSV and return it parsed (see _parse_proj_csv)."""
        r = self._download_project_csv(project, utf8, columns)
        return self._parse_proj_csv(r.content)

    def _download_project_csv(self, project, utf8, columns):
        """Return the raw response of the project's ``issues.csv`` export."""
        if utf8:
            utf8 = "%E2%9C%93"
        else:
            utf8 = ""
        url_end = ".csv?utf8=%s&columns=%s"
        url = (self._proj_issues_url + url_end) % (project, utf8, columns)
        url = uj(self._base_url, url)
        r = self._sess.get(url)
        r.raise_for_status()
        return r

    def download_issue_pdf(self, id):
        """Return the PDF rendering (bytes) of the issue with the given id."""
        href = self._issues_url + "/" + str(id)
        return self.download_issue_pdf2(href)

    def download_issue_pdf2(self, href):
        """ Sometimes it is more convenient to access issue by provided
        href. """
        suffix = ".pdf"
        url = uj(self._base_url, href + suffix)
        r = self._sess.get(url)
        r.raise_for_status()
        return r.content

    def _parse_proj_csv(self, csv, encoding='utf-8'):
        """Parse CSV export text into an OrderedDict of issue number -> issue.

        :param csv: CSV content as ``str`` or encoded bytes.
        :param encoding: used to decode *csv* when it is not a ``str``.

        Rows are parsed with the standard ``csv`` reader so that commas inside
        quoted fields do not shift the columns. Empty input yields an empty
        mapping. Empty cells are stored as ``"<n/a>"``.

        NOTE(review): ``_Issue`` is not defined in the visible source; it must
        be supplied elsewhere. Records are still split per physical line, so a
        quoted field containing a newline is not supported.
        """
        # The ``csv`` parameter shadows the module name, hence the alias.
        import csv as csv_module

        if not isinstance(csv, str):
            csv = csv.decode(encoding)
        raw_lines = csv.splitlines()
        issues = OrderedDict()
        if not raw_lines:
            return issues

        header = [h.lower() for h in next(csv_module.reader([raw_lines[0]]), [])]
        for raw in raw_lines[1:]:
            if not raw.strip():
                continue
            row = next(csv_module.reader([raw]), [])
            issue = _Issue(line=raw, api=self)
            for key, val in zip(header, row):
                issue[key] = val or "<n/a>"
            issue['#'] = int(issue['#'])
            issues[issue['#']] = issue
        return issues

    def download_projects(self):
        """Download ``/projects.xml`` and return a dict of name -> Project.

        Projects that declare a parent are attached to it in a second pass.
        """
        url = uj(self._base_url, self._proj_url + ".xml")
        r = self._sess.get(url, auth=self._auth)
        r.raise_for_status()
        xml = lxml.etree.XML(r.content)
        projects = {}
        for proj in xml.findall("project"):
            p = Project.from_element(self, proj)
            projects[p.name] = p

        # Second pass, process project subtasks
        for p in projects.values():
            if p.parent is not None:
                parent = projects[p.parent['name']]
                parent.add_subproject(p)
        return projects

    def _download_gantt_raw(self, project):
        """Return the raw HTML bytes of the project's Gantt page."""
        url = (self._proj_issues_url % project) + "/gantt"
        url = uj(self._base_url, url)
        r1 = self._sess.get(url)
        r1.raise_for_status()
        return r1.content

    def download_gantt(self, project):
        """Return the issues listed on the project's Gantt page.

        Each entry carries ``href``, ``title``, ``id`` and ``tracker``.

        NOTE(review): ``_Issue`` is not defined in the visible source.
        """
        # Must go through ``self``: the helper is a method, not a module function.
        c = self._download_gantt_raw(project)
        q = pyquery.PyQuery(c)
        q2 = q(".gantt_subjects")
        issues = []
        for el in q2.children(".issue-subject"):
            title = el.attrib['title']
            e2 = pyquery.PyQuery(el).children("span > a")[0]
            tracker, issue_id = _name2id_re.match(e2.text).groups()
            href = e2.attrib['href']
            i = _Issue(api=self)
            i.href = href
            i.title = title
            i.id = int(issue_id)
            i.tracker = tracker
            issues.append(i)
        return issues

    def _download_project_issues_iter(self, ops, limit, offset):
        """Fetch one page of ``/issues.json`` for the given query options."""
        # Work on a copy so the caller's dict is not modified between pages.
        params = dict(ops)
        params['limit'] = limit
        params['offset'] = offset
        url = uj(self._base_url, self._issues_url + ".json")
        url += "?" + urllib.parse.urlencode(params)
        r = self._sess.get(url, auth=self._auth)
        r.raise_for_status()
        return r

    def download_issues(self, project_id=None, created_on=None, modified_on=None):
        """Generate :class:`Issue` objects matching the given filters.

        :param project_id: numeric id, or a project name/identifier string
            that is resolved through :attr:`projects`.
        :param created_on: filter expression string such as ``">=2016-08-10"``.
        :param modified_on: filter expression string, sent as ``modified_on``.
        :raises ValueError: if a project name/identifier cannot be resolved.
        :raises TypeError: if a date filter is not a string.

        This is a generator: nothing is validated or requested until it is
        iterated.
        """
        ops = {}
        if project_id:
            if isinstance(project_id, str):
                projects = self.projects
                for p in projects.values():
                    if p.name == project_id or p.identifier == project_id:
                        break
                else:
                    raise ValueError("Unable to find project %r" % project_id)
                project_id = p.id
            ops['project_id'] = project_id

        # Unfortunately the api for querying dates and ranges is
        # quite awkward to translate into a sensible python api

        if created_on:
            if not isinstance(created_on, str):
                raise TypeError("Argument created_on must be type str- try .isoformat() (got type %r)"
                                % type(created_on).__name__)
            ops['created_on'] = created_on

        if modified_on:
            if not isinstance(modified_on, str):
                raise TypeError("Argument modified_on must be type str- try .isoformat() (got type %r)"
                                % type(modified_on).__name__)
            # NOTE(review): confirm the server honours a ``modified_on``
            # filter; some trackers name this field ``updated_on``.
            ops['modified_on'] = modified_on

        yield from self._download_issues(ops)

    def _download_issues(self, ops):
        """Page through ``/issues.json`` 100 at a time, yielding parsed issues."""
        offset = 0
        limit = 100
        while True:
            r = self._download_project_issues_iter(ops, limit, offset)
            d = json.loads(r.content.decode())
            issues = d['issues']

            if not issues:
                break

            yield from self._parse_issues(issues)

            total_count = int(d.get('total_count', 0))
            offset += len(issues)
            print("Downloading issues: %d/%d" % (offset, total_count))
            if offset >= total_count:
                break

    def _parse_issues(self, issues):
        """Yield an :class:`Issue` for every JSON issue dict in *issues*."""
        for i in issues:
            yield Issue.from_json(self, **i)
    

def _parse_custom_fields(e):
    rv = {}
    for cf in e.findall("custom_field"):
        cfd = {}
        cfd.update(cf.attrib)
        v = cf.find("value")
        if v is None or v.text == 'blank':
            val = None
        else:
            val = v.text
        cfd['value'] = val
        rv[cfd['name']] = cfd
    return rv

def _parse_datetime(e):
    """Return the text of element *e* parsed by ``dateutil.parser.parse``."""
    text = e.text
    return dateutil.parser.parse(text)

def _parse_int(e):
    return int(e.text)

def _parse_bool(e):
    t = e.text.lower()
    if t == 'false':
        return False
    return True

def _parse_parent(e):
    return {k:v for k,v in e.attrib.items()}


class Project():
    """A tracker project, normally built from a ``<project>`` XML element.

    ``parent`` holds whatever the parser produced (a dict of the parent
    element's attributes) until :meth:`add_subproject` replaces it with the
    parent :class:`Project` object.
    """

    def __init__(self, api, id=0, name="", identifier="", description="", parent=None, status=None,
                 is_public=False, custom_fields=None, created_on=None, updated_on=None):
        self._api = api
        self.id = id
        self.name = name
        self.identifier = identifier
        self.description = description
        self.parent = parent
        self.status = status
        self.is_public = is_public
        self.custom_fields = custom_fields
        self.created_on = created_on
        self.updated_on = updated_on
        self._subprojects = []

    def add_subproject(self, sp):
        """Register *sp* as a child of this project and point its parent here."""
        sp.parent = self
        if sp not in self._subprojects:
            self._subprojects.append(sp)

    def __repr__(self):
        fields = ', '.join(
            "%s=%r" % (attr, getattr(self, attr)) for _tag, attr, _func in self._proj_parse_table
        )
        return "_Project(%s)" % fields

    def download_issues(self, utf8=True, columns='all'):
        """Download and parse this project's issue CSV through the API object."""
        # The API method is named download_project_issues_csv; the previous
        # name (download_project_issues) does not exist on IssuetrackerAPI.
        return self._api.download_project_issues_csv(self.identifier, utf8, columns)

    def download_gantt(self):
        """Return the issues listed on this project's Gantt page."""
        return self._api.download_gantt(self.identifier)

    _proj_parse_table = [
        # e.tag attr parse function
        ("id", "id", _parse_int),
        ("name", "name", None),
        ("identifier", "identifier", None),
        ("description", "description", None),
        ("parent", "parent", _parse_parent),
        ("status", "status", _parse_int),
        ("is_public", "is_public", _parse_bool),
        ("custom_fields", "custom_fields", _parse_custom_fields),
        ("created_on", "created_on", _parse_datetime),
        ("updated_on", "updated_on", _parse_datetime),
    ]

    @classmethod
    def from_element(cls, api, e):
        """Build a Project from XML element *e*.

        Child tags listed in ``_proj_parse_table`` are converted with their
        parse function (or taken as raw text); missing children and ``None``
        results fall back to the constructor defaults.

        :raises ValueError: if nothing could be parsed and *e* is not a
            ``<project>`` element.
        """
        kw = {}
        for tag, k, func in cls._proj_parse_table:
            el = e.find(tag)
            if el is None:
                continue
            v = func(el) if func else el.text
            if v is not None:
                kw[k] = v
        if not kw and e.tag != 'project':
            raise ValueError("Failed to parse element: element should be <project> element.")
        return cls(api, **kw)
        

In [426]:
# Credentials are read from the environment (falling back to an interactive
# prompt) so that no secret is stored in the notebook source.
import getpass
import os

_it_user = os.environ.get("ISSUETRACKER_USER") or input("Issuetracker username: ")
_it_pw = os.environ.get("ISSUETRACKER_PASSWORD") or getpass.getpass("Issuetracker password: ")
i = IssuetrackerAPI("issue.pbsbiotech.com", _it_user, _it_pw)
r2 = i.login()

# Keep an already-populated `jd` when this cell is re-run; create it otherwise.
try:
    jd
except NameError:
    jd = {'issue': {}}
#issues = i.download_project_issues('pbssoftware')
#iss = next(_ for _ in issues.values())

In [None]:
import json

# Fetch one issue as JSON and merge its fields into the running `jd` record.
_issue_resp = i._sess.get("https://issue.pbsbiotech.com/issues/2206.json", auth=i._auth)
jd2 = json.loads(_issue_resp.content.decode())
jd['issue'].update(jd2['issue'])

# Promote every custom field to a top-level key named after the field.
for f in jd2['issue']['custom_fields']:
    jd['issue'][f['name']] = f['value']
#print(jd2['issue']['custom_fields'])
#print(json.dumps(jd, indent=4))


In [None]:
#raw=i._download_project_csv('pbssoftware', True, 'all')
# NOTE: `raw` is not assigned in this cell; it must already exist (for
# example from running the commented-out line above).
# Take the CSV header row and strip the quotes from each column name.
r = raw.content.decode().splitlines()[0].split(",")
r = [_.replace('"', "") for _ in r]
# The next four statements are order dependent: each assignment overwrites
# whatever currently sits in slot 0, then the list is re-sorted
# case-insensitively.
# presumably this renames the CSV-only headers to their JSON names -- TODO confirm
r[0] = 'id'
r.sort(key=str.lower)
r[0] = 'done_ratio'
r.sort(key=str.lower)

# Case-insensitively sorted field names collected from the JSON issue.
js = sorted(jd['issue'].keys(), key=lambda s: s.lower())
from itertools import zip_longest
# Print both name lists side by side, padding the shorter one with "".
print("   R               JS")
for a, b in zip_longest(r, js, fillvalue=""):
    print("%-30s %-30s"%(a,b))

In [433]:
# Create the issue generator for 'pbssoftware' with a created_on filter;
# nothing is requested until the generator is consumed.
d=i.download_issues('pbssoftware', ">=2016-08-10")
def iter5(ob):
    """Return the next five items of iterator *ob*, padding with None once it is exhausted."""
    return [next(ob, None) for _ in range(5)]
import collections
import datetime

# Feeding an iterable to a zero-length deque consumes it without storing anything.
exhaust = collections.deque(maxlen=0).extend

# Materialise the generator, then show how each issue's creation date
# compares with the cut-off date.
l = list(d)
d2 = datetime.date(2016, 8, 10)
for item in l:
    co = item.created_on.date()
    if co < d2:
        mark = "<"
    elif co > d2:
        mark = ">"
    else:
        mark = "="
    print("Issue #%d" % item.id, item.created_on, mark, d2)

Downloading issues: 31/31
Issue #3270 2016-08-30 17:02:40+00:00 > 2016-08-10
Issue #3269 2016-08-30 16:56:23+00:00 > 2016-08-10
Issue #3268 2016-08-29 20:56:07+00:00 > 2016-08-10
Issue #3267 2016-08-29 17:21:04+00:00 > 2016-08-10
Issue #3266 2016-08-24 17:11:13+00:00 > 2016-08-10
Issue #3264 2016-08-23 17:54:45+00:00 > 2016-08-10
Issue #3261 2016-08-19 18:41:28+00:00 > 2016-08-10
Issue #3260 2016-08-19 18:34:35+00:00 > 2016-08-10
Issue #3259 2016-08-18 22:23:38+00:00 > 2016-08-10
Issue #3258 2016-08-18 22:05:43+00:00 > 2016-08-10
Issue #3255 2016-08-17 23:58:28+00:00 > 2016-08-10
Issue #3254 2016-08-17 22:36:36+00:00 > 2016-08-10
Issue #3253 2016-08-17 21:45:29+00:00 > 2016-08-10
Issue #3252 2016-08-17 19:04:50+00:00 > 2016-08-10
Issue #3251 2016-08-17 18:59:37+00:00 > 2016-08-10
Issue #3250 2016-08-16 23:47:08+00:00 > 2016-08-10
Issue #3249 2016-08-16 23:15:42+00:00 > 2016-08-10
Issue #3248 2016-08-16 23:11:04+00:00 > 2016-08-10
Issue #3247 2016-08-16 22:57:57+00:00 > 2016-08-10
Issue

In [424]:
# Empty the description and custom fields of the first issue, then display
# it (presumably to keep the printed repr short -- TODO confirm).
l[0].description = ""
l[0].custom_fields = []
l[0]

Issue(author=User(name='James Small', id=42), custom_fields=[], sprint_milestone=ResourceWithID(name='Future Release', id=33), category=None, status=ResourceWithID(name='New', id=1), company=None, created_on=datetime.datetime(2016, 8, 30, 17, 2, 40, tzinfo=tzutc()), description='', subject='List all interlocks in Shell UI', done_ratio=None, crm_reply_token=None, updated_on=datetime.datetime(2016, 8, 30, 17, 2, 40, tzinfo=tzutc()), id=3270, project=_Project(id=5, name='Software', identifier='pbssoftware', description='This project collects all issues and features in regards to software', parent=None, status=1, is_public=False, custom_fields={'Customer Information': {'name': 'Customer Information', 'id': '2', 'value': None}}, created_on=datetime.datetime(2010, 10, 14, 10, 55, 55, tzinfo=tzutc()), updated_on=datetime.datetime(2010, 10, 14, 11, 7, 58, tzinfo=tzutc())), contact=None, priority=ResourceWithID(name='Normal', id=4), due_date=None, estimated_hours=None, tracker=ResourceWithID(na

In [448]:
def _unrecognized_kw(kw):
    return ValueError("Unrecognized keywords: %s" % (', '.join(repr(s) for s in kw)))

def _iss_parse_datetime(api, a, v):
    """Parse string *v* with ``dateutil.parser.parse``; *api* and *a* are unused."""
    parsed = dateutil.parser.parse(v)
    return parsed

def _iss_parse_int(api, a, v):
    return int(v)

def _iss_parse_usr(api, a, v):
    """Build a ``User`` from a ``{'name': ..., 'id': ...}`` dict.

    :param api: API object handed to the new ``User``.
    :param a: target attribute name (unused).
    :param v: dict describing the user; it is left unmodified.
    :raises KeyError: if ``'name'`` or ``'id'`` is missing.
    :raises ValueError: if *v* contains any other key.
    """
    # Pop from a copy so the caller's parsed JSON is not emptied.
    remaining = dict(v)
    name = remaining.pop('name')
    user_id = remaining.pop('id')
    if remaining:
        raise _unrecognized_kw(remaining)
    return User(api, name, user_id)

def _iss_parse_resource(api, a, v):
    """Build a ``ResourceWithID`` from a dict with 'name', 'id' and optional 'value'.

    :param api: API object handed to the new resource.
    :param a: target attribute name (unused).
    :param v: dict describing the resource; it is left unmodified.
    :raises KeyError: if ``'name'`` or ``'id'`` is missing.
    :raises ValueError: if *v* contains any other key.
    """
    # Pop from a copy so the caller's parsed JSON is not emptied.
    remaining = dict(v)
    name = remaining.pop('name')
    res_id = remaining.pop('id')
    value = remaining.pop('value', "")
    if remaining:
        raise _unrecognized_kw(remaining)
    return ResourceWithID(api, name, res_id, value)

def _iss_parse_project(api, a, v):
    return api.projects[v['name']]

def _parse_custom_fields(api, a, v):
    fields = {}
    for d in v:
        name = d.pop('name')
        id = d.pop('id')
        val = d.pop('value', "")
        if d:
            raise _unrecognized_kw(d)
        r = ResourceWithID(api, name, id, val)
        fields[name] = val
    return fields


class ResourceWithID():
    """A named tracker resource with an id and an optional value.

    The repr shows only ``name`` and ``id``.
    """

    def __init__(self, api, name, id, value=""):
        self.api, self.name, self.id, self.value = api, name, id, value

    def __repr__(self):
        shown = ("name", 'id')
        parts = ["%s=%r" % (attr, getattr(self, attr)) for attr in shown]
        return "%s(%s)" % (type(self).__name__, ', '.join(parts))


class User(ResourceWithID):
    """A ResourceWithID that carries no ``value`` attribute."""

    def __init__(self, api, name, id):
        super().__init__(api, name, id)
        # The base initialiser always sets ``value``; remove it again.
        delattr(self, 'value')


class Issue():
    """One tracker issue, usually built from a JSON dict via :meth:`from_json`."""

    # (JSON key, attribute name, converter); a converter of None stores the
    # raw JSON value.
    _issue_parse_tbl = [
        ("author", "author", _iss_parse_usr),
        ("custom_fields", "custom_fields", _parse_custom_fields),
        ("fixed_version", "sprint_milestone", _iss_parse_resource),  # oddly named. TODO double check this
        ("category", "category", None),
        ("status", "status", _iss_parse_resource),
        ("company", "company", None),
        ("created_on", "created_on", _iss_parse_datetime),
        ("description", "description", None),
        ("subject", "subject", None),
        ("done_ratio", "done_ratio", None),
        ("crm_reply_token", "crm_reply_token", None),
        ("updated_on", "updated_on", _iss_parse_datetime),
        ("id", "id", _iss_parse_int),
        ("project", "project", _iss_parse_project),
        ("contact", "contact", None),
        ("priority", "priority", _iss_parse_resource),
        ("due_date", "due_date", _iss_parse_datetime),
        ("estimated_hours", "estimated_hours", None),
        ("tracker", "tracker", _iss_parse_resource),
        ("parent", "parent", None),
        ("closed_on", "closed_on", _iss_parse_datetime),
        ("start_date", "start_date", _iss_parse_datetime),
        ("tracking_uri", "tracking_uri", None),
        ("assigned_to", "assigned_to", _iss_parse_usr),
    ]

    def __init__(self, api, author=None, custom_fields=None, sprint_milestone=None, category=None, status=None,
                 company=None, created_on=None, description=None, subject=None, done_ratio=None, crm_reply_token=None,
                 updated_on=None, id=None, project=None, contact=None, priority=None, due_date=None, estimated_hours=None,
                 tracker=None, parent=None, closed_on=None, start_date=None, tracking_uri=None, assigned_to=None):

        self._api = api

        self.author = author
        self.custom_fields = custom_fields
        self.sprint_milestone = sprint_milestone
        self.category = category
        self.status = status
        self.company = company
        self.created_on = created_on
        self.description = description
        self.subject = subject
        self.done_ratio = done_ratio
        self.crm_reply_token = crm_reply_token
        self.updated_on = updated_on
        self.id = id
        self.project = project
        self.contact = contact
        self.priority = priority
        self.due_date = due_date
        self.estimated_hours = estimated_hours
        self.tracker = tracker
        self.parent = parent
        self.closed_on = closed_on
        self.start_date = start_date
        self.tracking_uri = tracking_uri
        self.assigned_to = assigned_to

    @classmethod
    def from_json(cls, api, **kw):
        """Build an Issue from the keys of one JSON issue object.

        Keys listed in ``_issue_parse_tbl`` are converted and stored under
        their attribute name. Missing keys, ``None`` and empty
        strings/containers leave the attribute at its ``None`` default.

        :raises ValueError: if *kw* contains a key not in the table.
        """
        dct = {}
        absent = object()
        for k, attr, func in cls._issue_parse_tbl:
            v = kw.pop(k, absent)
            if v is absent or v is None:
                continue
            # Empty strings/containers carry no information, but a numeric
            # zero (e.g. done_ratio 0) is a real value and must be kept.
            if not v and not isinstance(v, (int, float)):
                continue
            if func:
                v = func(api, attr, v)
            dct[attr] = v
        if kw:
            raise _unrecognized_kw(kw)
        return cls(api, **dct)

    def pretty_print(self):
        """Return ``Issue(attr=value, ...)`` covering every table attribute."""
        attrs = [t[1] for t in self._issue_parse_tbl]
        args = ", ".join("%s=%r" % (a, getattr(self, a)) for a in attrs)
        if not args:
            args = '<empty>'
        return "Issue(%s)" % args

    def download(self, type='pdf'):
        """Return this issue's PDF bytes; *type* is currently ignored."""
        return self._api.download_issue_pdf(self.id)

    __str__ = __repr__ = pretty_print


In [None]:
import threading
import queue

def download_worker(url, q, user, pw):
    """Thread body: log in with a fresh API object, then save PDFs for queued issues.

    Issues are taken from *q* until a ``None`` sentinel arrives; each one is
    downloaded and passed to ``write`` together with its id.
    """
    api = IssuetrackerAPI(url, user, pw)
    api.login()
    iss = q.get(True)
    while iss is not None:
        write(iss.id, iss.download())
        iss = q.get(True)
        
def write(id, pdf):
    """Write the bytes *pdf* to ``pdfs2/<id>.pdf`` (the directory must already exist)."""
    target = "pdfs2/%d.pdf" % id
    with open(target, 'wb') as out:
        out.write(pdf)
        
def download_issues(issues, filter_cb=lambda i: True, n_workers=8):
    """Download the PDFs of all issues accepted by *filter_cb* using worker threads.

    :param issues: iterable of issues; connection settings are taken from the
        first accepted one.
    :param filter_cb: predicate choosing which issues to download.
    :param n_workers: number of ``download_worker`` threads to start.

    Prints a progress line twice a second until every worker has exited.
    Returns ``None``; when nothing passes the filter no thread is started.
    """
    import time

    issues = [iss for iss in issues if filter_cb(iss)]
    if not issues:
        # Previously this fell through to issues[0] and raised IndexError.
        print("No issues to download")
        return

    api = issues[0]._api
    # The constructor accepts a full URL unchanged, so the base URL can be
    # handed over as-is.
    url = api._base_url
    user = api._username
    pw = api._password

    q = queue.Queue()
    threads = [
        threading.Thread(None, download_worker, args=(url, q, user, pw), daemon=True)
        for _ in range(n_workers)
    ]
    for t in threads:
        t.start()
    for iss in issues:
        q.put(iss)
    # One sentinel per worker tells it to exit once the queue is drained.
    for _ in threads:
        q.put(None)

    total = len(issues)
    while any(t.is_alive() for t in threads):
        # The queue is FIFO, so while issues remain all sentinels are still
        # queued behind them; subtract them to count only pending issues.
        pending = max(q.qsize() - n_workers, 0)
        print("\rDownloading multithreaded %d/%d           " % (total - pending, total), end="")
        time.sleep(.5)
    print("Done")
        

# Credentials are read from the environment (falling back to an interactive
# prompt) so that no secret is stored in the notebook source.
import getpass
import os
import shutil

_it_user = os.environ.get("ISSUETRACKER_USER") or input("Issuetracker username: ")
_it_pw = os.environ.get("ISSUETRACKER_PASSWORD") or getpass.getpass("Issuetracker password: ")
i = IssuetrackerAPI('issue.pbsbiotech.com', _it_user, _it_pw)
i.login()
issues=i.download_issues('pbssoftware', modified_on=">=2016-6-01")

# Start from an empty output directory; ignore_errors covers the first run,
# when the directory does not exist yet.
shutil.rmtree("pdfs2", ignore_errors=True)
os.makedirs("pdfs2", exist_ok=True)

def filter_cb(iss):
    """Accept only "Specification" issues assigned to milestone "3.0".

    Issues without a milestone or tracker (attribute is ``None``) are
    rejected instead of raising AttributeError.
    """
    milestone = iss.sprint_milestone
    tracker = iss.tracker
    if milestone is None or tracker is None:
        return False
    return milestone.name == "3.0" and tracker.name == "Specification"

# Download the PDFs of every issue accepted by filter_cb.
download_issues(issues, filter_cb)

# for n, iss in enumerate(issues, 1):
#     fn = "pdfs/%d.pdf" % iss.id
#     pdf = iss.download()
#     print("\rDownloading: %d / %d       " % (n, len(issues)), end="")
#     with open(fn, 'wb') as f:
#         f.write(pdf)

Downloading issues: 100/670
Downloading issues: 200/670
Downloading issues: 300/670
Downloading issues: 400/670
Downloading issues: 500/670
Downloading issues: 600/670
Downloading issues: 670/670
Downloading multithreaded 20/109           

In [457]:
import threading
# Exploratory call on the class itself (no instance); it raises the
# TypeError recorded in the output below.
threading.Thread.is_alive()

TypeError: is_alive() missing 1 required positional argument: 'self'

In [461]:
import queue
# Exploratory check: a freshly created queue reports a size of 0.
queue.Queue().qsize()

0