#!/usr/bin/env python
"""Grab URLs from the clipboard, interpret the queries as OpenID, and print.

In addition to URLs, the script also scans for queries as they appear in
httpd log files (a pattern like 'GET /foo?bar=baz HTTP') and in Tamper Data
'POSTDATA=' dumps.

Requires the 'xsel' program to get the contents of the clipboard.
"""

from pprint import pformat
from urlparse import urlsplit, urlunsplit
import cgi, re, subprocess, sys

from openid import message

OPENID_SORT_ORDER = ['mode', 'identity', 'claimed_id']

class NoQuery(Exception):
    def __init__(self, url):
        self.url = url

    def __str__(self):
        return "No query in url %s" % (self.url,)


def getClipboard():
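    """Return the contents of the X clipboard selection via 'xsel -o -b'."""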
    xsel = subprocess.Popen(["xsel", "-o", "-b"], stdout=subprocess.PIPE)
    output = xsel.communicate()[0]
    return output


def main():
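    """Collect queries from the clipboard (POSTDATA dumps, URLs, httpd log
    lines) and print each one, formatted as an OpenID message.
    """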
    source = getClipboard()
    urls = find_urls(source)

    errors = []
    output = []
    queries = []

    queries.extend(queriesFromPostdata(source))

    for url in urls:
        try:
            queries.append(queryFromURL(url))
        except NoQuery, err:
            errors.append(err)

    queries.extend(queriesFromLogs(source))

    for where, query in queries:
        output.append('at %s:\n%s' % (where, openidFromQuery(query)))

    if output:
        print '\n\n'.join(output)
    elif errors:
        for err in errors:
            print err


def queryFromURL(url):
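    """Split a URL into (url_without_query, parsed_query).

    Raises NoQuery if the URL carries no query string.

    >>> queryFromURL('http://example.com/server?openid.mode=id_res')
    ('http://example.com/server', {'openid.mode': ['id_res']})
    """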
    split_url = urlsplit(url)
    query = cgi.parse_qs(split_url[3])

    if not query:
        raise NoQuery(url)

    url_without_query = urlunsplit(split_url[:3] + (None, None))

    return (url_without_query, query)


def openidFromQuery(query):
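    """Format a parsed query as an OpenID message.

    If the query cannot be interpreted as an OpenID message, fall back to
    pretty-printing the raw query dict.
    """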
    try:
        msg = message.Message.fromPostArgs(unlistify(query))
        s = formatOpenIDMessage(msg)
    except Exception, err:
        # XXX - side effect: report the failure on stderr, then fall back
        # to pretty-printing the raw query.
        sys.stderr.write(str(err) + '\n')
        s = pformat(query)

    return s


def formatOpenIDMessage(msg):
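    """Render an OpenID Message grouped by namespace.

    Well-known keys (OPENID_SORT_ORDER) are listed first within each
    namespace; the rest follow in sorted order.
    """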
    value_lists = {}
    for (ns_uri, ns_key), value in msg.args.items():
        l = value_lists.setdefault(ns_uri, {})
        l[ns_key] = value

    output = []

    for ns_uri, values in value_lists.items():
        ns_output = []

        alias = msg.namespaces.getAlias(ns_uri)
        if alias is message.NULL_NAMESPACE:
            alias = 'openid'
        ns_output.append(" %s <%s>" % (alias, ns_uri))

        for key in OPENID_SORT_ORDER:
            try:
                ns_output.append(" %s = %s" % (key, values.pop(key)))
            except KeyError:
                pass

        for k, v in sorted(values.items()):
            ns_output.append(" %s = %s" % (k, v))

        output.append('\n'.join(ns_output))

    return '\n\n'.join(output)


def unlistify(d):
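    """Flatten a parse_qs-style {key: [values]} dict, keeping first values.

    >>> unlistify({'openid.mode': ['checkid_setup']})
    {'openid.mode': 'checkid_setup'}
    """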
    return dict((k, v[0]) for k, v in d.items())


def queriesFromLogs(s):
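    """Extract (path, parsed_query) pairs from httpd-style request lines.

    >>> queriesFromLogs('GET /foo?bar=baz HTTP/1.1')
    [('/foo', {'bar': ['baz']})]
    """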
    # The path is matched non-greedily so the query starts at the first '?'.
    qre = re.compile(r'GET (/.*?)\?(.+) HTTP')

    return [(match.group(1), cgi.parse_qs(match.group(2)))
            for match in qre.finditer(s)]

def queriesFromPostdata(s):
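    r"""Extract (host-or-label, parsed_query) pairs from POSTDATA dumps.

    >>> queriesFromPostdata('Host=example.com\nPOSTDATA=openid.mode=id_res')
    [('example.com', {'openid.mode': ['id_res']})]
    """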
    # Look for query data on a line that starts with 'POSTDATA=', as written
    # out by the Tamper Data Firefox extension. If a 'Host=' line appears
    # earlier in the block, use it as the label, but don't require it.
    qre = re.compile(r'(?:^Host=(?P<host>.+?)$.*?)?^POSTDATA=(?P<query>.*)$',
                     re.DOTALL | re.MULTILINE)
    return [(match.group('host') or 'POSTDATA',
             cgi.parse_qs(match.group('query'))) for match in qre.finditer(s)]

def find_urls(s):
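    """Find things that look like URLs: http(s) URLs, guessed hostnames
    ending in a common TLD, and mailto: addresses.

    >>> find_urls('See http://example.com/foo?bar=baz for details.')
    ['http://example.com/foo?bar=baz']
    """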
    # Regular expression borrowed from urlscan
    # by Daniel Burrows <dburrows@debian.org>, GPL.
    urlinternalpattern = r'[{}a-zA-Z/\-_0-9%?&.=:;+,#~]'
    urltrailingpattern = r'[{}a-zA-Z/\-_0-9%&=+#]'
    httpurlpattern = (r'(?:https?://' + urlinternalpattern + r'*'
                      + urltrailingpattern + r')')
    # Used to guess that blah.blah.blah.TLD is a URL.
    tlds = ['biz', 'com', 'edu', 'info', 'org']
    guessedurlpattern = (r'(?:[a-zA-Z0-9_\-%]+(?:\.[a-zA-Z0-9_\-%]+)*\.(?:'
                         + '|'.join(tlds) + '))')
    mailtopattern = r'(?:mailto:[a-zA-Z0-9\-_]*@[0-9a-zA-Z_\-.]*[0-9a-zA-Z_\-])'
    urlre = re.compile(r'(?:<(?:URL:)?)?(' + httpurlpattern + '|'
                       + guessedurlpattern + '|' + mailtopattern + ')>?')

    return [match.group(1) for match in urlre.finditer(s)]


if __name__ == '__main__':
    main()