In [1]:
import ast
import json
import pickle
import re
import sqlite3
from collections import defaultdict
from cStringIO import StringIO
from exceptions import IndentationError
from HTMLParser import HTMLParser
from tokenize import COMMENT, generate_tokens

import numpy as np
from astor import to_source

In [2]:
# Path of the SQLite database file that every query in this notebook reads.
sqlite_file = 'store.db'
# Single shared connection; the `query` helper and the loading cells use it.
conn = sqlite3.connect(sqlite_file)

In [3]:
def query(sql):
    """Execute ``sql`` on the shared ``conn`` connection and return all rows.

    A fresh cursor is created for every call; the result is the list
    produced by ``fetchall()``.
    """
    cursor = conn.cursor()
    cursor.execute(sql)
    return cursor.fetchall()

In [4]:
# Map post id -> title for every post that has a title.
titles = dict(query('select id, title from post where title is not null'))

In [41]:
# Map post id -> accepted answer id for every post that has an accepted answer.
accepted_posts = dict(query('select id, accepted_answer_id from post where accepted_answer_id is not null'))

In [5]:
# Map post id -> HTML body for every row of the post table.
posts = dict(query('select id, body from post'))

In [6]:
# Group child posts under their parent: parent id -> [(score, post id), ...].
questions = defaultdict(list)
child_rows = query('select id, parent_id, score from post where parent_id is not null')
for post_id, parent_id, score in child_rows:
    questions[parent_id].append((score, post_id))

In [7]:
# Load every row of the annotation table as (post_id, decoded JSON annotation).
annotations = [(post_id, json.loads(annotation_json)) for post_id, annotation_json in query('select post_id, annotation_json from annotation')]

In [8]:
# Keep only the annotations whose 'notSure' flag is falsy.
confident_annotations = [(post_id, annotation) for post_id, annotation in annotations if not annotation['notSure']]

In [9]:
# For each confidently annotated post, keep the (score, post_id) pairs of its
# three highest-scoring child posts.
# NOTE: `questions` is a defaultdict, so looking up a post without children
# inserts an empty list for it as a side effect.
annotated_questions = {}
for post_id, _ in confident_annotations:
    annotated_questions[post_id] = set(sorted(questions[post_id], key=lambda x:-x[0])[:3])

In [10]:
def get_code_span(html, match):
    """Return the ``(start, end)`` offsets in ``html`` of the text captured by group 1.

    Args:
        html: The string that ``match`` was produced from. Unused now; kept so
            existing callers do not have to change.
        match: A regex match object whose first capture group is the code text.

    Returns:
        A ``(start, end)`` tuple such that ``html[start:end] == match.group(1)``.

    The offsets come straight from the regex engine. Searching the matched
    text for the captured string instead is wrong whenever that string also
    occurs earlier in the match: for ``<code>code</code>`` the search would
    find the tag name rather than the element content.
    """
    return match.span(1)

In [11]:
def get_code_spans(html, is_code):
    """Locate the code regions of ``html``.

    When ``is_code`` is false the whole string counts as a single region.
    Otherwise every ``<code>...</code>`` element (optionally containing
    ``<span>`` children) contributes the span computed by ``get_code_span``.
    Returns a list of ``(start, end)`` tuples.
    """
    if is_code:
        code_pattern = r'<code[^>]*>((?:\s|[^<]|<span[^>]*>[^<]+</span>)*)</code>'
        return [get_code_span(html, found) for found in re.finditer(code_pattern, html)]
    return [(0, len(html))]

In [12]:
def merge_spans(html, code_spans, sel_spans):
    """Yield, per code span, the characters that are both code and selected.

    Args:
        html: The source text that all offsets index into.
        code_spans: Sequence of ``(start, end)`` code regions. It is iterated
            twice, so it must be a re-iterable sequence, not a generator.
        sel_spans: Iterable of ``(start, end)`` selected regions.

    Yields:
        One string per code span that has at least one kept character, in the
        order of ``code_spans``.

    A character is kept when it lies in some code span and in some selected
    span. Whitespace directly after or directly before a kept character is
    kept too, so selections that stop at a word boundary retain their
    surrounding indentation and newlines.
    """
    length = len(html)

    # Separate boolean masks, so overlapping ranges of the same kind can
    # neither fake an intersection nor cancel a real one.
    in_code = np.zeros(length, dtype=bool)
    for start, end in code_spans:
        in_code[start:end] = True
    in_selection = np.zeros(length, dtype=bool)
    for start, end in sel_spans:
        in_selection[start:end] = True
    masks = in_code & in_selection

    # Grow the mask forward over whitespace that follows a kept character.
    for i in range(1, length):
        if html[i].isspace() and masks[i - 1]:
            masks[i] = True
    # Grow the mask backward over whitespace that precedes a kept character.
    # The range must include index length - 2 so it can look at the last one.
    for i in reversed(range(length - 1)):
        if html[i].isspace() and masks[i + 1]:
            masks[i] = True

    for start, end in code_spans:
        code = ''.join(c for c, m in zip(html[start:end], masks[start:end]) if m)
        if code:
            yield code

In [13]:
def parse_range(post_id, selected_range, is_code):
    """Resolve one selected range to its source text and offsets.

    ``selected_range['source']`` has the form ``'<kind>-<id>'``. A kind of
    ``'title'`` reads from the global ``titles`` mapping; any other kind
    reads from the global ``posts`` mapping. ``post_id`` and ``is_code`` are
    accepted but not used.

    Returns ``(text, start, end)``.
    """
    source, raw_id = selected_range['source'].split('-')
    lookup = titles if source == 'title' else posts
    text = lookup[int(raw_id)]
    return text, selected_range['start'], selected_range['end']

In [14]:
def parse_selection(post_id, selection, is_code):
    """Turn one annotated selection into ``(source_text, selected_text, reference_text)``.

    Every range in ``selection['pos']`` must resolve to the same source
    text (asserted). The selected text is the intersection of those ranges
    with the code spans of the source, joined with newlines. The reference
    text is ``selection['html']`` stripped of surrounding whitespace and of
    anything that looks like an HTML tag.
    """
    ref_text = selection['html']
    source_text = None
    sel_spans = []
    for selected_range in selection['pos']:
        text, start, end = parse_range(post_id, selected_range, is_code)
        if source_text is not None:
            assert source_text == text
        else:
            source_text = text
        sel_spans.append((start, end))
    code_spans = get_code_spans(source_text, is_code)
    sel_text = '\n'.join(merge_spans(source_text, code_spans, sel_spans))
    plain_ref = re.sub('<[^<]+?>', '', ref_text.strip())
    return source_text, sel_text, plain_ref

In [15]:
def parse_selections(post_id, selections, is_code=True):
    """Parse each selection in ``selections``; ``None`` is treated as no selections.

    Returns a list with one ``parse_selection`` result per input selection.
    """
    parsed = []
    if selections is not None:
        for selection in selections:
            parsed.append(parse_selection(post_id, selection, is_code))
    return parsed

In [16]:
def parse_annotation(post_id, annotation):
    """Parse the three selection groups of one annotation.

    The ``'question'`` selections are parsed as plain text and stored under
    ``'intent'``. The ``'context'`` and ``'snippet'`` selections are parsed
    as code.
    """
    intent = parse_selections(post_id, annotation['question'], is_code=False)
    context = parse_selections(post_id, annotation['context'])
    snippet = parse_selections(post_id, annotation['snippet'])
    return {
        'post_id': post_id,
        'intent': intent,
        'context': context,
        'snippet': snippet,
    }

In [17]:
# Parse every confident annotation into its intent/context/snippet selections.
parsed_confident_annotations = [parse_annotation(post_id, a) for post_id, a in confident_annotations]

In [18]:
def unescape(text, parser=HTMLParser()):
    """Return ``text`` with HTML character references decoded by ``parser.unescape``.

    The default parser is created once, when the function is defined, and
    is reused by every call.
    """
    decoded = parser.unescape(text)
    return decoded

In [19]:
def get_code_list(html_list, is_code=True):
    """Yield the unescaped text of every code span in every item of ``html_list``.

    Spans come from ``get_code_spans(html, is_code)``, so with
    ``is_code=False`` each item is yielded whole (after unescaping).
    """
    for fragment in html_list:
        spans = get_code_spans(fragment, is_code)
        for span_start, span_end in spans:
            yield unescape(fragment[span_start:span_end])

In [20]:
# Flatten each parsed annotation into one record.
# Each parsed selection is a (source_text, selected_text, reference_text) tuple.
#   *_ref  : the unescaped selected_text of every selection, joined by newlines
#   *_text : the set of unescaped code spans found in the full source_text
final_annotations = []
# Number of annotations skipped because their snippet_ref came out empty.
count = 0
for a in parsed_confident_annotations:
    aa = {
        'post_id': a['post_id'],
        'intent_ref': '\n'.join(unescape(text) for _, text, _ in a['intent']),
        'context_ref': '\n'.join(unescape(text) for _, text, _ in a['context']),
        'snippet_ref': '\n'.join(unescape(text) for _, text, _ in a['snippet']),
        'intent_text': set(get_code_list((text for text, _, _ in a['intent']), False)),
        'context_text': set(get_code_list(text for text, _, _ in a['context'])),
        'snippet_text': set(get_code_list(text for text, _, _ in a['snippet'])),
    }
    # Hard-coded patch for one specific context value.
    # NOTE(review): presumably an annotation whose selection offsets were
    # misaligned, leaving truncated text; confirm against the source data.
    if aa['context_ref'] == u'ort itertools\n&g\nt2d = [[1,2,3],[4,5,6], [7], [8,9]]\n&g':
        aa['context_ref'] = u'import itertools\nlist2d = [[1,2,3],[4,5,6], [7], [8,9]]\n'
    # Records without any snippet text are printed for inspection and dropped.
    if not aa['snippet_ref']:
        print a
        print '-------------------------'
        count += 1
    else:
        final_annotations.append(aa)
# Display how many annotations were dropped.
count

{'snippet': [(u'<p>The pythonic way to do it is from the <a href="https://www.python.org/dev/peps/pep-0008" rel="nofollow noreferrer">PEP 8 style guide</a>:</p>\n\n<blockquote>\n  <p>For sequences, (strings, lists, tuples), use the fact that empty sequences are false.\n  </p>\n\n<pre><code><b>Yes:</b> if not seq:\n     if seq:\n\n<b>No:</b>  if len(seq):\n     if not len(seq):\n</code></pre>\n</blockquote>\n', '', u'if not seq:')], 'post_id': 53513, 'intent': [(u'Best way to check if a list is empty', u' check if a list is empty', u'check if a list is empty')], 'context': []}
-------------------------


1

In [48]:
# Inspect the body of one accepted answer.
# NOTE(review): in the visible notebook `accepted_id` is only assigned by the
# baseline loop in the cell below (In [59]), so this cell raises NameError on
# a fresh top-to-bottom run. It shows whichever id that loop processed last.
posts[accepted_id]

u'<p>You have your slash backwards, it should be <code>"\\n"</code></p>\n'

In [59]:
# Build a baseline: for each annotated question that has an accepted answer
# containing exactly one distinct code span, store that span's normalised code.
# NOTE(review): `normalize_code` is defined in a later cell (In [29]), so this
# cell only works after that cell has been run.
baseline = {}
for post_id in [a['post_id'] for a in final_annotations]:
    if post_id not in accepted_posts:
        continue
    accepted_id = accepted_posts[post_id]
    code_set = set(get_code_list([posts[accepted_id]]))
    # Skip answers with zero or several distinct code spans.
    if len(code_set) == 1:
        code = normalize_code(code_set.pop())
        # normalize_code returns None when the code does not parse.
        if code is not None:
            baseline[post_id] = code

\n
--------------------
<type 'exceptions.SyntaxError'>
unexpected character after line continuation character (<unknown>, line 1)


In [60]:
# Persist the baseline mapping. The context manager closes the file, and so
# flushes it to disk, as soon as the dump finishes instead of leaving that to
# garbage collection.
with open('baseline.p', 'wb') as baseline_file:
    pickle.dump(baseline, baseline_file)

In [21]:
for a in final_annotations:
    if a['snippet_ref'].strip():
        pass
    else:
        print a

In [22]:
def from_console(code, prompts=[' >>>', '  >>> ', '>>> ', '... ', '$ ']):
    """Return True when any line of ``code`` starts with one of ``prompts``."""
    markers = tuple(prompts)
    return any(line.startswith(markers) for line in code.split('\n'))

In [23]:
def console_extract(code, prompts=[' >>>', '  >>> ', '>>> ', '... ', '$ ']):
    """Keep only the prompt lines of a console transcript, minus the prompt.

    Lines that start with none of ``prompts`` are dropped. For the others,
    the first matching prompt (in list order) is removed. The result is
    joined with newlines.
    """
    extracted = []
    for raw in code.split('\n'):
        prefix = next((p for p in prompts if raw.startswith(p)), None)
        if prefix is not None:
            extracted.append(raw[len(prefix):])
    return '\n'.join(extracted)

In [24]:
def from_ipython(code, patterns=[re.compile(r'In \[\d+\]: '), re.compile(r'In \[\d+\]:')]):
    """Return True when any line of ``code`` begins with an IPython input prompt."""
    return any(p.match(line) for line in code.split('\n') for p in patterns)

In [25]:
def ipython_extract(code, patterns=[re.compile(r'In \[\d+\]: '), re.compile(r'   \.\.\.\: '), re.compile(r'In \[\d+\]:'), re.compile(r'   \.\.\.\:')]):
    """Keep only the input lines of an IPython transcript, minus the prompt.

    A line is kept when one of ``patterns`` matches at its start. The text
    after the first matching pattern (in list order) is retained. All other
    lines are dropped. The result is joined with newlines.
    """
    kept = []
    for raw in code.split('\n'):
        hit = next((m for m in (p.match(raw) for p in patterns) if m), None)
        if hit:
            kept.append(raw[hit.end():])
    return '\n'.join(kept)

In [26]:
def remove_comment(code):
    """Strip ``#`` comments from ``code``, one line at a time.

    Each line is tokenized on its own. When a comment token is found, the
    line is cut at the column where the comment starts, so whitespace before
    the ``#`` is preserved. ``#`` characters inside string literals are left
    alone because the tokenizer reports them as part of the string.

    Args:
        code: Source text, possibly spanning several lines.

    Returns:
        The text with comments removed and the same number of lines.
    """
    lines = code.split('\n')
    for i, line in enumerate(lines):
        try:
            for toknum, _, (_, start), _, _ in generate_tokens(StringIO(line).readline):
                # Compare against the named constant: the numeric value of
                # the comment token differs between Python versions.
                if toknum == COMMENT:
                    lines[i] = line[:start]
        except Exception:
            # A single line is often not tokenizable on its own (open
            # brackets, continuation of a multi-line string, ...). Best
            # effort: keep whatever was produced for this line and move on.
            pass
    return '\n'.join(lines)

In [27]:
def remove_indents(code):
    """Drop blank lines and remove the common leading indentation.

    The indentation of the least-indented non-blank line is taken as the
    common prefix. If any non-blank line does not start with exactly that
    prefix (for example tabs mixed with spaces), or if there are no
    non-blank lines at all, ``code`` is returned unchanged.
    """
    kept = [ln for ln in code.split('\n') if ln.strip()]
    if len(kept) == 0:
        return code
    width, sample = min((len(ln) - len(ln.lstrip()), ln) for ln in kept)
    prefix = sample[:width]
    if not all(ln.startswith(prefix) for ln in kept):
        return code
    return '\n'.join(ln[width:] for ln in kept)

In [28]:
def add_pass(code):
    """Append ``pass`` when ``code`` ends with a colon, ignoring trailing whitespace.

    In that case the trailing whitespace is dropped and ``'pass'`` is
    attached directly after the colon. Otherwise ``code`` is returned
    untouched.
    """
    trimmed = code.rstrip()
    if trimmed.endswith(':'):
        return trimmed + 'pass'
    return code

In [29]:
def normalize_code(code):
    if from_console(code):
        code = console_extract(code)
    elif from_ipython(code):
        code = ipython_extract(code)
    code = remove_comment(code)
    code = remove_indents(code)
    code = add_pass(code)
    if 'print(' in code and 'print_function' not in code:
        code = 'from __future__ import print_function\n' + code
    try:
        return to_source(ast.parse(code))
    except Exception as ex:
        print code
        print '--------------------'
        print type(ex)
        print ex
        print '===================='
    return None

In [30]:
# Check that every annotation normalises: context and snippet must each parse
# on their own, and so must their concatenation.
passed_count = 0
failed_count = 0
# Annotations that failed any of the three normalisation steps.
falied_list = []
for i, a in enumerate(final_annotations):
    #for code in a['context_text'] | a['snippet_text']:
    context = normalize_code(a['context_ref'])
    snippet = normalize_code(a['snippet_ref'])
    code = None
    # Only try the combined code when both halves normalised on their own.
    if context is not None and snippet is not None:
        code = normalize_code(context + '\n' + snippet)
    if code is not None:
        passed_count += 1
    else:
        # Print the post id so it can be reviewed or filtered out later.
        print a['post_id']
        falied_list.append(a)
        failed_count += 1

if x in l
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
next(x for x in lst if ...)
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
next((x for x in lst if ...), None)
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
next((i for i, x in enumerate(lst) if [condition on x]), [default value])
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
{**x, **y}
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
38987
arrays = []
parser = csv.reader(open(your_file), delimiter=' '))
for l in parser: 
    arrays.append(np.array((array.float(i) for i in l)))
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 2)
6213336
parser = csv.reader(open(your_file), delimiter=' '))
make_array = lambda row : np.array((array.float(i) for i in row)) 
arrays 

In [31]:
# Display how many annotations passed and failed normalisation.
passed_count, failed_count

(262, 9)

In [32]:
# Persist the unfiltered annotations. The context manager closes the file,
# and so flushes it to disk, as soon as the dump finishes.
with open('annotations.p', 'wb') as annotations_file:
    pickle.dump(final_annotations, annotations_file)

In [33]:
# Post ids to exclude. These are the ids printed as failures by the
# normalisation check above.
filter_out_questions = {9542738, 38987, 6213336}

In [34]:
# Drop every annotation that belongs to an excluded question.
final_annotations = [a for a in final_annotations if a['post_id'] not in filter_out_questions]

In [35]:
# Display the number of annotations left after filtering.
len(final_annotations)

250

In [36]:
# Replace the raw context/snippet text with its normalised form, in place,
# and trim whitespace around the intent.
# NOTE(review): this mutates the records, so re-running the cell normalises
# text that is already normalised.
for a in final_annotations:
    a['context_ref'] = normalize_code(a['context_ref'])
    a['snippet_ref'] = normalize_code(a['snippet_ref'])
    a['intent_ref'] = a['intent_ref'].strip()

In [37]:
# For every annotated question, collect its title and the code spans found in
# its three highest-scoring answers.
final_questions = {}
for a in final_annotations:
    post_id = a['post_id']
    if post_id not in final_questions:
        top_answers = sorted(questions[post_id], key=lambda x: -x[0])[:3]
        # Take the ids directly from the (score, post_id) pairs. Transposing
        # with zip(*...)[-1] raises IndexError when a question has no answers.
        answers = tuple(answer_id for _score, answer_id in top_answers)
        final_questions[post_id] = {
            'intent': titles[post_id],
            'snippet': set(get_code_list(posts[aid] for aid in answers))
        }

In [38]:
# Normalise each question's snippets, keeping the raw text of any snippet
# that fails to normalise.
for q in final_questions.values():
    normalized_snippet = set()
    for s in q['snippet']:
        ss = normalize_code(s)
        normalized_snippet.add(s if ss is None else ss)
    q['snippet'] = normalized_snippet

from __future__ import print_function
Command to parse                      isFloat?   Note
------------------------------------  --------   --------------------------------
print(isfloat(""))                    False      Blank string
print(isfloat("127"))                 True       Passed string
print(isfloat(True))                  True       Pure sweet Truth
print(isfloat("True"))                False      Vile contemptible lie
print(isfloat(False))                 True       So false it becomes true
print(isfloat("123.456"))             True       Decimal
print(isfloat("      -127    "))      True       Spaces trimmed
print(isfloat("\t\n12\r\n"))          True       whitespace ignored
print(isfloat("NaN"))                 True       Not a number
print(isfloat("NaNanananaBATMAN"))    False      I am Batman
print(isfloat("-iNF"))                True       Negative infinity
print(isfloat("123.E4"))              True       Exponential notation
print(isfloat(".1"))                  Tru

Counter():  [6.360648187146579, 6.613881559699756, 6.392260466851987]
count():    [12.885062765334006, 13.045601897769359, 12.87746743077426]
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
mapcount : 0.465599966049
simplecount : 0.756399965286
bufcount : 0.546800041199
opcount : 0.718600034714
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
\n
--------------------
<type 'exceptions.SyntaxError'>
unexpected character after line continuation character (<unknown>, line 1)
mapcount : 0.471799945831
simplecount : 0.634400033951
bufcount : 0.468800067902
opcount : 0.602999973297
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
+
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
if not li: ...
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
[begin:end:step]
--------------------
<type 'exceptions.Sy

In [40]:
# Persist the final questions and annotations. The context managers close
# each file, and so flush it to disk, as soon as its dump finishes.
with open('questions.p', 'wb') as questions_file:
    pickle.dump(final_questions, questions_file)
with open('annotations.p', 'wb') as annotations_file:
    pickle.dump(final_annotations, annotations_file)