In [1]:
import ast
import sqlite3
import json
import re
import numpy as np
import pickle
from collections import defaultdict
from exceptions import IndentationError
from HTMLParser import HTMLParser
from tokenize import generate_tokens
from cStringIO import StringIO
from astor import to_source

In [2]:
# path of the SQLite database file, resolved relative to the working directory
sqlite_file = 'store.db'
# module-level connection; the query() helper and later cells read from it
conn = sqlite3.connect(sqlite_file)

In [3]:
def query(sql):
    """Execute `sql` on the module-level `conn` and return every result row as a list."""
    cursor = conn.cursor()
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows

In [4]:
# map post id -> title for every post that has a title
titles = dict(query('select id, title from post where title is not null'))
# use a context manager so the pickle file is flushed and closed deterministically
with open('titles.p', 'wb') as titles_file:
    pickle.dump(titles, titles_file)

In [5]:
# map post id -> id of its accepted answer, for posts that have one
accepted_posts = dict(
    query('select id, accepted_answer_id from post where accepted_answer_id is not null')
)

In [6]:
# map post id -> html body for every post
posts = dict(query('select id, body from post'))
# use a context manager so the pickle file is flushed and closed deterministically
with open('posts.p', 'wb') as posts_file:
    pickle.dump(posts, posts_file)

In [7]:
# map parent (question) id -> list of (score, post_id) for every post that has a parent
questions = defaultdict(list)
for post_id, parent_id, score in query('select id, parent_id, score from post where parent_id is not null'):
    questions[parent_id].append((score, post_id))
# use a context manager so the pickle file is flushed and closed deterministically
with open('questions.p', 'wb') as questions_file:
    pickle.dump(questions, questions_file)

In [8]:
# load every user annotation as (post_id, decoded json); a post may appear
# several times because it can have multiple annotations
annotations = []
for post_id, annotation_json in query('select post_id, annotation_json from annotation'):
    annotations.append((post_id, json.loads(annotation_json)))

In [9]:
# keep only the annotations whose 'notSure' flag is falsy
confident_annotations = []
for post_id, annotation in annotations:
    if annotation['notSure']:
        continue
    confident_annotations.append((post_id, annotation))

In [10]:
# the database stores every answer of a post, but only the top 3 were considered;
# for each confidently annotated post keep the three highest-scored
# (score, answer_id) pairs recorded in `questions`
annotated_questions = {
    post_id: set(sorted(questions[post_id], key=lambda scored: -scored[0])[:3])
    for post_id, _ in confident_annotations
}

In [11]:
#get char-based offsets i.e (start_of_code_snippet, end_of_code_snippet) for all
#the code snippets inside a html body (post content)
def get_code_span(html, match):
    """Return the (start, end) offsets in `html` covered by group 1 of `match`.

    `match` is a regex match object obtained on `html`; group 1 is the code
    text. The offsets come straight from the match object rather than from
    searching for the group's text inside the match: a text search returns
    the first occurrence, which is wrong when the code text also appears
    earlier in the matched markup (e.g. the code ``pre`` inside ``<pre>``).

    `html` is unused but kept so existing callers keep working.
    """
    return match.span(1)

def get_code_spans(html, is_code):
    """Return a list of (start, end) offsets of the code regions in `html`.

    When `is_code` is false the whole text counts as a single span.
    Otherwise one span is returned per <pre><code>...</code></pre> block
    matched by the pattern below.
    """
    if is_code:
        pattern = r"<pre[^>]*>[^<]*<code[^>]*>((?:\s|[^<]|<span[^>]*>[^<]+</span>)*)</code></pre>"
        spans = []
        for found in re.finditer(pattern, html):
            spans.append(get_code_span(html, found))
        return spans
    return [(0, len(html))]

In [12]:
#get all the offsets of code selected by user, since the user selection might
#include html tag or some content outside <code> tag. Here we just take the
#intersections of code_snippets and user_selection
def merge_spans(html, code_spans, sel_spans):
    """Yield, per code span, the characters of `html` that are both code and selected.

    `code_spans` and `sel_spans` are iterables of (start, end) character
    offsets into `html`. A character is kept when it lies inside at least one
    code span and at least one selection span; whitespace directly adjacent
    to a kept character is kept as well. One string is yielded for every code
    span that has at least one kept character.
    """
    length = len(html)
    # boolean masks instead of counters, so that overlapping selection ranges
    # cannot push a character past the "exactly two" test and drop it
    in_code = np.zeros(length, dtype=bool)
    for start, end in code_spans:
        in_code[start:end] = True
    in_selection = np.zeros(length, dtype=bool)
    for start, end in sel_spans:
        in_selection[start:end] = True
    masks = in_code & in_selection
    # grow the kept region rightwards over whitespace that follows a kept char
    for i in range(1, length):
        if html[i].isspace() and masks[i - 1]:
            masks[i] = True
    # grow the kept region leftwards; start at length - 2 so the whitespace
    # just before the final character is examined too
    for i in reversed(range(length - 1)):
        if html[i].isspace() and masks[i + 1]:
            masks[i] = True
    for start, end in code_spans:
        code = [c for c, m in zip(html[start:end], masks[start:end]) if m]
        if len(code) > 0:
            yield ''.join(code)

In [13]:
#parse selection ranges to (html_text, start_offset, end_offset)
def parse_range(post_id, selected_range, is_code):
    """Resolve one selected range to (text, start, end).

    `selected_range['source']` has the form '<kind>-<id>'. The text is looked
    up in `titles` when the kind is 'title' and in `posts` otherwise.
    `post_id` and `is_code` are accepted but not used here.
    """
    kind, raw_id = selected_range['source'].split('-')
    key = int(raw_id)
    text = titles[key] if kind == 'title' else posts[key]
    return text, selected_range['start'], selected_range['end']

In [14]:
#parse annotation selection as (html_text, parsed_selected_text, saved_reference_text)
#when a user annotated a post, saved_reference_text was saved from the browser: it is the
#text selected by the user (without any html tag). The user might have mis-selected some
#text outside the code snippet, so it is only used as a reference for sanity checks; the
#ground truth is generated from the offsets returned by parse_range
def parse_selection(post_id, selection, is_code):
    """Return (source_text, selected_text, reference_text) for one selection.

    Every range in `selection['pos']` must resolve to the same source text
    (enforced with an assert). The selected text is rebuilt from the offsets
    with merge_spans; the reference text is `selection['html']` stripped and
    with html tags removed.
    """
    ref_text = selection['html']
    source_text = None
    sel_spans = []
    for selected_range in selection['pos']:
        text, start, end = parse_range(post_id, selected_range, is_code)
        if source_text is not None:
            assert source_text == text
        else:
            source_text = text
        sel_spans.append((start, end))
    code_spans = get_code_spans(source_text, is_code)
    sel_text = '\n'.join(merge_spans(source_text, code_spans, sel_spans))
    tagless_ref = re.sub('<[^<]+?>', '', ref_text.strip())
    return source_text, sel_text, tagless_ref

In [15]:
#parse multiple selections of a post as an array of (html_text, parsed_selected_text, saved_reference_text)
def parse_selections(post_id, selections, is_code=True):
    """Parse every entry of `selections` with parse_selection; None yields an empty list."""
    parsed = []
    if selections is not None:
        for selection in selections:
            parsed.append(parse_selection(post_id, selection, is_code))
    return parsed

In [16]:
#parse annotation record
def parse_annotation(post_id, annotation):
    """Build a dict with the parsed intent, context and snippet selections of one annotation.

    The intent comes from `annotation['question']` and is parsed with
    is_code=False; context and snippet use the default code-aware parsing.
    """
    intent = parse_selections(post_id, annotation['question'], is_code=False)
    context = parse_selections(post_id, annotation['context'])
    snippet = parse_selections(post_id, annotation['snippet'])
    return {
        'post_id': post_id,
        'intent': intent,
        'context': context,
        'snippet': snippet,
    }

In [17]:
# parse every confident annotation into its intent/context/snippet record
parsed_confident_annotations = []
for post_id, a in confident_annotations:
    parsed_confident_annotations.append(parse_annotation(post_id, a))

In [18]:
#unescape the html context (e.g. &amp; => &)
def unescape(text, parser=HTMLParser()):
    """Return `text` decoded by `parser.unescape`.

    The default `parser` is one HTMLParser instance created when this
    function is defined and reused by every call that does not pass its own.
    """
    return parser.unescape(text)

In [19]:
#get all the code snippets from html contexts (extracting the sub-text inside <code> tags)
#for future snippet-candidate generation
def get_code_list(html_list, is_code=True):
    """Lazily yield the unescaped text of every code span found in each html string."""
    for html in html_list:
        spans = get_code_spans(html, is_code)
        for span_start, span_end in spans:
            raw_snippet = html[span_start:span_end]
            yield unescape(raw_snippet)

In [20]:
#turn every parsed annotation record into a dict (*_ref: ground truth, *_text: meta candidates)
# post_id: question id
# intent_ref: selected intent
# context_ref: selected context
# snippet_ref: selected snippet
# intent_text/context_text/snippet_text: set of the code snippets extracted from the html text

# due to tag completion (done by the browser), the selected text recovered from the saved
# offsets might not match the reference text; such cases are printed for manual examination.
def _preferred_ref(text, text_ref):
    # use the recovered text when it agrees with the browser reference, else the reference
    if text.strip() == text_ref.strip():
        return unescape(text)
    return unescape(text_ref)

def _print_mismatches(triples):
    # show recovered text vs. reference text for every selection where they differ
    for _, text, text_ref in triples:
        if text.strip() != text_ref.strip():
            print(text)
            print('-------------------------')
            print(text_ref)
            print('=========================')

final_annotations = []
count = 0
for a in parsed_confident_annotations:
    intent, context, snippet = a['intent'], a['context'], a['snippet']
    aa = {
        'post_id': a['post_id'],
        'intent_ref': '\n'.join(unescape(text) for _, text, _ in intent),
        'context_ref': '\n'.join(_preferred_ref(text, text_ref) for _, text, text_ref in context),
        'snippet_ref': '\n'.join(_preferred_ref(text, text_ref) for _, text, text_ref in snippet),
        'intent_text': set(get_code_list((text for text, _, _ in intent), False)),
        'context_text': set(get_code_list(text for text, _, _ in context)),
        'snippet_text': set(get_code_list(text for text, _, _ in snippet)),
    }
    _print_mismatches(snippet)
    _print_mismatches(context)
    # records without any snippet text are only counted, not kept
    if aa['snippet_ref']:
        final_annotations.append(aa)
    else:
        count += 1
count


-------------------------

os.system("some_command with args")

-------------------------
stream = os.popen("some_command with args")

-------------------------
if not seq:

-------------------------
[i for i in old_list]

-------------------------
os.rmdir()

-------------------------
thefile = open('test.txt', 'w')

-------------------------
thefile = open('test.txt', 'w')

-------------------------
some_list[-1]
 import itertool
-------------------------
import itertools
t(itertools.chain.from_iterable(list2d))

-------------------------
list(itertools.chain.from_iterable(list2d))
ort itertools
&g
-------------------------
import itertools
t2d = [[1,2,3],[4,5,6], [7], [8,9]]
&g
-------------------------
list2d = [[1,2,3],[4,5,6], [7], [8,9]]

-------------------------
sorted(dict1, key=dict1.get)

-------------------------
sorted(d.items(), key=lambda x: x[1])

-------------------------
s[::-1]

-------------------------
''.join(reversed('foo'))

-------------------------
if x in l

0

In [21]:
#UW's baseline: extract the only code snippet from the accepted answer
baseline = {}
for post_id in posts:
    if post_id not in accepted_posts:
        continue
    accepted_id = accepted_posts[post_id]
    code_list = list(get_code_list([posts[accepted_id]]))
    # only answers containing exactly one code block are used
    if len(code_list) == 1:
        baseline[post_id] = code_list[0]
# use a context manager so the pickle file is flushed and closed deterministically
with open('baseline.p', 'wb') as baseline_file:
    pickle.dump(baseline, baseline_file)

In [22]:
#whether the code snippet was copied from a python REPL
def from_console(code, prompts=[' >>>', '  >>> ', '>>> ', '... ', '$ ']):
    """Return True when at least one line of `code` starts with one of `prompts`."""
    return any(
        line.startswith(prompt)
        for line in code.split('\n')
        for prompt in prompts
    )

In [23]:
#remove prompt prefixes from code
def console_extract(code, prompts=[' >>>', '  >>> ', '>>> ', '... ', '$ ']):
    """Keep only the lines that start with a prompt, with that prompt removed.

    Lines that match no prompt are dropped. When several prompts match, the
    first one in `prompts` is the one stripped.
    """
    extracted = []
    for line in code.split('\n'):
        matching = [prompt for prompt in prompts if line.startswith(prompt)]
        if matching:
            extracted.append(line[len(matching[0]):])
    return '\n'.join(extracted)

In [24]:
#whether the code snippet was copied from an ipython REPL
def from_ipython(code, patterns=[re.compile(r'In \[\d+\]: '), re.compile(r'In \[\d+\]:')]):
    """Return True when at least one line of `code` begins with an ipython input prompt."""
    return any(
        pattern.match(line)
        for line in code.split('\n')
        for pattern in patterns
    )

In [25]:
#remove ipython prompt prefixes from code
def ipython_extract(code, patterns=[re.compile(r'In \[\d+\]: '), re.compile(r'   \.\.\.\: '), re.compile(r'In \[\d+\]:'), re.compile(r'   \.\.\.\:')]):
    """Keep only the lines that start with an ipython prompt, with the prompt removed.

    Lines matching none of `patterns` are dropped; the first pattern that
    matches decides how much of the line is cut.
    """
    extracted = []
    for line in code.split('\n'):
        hits = (pattern.match(line) for pattern in patterns)
        first_hit = next((hit for hit in hits if hit), None)
        if first_hit is not None:
            extracted.append(line[first_hit.end():])
    return '\n'.join(extracted)

In [26]:
#remove comments from code
def remove_comment(code):
    """Return `code` with the trailing `#` comment cut from every line.

    Each line is tokenized on its own, so a `#` inside a string literal is
    not mistaken for a comment. A line that cannot be tokenized is kept as
    it is, apart from a comment already found before the failure.
    """
    # look the token type up by name: its numeric value differs between Python versions
    from tokenize import COMMENT

    lines = code.split('\n')
    for i, line in enumerate(lines):
        try:
            for tok in generate_tokens(StringIO(line).readline):
                if tok[0] == COMMENT:
                    # tok[2] is the (row, col) start; cut the line at that column
                    lines[i] = line[:tok[2][1]]
        except Exception:
            # best effort: partial statements are expected here; unlike a bare
            # except this lets KeyboardInterrupt/SystemExit through
            pass
    return '\n'.join(lines)

In [27]:
#remove unnecessary indents from code
def remove_indents(code):
    """Strip the common leading indentation from `code` and drop blank lines.

    For example

        if a == b:
            print a

    becomes

    if a == b:
        print a

    If some non-blank line does not start with the indentation of the
    shallowest line, `code` is returned unchanged. Code with no non-blank
    line is returned unchanged too.
    """
    kept = [ln for ln in code.split('\n') if ln.strip()]
    if len(kept) == 0:
        return code
    # tuples compare by width first, so this selects a least-indented line
    width, shallowest = min((len(ln) - len(ln.lstrip()), ln) for ln in kept)
    prefix = shallowest[:width]
    dedented = []
    for ln in kept:
        if not ln.startswith(prefix):
            return code
        dedented.append(ln[width:])
    return '\n'.join(dedented)

In [28]:
#add a pass statement to complete a partial snippet (e.g. an if statement without a body)
def add_pass(code):
    """Append 'pass' when `code`, ignoring trailing whitespace, ends with a colon.

    In that case the trailing whitespace is dropped as well; otherwise
    `code` is returned untouched.
    """
    trimmed = code.rstrip()
    if trimmed.endswith(':'):
        return trimmed + 'pass'
    return code

In [29]:
#normalize the code snippet so that snippets can be compared by exact match
def normalize_code(code):
    """Return a canonical source form of `code`, or None when it cannot be parsed.

    REPL prompts are stripped first (python console, else ipython), then
    comments and common indentation are removed and a dangling block header
    is completed with `pass`. The result is parsed and regenerated with
    to_source. On failure the code and the exception are printed.
    """
    if from_console(code):
        code = console_extract(code)
    elif from_ipython(code):
        code = ipython_extract(code)
    for cleanup in (remove_comment, remove_indents, add_pass):
        code = cleanup(code)
    # hack: parse python3-style print calls
    uses_print_call = 'print(' in code and 'print_function' not in code
    if uses_print_call:
        code = 'from __future__ import print_function\n' + code
    try:
        normalized = to_source(ast.parse(code))
    except Exception as ex:
        print(code)
        print('--------------------')
        print(type(ex))
        print(ex)
        print('====================')
        normalized = None
    return normalized

In [30]:
#normalize the code selection from annotation records
#note: some selections cannot be normalized because the original code has a syntax error,
#so those posts are skipped for now
passed_count = 0
failed_count = 0
falied_list = []
for i, a in enumerate(final_annotations):
    context = normalize_code(a['context_ref'])
    snippet = normalize_code(a['snippet_ref'])
    code = None
    if context is not None and snippet is not None:
        # context followed by snippet must also parse as one piece of code
        code = normalize_code(context + '\n' + snippet)
    if code is not None:
        passed_count += 1
    else:
        print(a['post_id'])
        falied_list.append(a)
        failed_count += 1
print('%s %s' % (failed_count, passed_count))

# one failing annotation excludes every annotation of the same post
filter_out_questions = {a['post_id'] for a in falied_list}
print(filter_out_questions)

final_annotations = [a for a in final_annotations if a['post_id'] not in filter_out_questions]

for a in final_annotations:
    a['context_ref'] = normalize_code(a['context_ref'])
    a['snippet_ref'] = normalize_code(a['snippet_ref'])
    a['intent_ref'] = a['intent_ref'].strip()

# use a context manager so the pickle file is flushed and closed deterministically
with open('annotations.p', 'wb') as annotations_file:
    pickle.dump(final_annotations, annotations_file)

import itertools
 list2d = [[1,2,3],[4,5,6], [7], [8,9]]
--------------------
<type 'exceptions.IndentationError'>
unexpected indent (<unknown>, line 2)
952914
if x in l
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
next(x for x in lst if ...)
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
next((x for x in lst if ...), None)
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
next((i for i, x in enumerate(lst) if [condition on x]), [default value])
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
9542738
{**x, **y}
--------------------
<type 'exceptions.SyntaxError'>
invalid syntax (<unknown>, line 1)
38987
arrays = []
parser = csv.reader(open(your_file), delimiter=' '))
for l in parser: 
    arrays.append(np.array((array.float(i) for i in l)))
--------------------
<type 'exceptions.SyntaxError'>
invalid sy

In [31]:
#generate the meta candidates: for every annotated question collect its title and the
#code snippets found in its three highest-scored answers
final_questions = {}
for a in final_annotations:
    post_id = a['post_id']
    if post_id not in final_questions:
        top_answers = sorted(questions[post_id], key=lambda x: -x[0])[:3]
        # plain unpacking also copes with a question that has no recorded answers
        answers = [answer_id for _, answer_id in top_answers]
        final_questions[post_id] = {
            'intent': titles[post_id],
            'snippet': set(get_code_list(posts[aid] for aid in answers))
        }

for q in final_questions.values():
    normalized_snippet = set()
    for s in q['snippet']:
        ss = normalize_code(s)
        # fall back to the raw snippet when it cannot be normalized
        normalized_snippet.add(ss if ss is not None else s)
    q['snippet'] = normalized_snippet

# NOTE(review): 'questions.p' is also the file the `questions` dict was pickled to
# earlier in this notebook, so this overwrites it - confirm that is intended
with open('questions.p', 'wb') as final_questions_file:
    pickle.dump(final_questions, final_questions_file)

from __future__ import print_function
Command to parse                      isFloat?   Note
------------------------------------  --------   --------------------------------
print(isfloat(""))                    False      Blank string
print(isfloat("127"))                 True       Passed string
print(isfloat(True))                  True       Pure sweet Truth
print(isfloat("True"))                False      Vile contemptible lie
print(isfloat(False))                 True       So false it becomes true
print(isfloat("123.456"))             True       Decimal
print(isfloat("      -127    "))      True       Spaces trimmed
print(isfloat("\t\n12\r\n"))          True       whitespace ignored
print(isfloat("NaN"))                 True       Not a number
print(isfloat("NaNanananaBATMAN"))    False      I am Batman
print(isfloat("-iNF"))                True       Negative infinity
print(isfloat("123.E4"))              True       Exponential notation
print(isfloat(".1"))                  Tru