In [1]:
import pickle
from tokenize import generate_tokens
from cStringIO import StringIO
from collections import defaultdict
import ast
import astor
from py2_tokenize import tokenize_code
import numpy as np
from scipy.stats.mstats import zscore

In [2]:
def sub_contiguous_snippets(code_snippet):
    """Return the normalized sources of all parseable contiguous token runs.

    The snippet is tokenized, and every contiguous run of its non-empty
    tokens is joined with single spaces and handed to ``ast.parse``. Runs
    that parse are re-emitted with ``astor.to_source``; the regenerated
    source is kept only if ``tokenize_code`` accepts it without raising.

    Args:
        code_snippet: source text to split into candidate sub-snippets.

    Returns:
        A set of regenerated source strings. The set is empty when the
        snippet cannot be tokenized (previously this path returned a list,
        so callers could receive two different container types).
    """
    try:
        tokens = [token for _, token, _, _, _ in generate_tokens(StringIO(code_snippet).readline) if token]
    except Exception:
        # Untokenizable input yields no candidates. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return set()
    sub_snippet_set = set()
    token_count = len(tokens)
    for i in range(token_count):
        for j in range(i + 1, token_count + 1):
            con_tokens = tokens[i:j]
            # A run ending in ':' opens a block with no body; add a
            # placeholder statement so the run has a chance to parse.
            if con_tokens[-1] == ':':
                con_tokens.append('pass')
            sub_snippet = ' '.join(con_tokens)
            try:
                root = ast.parse(sub_snippet)
                cc = astor.to_source(root)
                # Called only for its side effect of raising on source it
                # cannot handle; the return value is not used.
                tokenize_code(cc.encode('utf-8'))
                sub_snippet_set.add(cc)
            except Exception:
                # Best effort: most token runs are not valid programs, so
                # failures are expected and skipped. The loop stays
                # interruptible because KeyboardInterrupt is not caught.
                pass
    return sub_snippet_set

In [3]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file once it is read."""
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point this at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the pre-computed inputs used by the rest of the notebook.
annotations = _load_pickle('annotations.p')
questions = _load_pickle('questions.p')
candidates = _load_pickle('candidates.p')
intents = _load_pickle('intents.p')
baseline = _load_pickle('baseline.p')
bi_likelihood = _load_pickle('bi_likelihood.p')

In [4]:
# Build, for every post in `questions`, a list of (candidate_source, feature)
# pairs derived from the sub-snippets of each of the post's snippets.
features = {}
for post_id, q in questions.items():
    # NOTE(review): `intent` is assigned here but not used anywhere in the
    # visible cells.
    intent = intents[post_id]
    features[post_id] = []
    for s in q['snippet']:
        # Look up the bi_likelihood entry of every parseable sub-snippet of `s`.
        abs_likelihood = {c: bi_likelihood[post_id][c] for c in sub_contiguous_snippets(s)}.items()
        # Skip snippets that yield fewer than two candidates (presumably
        # because a z-score over a single sample is uninformative).
        if len(abs_likelihood) <= 1:
            continue
        # Unzip into parallel tuples: candidate sources and their likelihood entries.
        c, ff = zip(*abs_likelihood)
        # Append z-scores, computed along axis 0 (across this snippet's
        # candidates), to the raw values.
        # NOTE(review): presumably each bi_likelihood entry is a vector, so the
        # stacked array has one row per candidate and zip(c, ff) below pairs each
        # candidate with its row. If the entries were scalars, hstack would give
        # a flat array of length 2*n and the zip would keep only the first n raw
        # values, dropping the z-scores — confirm the entry shape.
        ff = np.hstack([np.array(ff), zscore(np.array(ff), axis=0)])
        features[post_id].extend(zip(c, ff))

In [5]:
def concat(context, snippet):
    """Join `context` and `snippet` with a newline and return the normalized source.

    The joined text is parsed with ``ast.parse`` and regenerated with
    ``astor.to_source``; any error raised by parsing propagates to the caller.
    """
    combined_source = context + '\n' + snippet
    tree = ast.parse(combined_source)
    normalized = astor.to_source(tree)
    return normalized

In [6]:
# Collect, per post, the annotated reference strings that also occur among
# that post's candidates. Three label sets are built: the context alone, the
# snippet alone, and the normalized concatenation of context and snippet.
context_pos = defaultdict(set)
snippet_pos = defaultdict(set)
full_pos = defaultdict(set)
for a in annotations:
    post_id = a['post_id']
    if a['context_ref'] in candidates[post_id]:
        context_pos[post_id].add(a['context_ref'])
    if a['snippet_ref'] in candidates[post_id]:
        snippet_pos[post_id].add(a['snippet_ref'])
    if a['snippet_ref'] == '':
        # Surface annotations with an empty snippet reference for inspection.
        # Parenthesized single-argument form prints the same thing under the
        # Python 2 print statement and the Python 3 print function.
        print(a)
    try:
        full = concat(a['context_ref'], a['snippet_ref'])
    except Exception:
        # Best effort: a context/snippet pair that cannot be combined and
        # parsed contributes no "full" positive. Catching Exception instead of
        # using a bare except keeps KeyboardInterrupt/SystemExit working.
        pass
    else:
        if full in candidates[post_id]:
            full_pos[post_id].add(full)

In [7]:
def generate_x_y(pos_set):
    """Flatten the global `features` mapping into parallel arrays.

    Args:
        pos_set: mapping from post id to the collection of candidate sources
            counted as positive for that post.

    Returns:
        A pair ``(x, y)`` of numpy arrays: ``x`` holds the feature entry of
        every (post, candidate) pair in iteration order of `features`, and
        ``y`` holds whether that candidate is in ``pos_set[post_id]``.
    """
    # One (feature, label) pair per candidate, visiting posts in dict order.
    labelled = [
        (vector, candidate in pos_set[pid])
        for pid in features
        for candidate, vector in features[pid]
    ]
    x = [vector for vector, _ in labelled]
    y = [is_positive for _, is_positive in labelled]
    return np.array(x), np.array(y)

In [8]:
# Features and boolean labels; a candidate is positive when it is in snippet_pos for its post.
snippet_x, snippet_y = generate_x_y(snippet_pos)

In [9]:
# Features and boolean labels; a candidate is positive when it is in context_pos for its post.
context_x, context_y = generate_x_y(context_pos)

In [10]:
# Features and boolean labels; a candidate is positive when it is in full_pos for its post.
full_x, full_y = generate_x_y(full_pos)