© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

# Prepare StaQC

This notebook contains the code to create a snippet collection from the original StaQC dataset:
- We convert the original pickle files into a JSONL format consistent with the codesearch library.
- We filter out code snippets that cannot be parsed.

### First we convert the Python 2 pickle files to a jsonl file
Note that this part requires a Python 2 kernel.

In [1]:
import ntpath
import cPickle as pickle
import traceback
from collections import defaultdict

In [2]:
# Download locations of the four StaQC pickle files, keyed by a short dataset
# name. Judging by the file names, the "-code" entries map ids to code and the
# "-descr" entries map question ids to titles, for multi-snippet and
# single-snippet answers respectively.
DATASET_URLS = {
    "python-multi-code": 'https://github.com/LittleYUYU/StackOverflow-Question-Code-Dataset/raw/master/annotation_tool/data/code_solution_labeled_data/source/python_how_to_do_it_by_classifier_multiple_iid_to_code.pickle',
    "python-multi-descr": "https://github.com/LittleYUYU/StackOverflow-Question-Code-Dataset/raw/master/annotation_tool/data/code_solution_labeled_data/source/python_how_to_do_it_by_classifier_multiple_qid_to_title.pickle",
    "python-single-code": "https://github.com/LittleYUYU/StackOverflow-Question-Code-Dataset/raw/master/annotation_tool/data/code_solution_labeled_data/source/python_how_to_do_it_qid_by_classifier_unlabeled_single_code_answer_qid_to_code.pickle",
    "python-single-descr": 'https://github.com/LittleYUYU/StackOverflow-Question-Code-Dataset/raw/master/annotation_tool/data/code_solution_labeled_data/source/python_how_to_do_it_qid_by_classifier_unlabeled_single_code_answer_qid_to_title.pickle'
}


def pickle_filename(url):
    """Return the local file name for ``url``, i.e. its last path component."""
    _directory, filename = ntpath.split(url)
    return filename

def load_pickle(url):
    """Unpickle the previously downloaded file that corresponds to ``url``.

    The file is looked up in the current working directory under the name
    returned by ``pickle_filename(url)``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a source you trust.
    """
    local_path = pickle_filename(url)
    with open(local_path, "rb") as handle:
        content = pickle.load(handle)
    return content
    

# Maps each short dataset name from DATASET_URLS to its unpickled content.
datasets = {}

for dataset, url in DATASET_URLS.items():
    # -nc (no-clobber): skip the download when the file is already present,
    # which makes this cell cheap to re-run.
    !wget -nc $url
    datasets[dataset] = load_pickle(url)
    

File ‘python_how_to_do_it_by_classifier_multiple_qid_to_title.pickle’ already there; not retrieving.

File ‘python_how_to_do_it_qid_by_classifier_unlabeled_single_code_answer_qid_to_code.pickle’ already there; not retrieving.

File ‘python_how_to_do_it_qid_by_classifier_unlabeled_single_code_answer_qid_to_title.pickle’ already there; not retrieving.

File ‘python_how_to_do_it_by_classifier_multiple_iid_to_code.pickle’ already there; not retrieving.



In [3]:
id2snippets = defaultdict(list)

# Question id -> title, merged over both title datasets; on a clash the
# "single" entry wins because it is applied last.
description_ds = dict(datasets["python-multi-descr"])
description_ds.update(datasets["python-single-descr"])

# One record per code snippet. Keys that are tuples are rendered as
# "<first>_<second>" ids (the first element being the question id); plain keys
# are used as the question id directly.
records = []
for snippet_dsname in ("python-multi-code", "python-single-code"):
    for full_id, code in datasets[snippet_dsname].items():
        is_multi = isinstance(full_id, tuple)
        so_id = full_id[0] if is_multi else full_id
        snippet_id = "%i_%i" % full_id if is_multi else str(full_id)
        records.append({
            "rawDescription": description_ds[so_id],
            "code": code,
            "attribution": "https://stackoverflow.com/questions/%i" % so_id,
            "language": "python",
            "id": snippet_id,
        })

In [4]:
records[-3:]

[{'attribution': 'https://stackoverflow.com/questions/22850218',
  'code': u'words = set(message.split(" "))\nresult = [int(word in words) for word in terms]\n',
  'id': '22850218',
  'language': 'python',
  'rawDescription': u'Create Array of found words in Python'},
 {'attribution': 'https://stackoverflow.com/questions/7602174',
  'code': u'sort input-file.txt | uniq -u -w 3\n',
  'id': '7602174',
  'language': 'python',
  'rawDescription': u'How to only print lines with unique fields?'},
 {'attribution': 'https://stackoverflow.com/questions/10048853',
  'code': u'from Tkinter import *\n\nclass MyDialog:\n\n    def __init__(self, parent):\n\n        top = self.top = Toplevel(parent)\n\n        Label(top, text="Value").pack()\n\n        self.e = Entry(top)\n        self.e.pack(padx=5)\n\n        b = Button(top, text="OK", command=self.ok)\n        b.pack(pady=5)\n\n    def ok(self):\n\n        print "value is", self.e.get()\n\n        self.top.destroy()\n\n\nroot = Tk()\nButton(root, 

In [5]:
import json

staqc_raw = "staqc-py-raw.jsonl"

# Dump the records as JSON lines: one serialized record per line.
with open(staqc_raw, "wb") as f:
    f.writelines(json.dumps(record) + "\n" for record in records)
        

### Now we will clean the code snippets and filter out unparseable code

We check if code is parseable in two steps. First, check which snippets parse in python3. Next, we use a python2 kernel to check if the unparseable snippets do parse in python2. We take the union of the snippets that parse in python2 and python3.

In [1]:
import re
import ast

def maybe_clean_prompt(code):
    """Strip interactive-interpreter prompts from a snippet, if it has any.

    A snippet is treated as an interpreter session when it *starts* with a
    prompt (``>>> ``, ``... `` / ``...:`` or ``In [n]: ``). In that case only
    the lines that carry a prompt are kept, with the prompt removed; all other
    lines (typically the echoed output) are dropped.

    Returns a ``(cleaned, code)`` tuple: ``cleaned`` tells whether the snippet
    was rewritten, ``code`` is the (possibly unchanged) snippet text.
    """
    prompt_patterns = r"^(>>+[ ])|(\.\.\.[ :])|(In[ ]?\[[0-9 ]*\]:[ ])"
    if re.match(prompt_patterns, code) is None:
        return False, code

    kept_lines = []
    for line in code.split("\n"):
        prompt = re.match(prompt_patterns, line)
        if prompt:
            # re.match anchors at position 0, so end() is the prompt length.
            kept_lines.append(line[prompt.end():])
    return True, "\n".join(kept_lines)


def split_parseable(snippets):
    """Split snippets into those whose code parses and those whose code does not.

    Each snippet's ``"code"`` entry is first passed through
    ``maybe_clean_prompt`` and the result is written back to the snippet
    (the input dicts are modified in place). The code is then parsed with
    ``ast.parse`` using the grammar of the running interpreter.

    Args:
        snippets: iterable of dicts with at least a ``"code"`` key.

    Returns:
        A ``(parseable, unparseable)`` tuple of lists that together contain
        every input snippet exactly once, in input order.
    """
    parseable = []
    unparseable = []
    for s in snippets:
        _, code = maybe_clean_prompt(s["code"])
        s["code"] = code
        try:
            ast.parse(code)
        except Exception:
            # Any parser failure marks the snippet as unparseable. Catching
            # Exception rather than using a bare except keeps Ctrl-C
            # (KeyboardInterrupt) and SystemExit working during this long loop.
            unparseable.append(s)
        else:
            parseable.append(s)
    return parseable, unparseable

In [87]:
!head sta

In [5]:
from codesearch.text_preprocessing import clean_how_to

# Preview up to 10 snippets for which both the code (prompt stripping) and the
# description (how-to stripping) are changed by cleaning.
# NOTE: `raw_snippets` is only assigned in the "Python 3" section below
# (load_jsonl(staqc_raw)); that cell has to be run before this one.
i = 0
for s in raw_snippets:
    code = s["code"]
    cleaned, cleaned_code = maybe_clean_prompt(code)
    description = s["rawDescription"]
    cleaned_description = clean_how_to(description)
    if cleaned and description != cleaned_description:
        print(s)
        print(cleaned_description)
        print(cleaned_code)
        i += 1
    if i == 10: break


{'code': ">>> [l[i:i+2] for i in range(0, len(l), 2)]\n[['verb', \n  '\\n\\n1. reading, blah, blah (to read a book with the intent of learning)\\n2. blah blah blah (second definition of study)\\n\\n'], \n ['noun', \n  '\\n\\n1. blah blah blah (the object of ones study)\\n2. yadda yadda yadda (second definition of study)']]\n\n>>> l = [l[i:i+2] for i in range(0, len(l), 2)]\n", 'attribution': 'https://stackoverflow.com/questions/34134408', 'language': 'python', 'rawDescription': 'how to make regex go line by line to match two strings at the same time?', 'id': '34134408_1'}
Make regex go line by line to match two strings at the same time
[l[i:i+2] for i in range(0, len(l), 2)]
l = [l[i:i+2] for i in range(0, len(l), 2)]
{'code': 'In [14]: df.reindex(["Z", "C", "A"])\nOut[14]:\ncompany  Amazon  Apple  Yahoo\nZ             0      0    150\nC           173      0      0\nA             0    130      0\n', 'attribution': 'https://stackoverflow.com/questions/30009948', 'language': 'python', 'r

#### Python 3

In [2]:
from codesearch.data import load_jsonl
# Same file name as the one written by the Python 2 conversion section above.
staqc_raw = "staqc-py-raw.jsonl"
raw_snippets = load_jsonl(staqc_raw)

In [3]:
parseable_py3, unparseable_py3 = split_parseable(raw_snippets)

In [4]:
len(parseable_py3), len(unparseable_py3)

(173011, 99318)

In [5]:
import json

staqc_py3_parseable = "staqc-snippets-py3-parseable.jsonl"
staqc_py3_unparseable = "staqc-snippets-py3-unparseable.jsonl"

# Write both halves of the Python 3 split as JSON lines (one snippet per line).
# The unparseable file is re-checked in the Python 2 section below.
for path, subset in (
    (staqc_py3_parseable, parseable_py3),
    (staqc_py3_unparseable, unparseable_py3),
):
    with open(path, "w") as f:
        f.writelines(json.dumps(r) + "\n" for r in subset)


#### Python 2

In [2]:
import json
staqc_py3_unparseable = "staqc-snippets-py3-unparseable.jsonl"

# Read back the snippets that failed to parse under Python 3. The file is
# opened in binary mode and each line's final character (the newline) is
# dropped before decoding the JSON record.
snippets = []
with open(staqc_py3_unparseable, "rb") as f:
    snippets.extend(json.loads(l[:-1]) for l in f)

len(snippets), snippets[:2]

(99318,
 [{u'attribution': u'https://stackoverflow.com/questions/19612419',
   u'code': u"def rotate(*args):\n    print 'rotate button press...'\n    theta = 90\n    rotated = ndimage.rotate(image, theta)\n    im.set_data(rotated)\n    canvas.draw()\n",
   u'id': u'19612419_0',
   u'language': u'python',
   u'rawDescription': u'updating matplotlib imshow from within a Tkinter gui'},
  {u'attribution': u'https://stackoverflow.com/questions/34773625',
   u'code': u'class Report(object):\n  .\n  .\n  def new_hosts(self):\n      """Return a list of new hosts added in latest scan"""\n      return self.curr_hosts - self.prev_hosts\n',
   u'id': u'34773625_0',
   u'language': u'python',
   u'rawDescription': u'How to print notification to slack by calling a function via python'}])

In [3]:
parseable_py2, unparseable = split_parseable(snippets)

len(parseable_py2), len(unparseable)

(30689, 68629)

In [4]:
staqc_py2_parseable = "staqc-snippets-py2-parseable.jsonl"

# Persist the snippets that parse under Python 2, one JSON record per line.
with open(staqc_py2_parseable, "wb") as f:
    f.writelines(json.dumps(snippet) + "\n" for snippet in parseable_py2)


### Here we can use python3 again

We will merge the parseable snippets and clean the descriptions

In [1]:
from codesearch.data import load_jsonl
staqc_py3_parseable = "staqc-snippets-py3-parseable.jsonl"
staqc_py2_parseable = "staqc-snippets-py2-parseable.jsonl"

# Union of the two parseable sets. The Python 2 file only holds snippets that
# failed under Python 3, so no snippet is loaded twice.
snippets = load_jsonl(staqc_py3_parseable)
snippets.extend(load_jsonl(staqc_py2_parseable))

len(snippets)

203700

In [2]:
from codesearch.text_preprocessing import clean_how_to
import re

# Leading question phrases ("How to ", "How do I ", "Best way to ", "Can you ", ...)
# that are removed from the start of a title.
how_to_pattern = "^([hH]ow to |[hH]ow do ([Ii] |you )|[Hh]ow does one |([tT]he )?[Bb]est way to |([Hh]ow )?[Cc]an (you |[Ii] ))"

def clean_how_to(t):
    """Turn a question title into a short description.

    Removes a leading "how to"-style phrase (see ``how_to_pattern``), drops one
    trailing question mark and upper-cases the first character; the remaining
    characters are left as they are.

    Note: this definition shadows the ``clean_how_to`` imported from
    ``codesearch.text_preprocessing`` at the top of this cell.

    Args:
        t: the raw title string.

    Returns:
        The cleaned title. An input that is empty after stripping
        (e.g. ``"How to ?"``) yields ``""``.
    """
    t = re.sub(how_to_pattern, "", t)
    if t.endswith("?"):
        t = t[:-1]
    # Slicing instead of t[0] so that an empty string does not raise IndexError.
    return t[:1].capitalize() + t[1:]

# Add the cleaned title next to the original one; "rawDescription" is kept.
for s in snippets:
    s["description"] = clean_how_to(s["rawDescription"])

In [3]:
# Print the first 10 titles that were changed by clean_how_to, raw version first.
i = 0
for s in snippets:
    
    if s["description"] != s["rawDescription"]:
        print(s["rawDescription"])
        print(s["description"])
        print()
        i += 1
        if i == 10: break

How can I verify my selfsigned certificate when using easywebdav?
Verify my selfsigned certificate when using easywebdav

How to break down a number with another from list?
Break down a number with another from list

How to do write a Python script that inputs all files from a certain subdirectory from command line?
Do write a Python script that inputs all files from a certain subdirectory from command line

txredisapi subscribe and listen async
Txredisapi subscribe and listen async

how to make regex go line by line to match two strings at the same time?
Make regex go line by line to match two strings at the same time

How to reorder indexed rows based on a list in Pandas data frame
Reorder indexed rows based on a list in Pandas data frame

What hash algorithm does Python's dictionary mapping use?
What hash algorithm does Python's dictionary mapping use

python list comprehension get dictionary by key
Python list comprehension get dictionary by key

How to slice and extend a 2D numpy 

In [4]:
import json

staqc_cleaned = "staqc-py-cleaned.jsonl"

# Final snippet collection: parseable snippets with cleaned descriptions,
# stored as JSON lines.
with open(staqc_cleaned, "w") as f:
    f.writelines(json.dumps(snippet) + "\n" for snippet in snippets)


### Evaluation datasets

Next, we create evaluation datasets from Stack Overflow duplicates.

In [37]:
from codesearch.data import load_train_dataset

duplicates = load_train_dataset("so-duplicates-feb20")
len(duplicates), duplicates[:2]

(195498,
 [{'original': ['73713',
    "How do I check for nulls in an '==' operator overload without infinite recursion?"],
   'duplicates': [['86947',
     'Best way to handle null when writing equals operator'],
    ['58388750',
     "How to compare two objects of different types where one inherits the other's type"],
    ['4219261', 'Overriding == operator. How to compare to null?'],
    ['4867909',
     "When overloading the equality operator, what's the best way to handle null values?"],
    ['1042147', '(C#) Problems when overloading the == operator'],
    ['14428218', 'How can i implement == and check for null in c#'],
    ['9059085', 'C# equality operators override (== and !=)'],
    ['39681387', 'Why Use Value Equality On a Reference Type'],
    ['39877764',
     'Overriding Equals/GetHashCode for class in order to use hashset Contains/ExceptWith/UnionWith'],
    ['52767648', 'How can I ignore an operator overload'],
    ['24762789', 'how to avoid stackoverflow in == overload'

In [62]:
# id2orig:    question id -> id of the original question of its duplicate group
#             (an original maps to itself).
# orig2dupls: original question id -> its list of [duplicate id, title] entries.
id2orig = {}
orig2dupls = {}
for r in duplicates:
    orig_id = r["original"][0]
    orig2dupls[orig_id] = r["duplicates"]
    # Duplicates are registered before the original, as before. Using update()
    # with a generator avoids a loop variable named `id`, which would shadow
    # the builtin id() for the rest of the kernel session.
    id2orig.update((dupl[0], orig_id) for dupl in r["duplicates"])
    id2orig[orig_id] = orig_id

len(id2orig), list(id2orig.items())[:2], len(orig2dupls), list(orig2dupls.items())[:2]

(737274,
 [('86947', '73713'), ('58388750', '73713')],
 195498,
 [('73713',
   [['86947', 'Best way to handle null when writing equals operator'],
    ['58388750',
     "How to compare two objects of different types where one inherits the other's type"],
    ['4219261', 'Overriding == operator. How to compare to null?'],
    ['4867909',
     "When overloading the equality operator, what's the best way to handle null values?"],
    ['1042147', '(C#) Problems when overloading the == operator'],
    ['14428218', 'How can i implement == and check for null in c#'],
    ['9059085', 'C# equality operators override (== and !=)'],
    ['39681387', 'Why Use Value Equality On a Reference Type'],
    ['39877764',
     'Overriding Equals/GetHashCode for class in order to use hashset Contains/ExceptWith/UnionWith'],
    ['52767648', 'How can I ignore an operator overload'],
    ['24762789', 'how to avoid stackoverflow in == overload'],
    ['26775303', 'Operator overloading giving error'],
    ['104

In [38]:
# defaultdict is not imported by any earlier cell of this Python 3 kernel
# section, so import it here to keep "Restart & Run All" working.
from collections import defaultdict

# Question id (the part of the snippet id before "_") -> ids of its snippets.
id2snippets = defaultdict(list)
for s in snippets:
    so_id = s["id"].split("_")[0]
    id2snippets[so_id].append(s["id"])
len(id2snippets)

128550

In [61]:
from itertools import groupby
from operator import itemgetter

# (question id, original id) for every question that has snippets and is part
# of a duplicate group, ordered by original id so that groupby() sees each
# group as one contiguous run.
ids_in_duplicates = sorted(
    ((so_id, id2orig[so_id]) for so_id in id2snippets if so_id in id2orig),
    key=itemgetter(1),
)
groups = groupby(ids_in_duplicates, key=itemgetter(1))
# Keep only the question ids of each group.
ids_by_duplicate_group = [[so_id for so_id, _orig in members] for _key, members in groups]
len(ids_in_duplicates), len(ids_by_duplicate_group), ids_by_duplicate_group[:10]

(10261,
 6282,
 [['18126552', '19463598', '100003', '17801344'],
  ['16881955'],
  ['10012534', '16496733'],
  ['10012788'],
  ['1001538'],
  ['1001634'],
  ['10017086'],
  ['10017147'],
  ['36220375', '100210', '4557577', '4541629'],
  ['10021749']])

In [70]:
# `random` is used for the seeded shuffle below, but no earlier cell shown in
# this notebook imports it; import it here so the cell runs on a fresh kernel.
import random

eval_data = []

# For every duplicate group that has snippets, use as query the title of the
# first duplicate question that has no snippets in the collection. The
# relevant results are all snippets of the group's questions.
for ids_duplicate_group in ids_by_duplicate_group:
    orig_ids = set(id2orig[so_id] for so_id in ids_duplicate_group)
    assert(len(orig_ids) == 1)
    orig_id = orig_ids.pop()
    all_duplicates = orig2dupls[orig_id]
    query = None
    for dupl_id, description in all_duplicates:
        if dupl_id not in ids_duplicate_group:
            query = description
            break
    # Groups without a usable query are skipped.
    if query:
        relevant_ids = [snippet_id for so_id in ids_duplicate_group for snippet_id in id2snippets[so_id]]
        eval_record = {"query": query, "relevant_ids": relevant_ids}
        eval_data.append(eval_record)


# Fixed seed so that the shuffled order (and the later test/valid split) is
# reproducible.
random.seed(123)
random.shuffle(eval_data)
len(eval_data), len(eval_data)/2, eval_data[:30]

(5497,
 2748.5,
 [{'query': 'Django - Switch language setting for template rendering',
   'relevant_ids': ['5258715']},
  {'query': 'How to extract all fields minus a couple from a dictionary?',
   'relevant_ids': ['8717395_1', '8717395_0']},
  {'query': 'Extra backslash gets added when reading a path',
   'relevant_ids': ['26903155_1', '26903155_0']},
  {'query': 'How do you generate all possible permutations in python?',
   'relevant_ids': ['11463237']},
  {'query': 'Pandas Column names to list - correct method',
   'relevant_ids': ['19482970']},
  {'query': 'Python - connect to S3 with profile only?',
   'relevant_ids': ['33378422_0', '33378422_1']},
  {'query': 'How to mix asyncio code in with blocking code?',
   'relevant_ids': ['28492103']},
  {'query': 'How to calculated R2 and R2 adjusted via poly_fit numpy, pandas',
   'relevant_ids': ['893657_1', '893657_0']},
  {'query': 'Python sort using key and lambda, what does lambda do?',
   'relevant_ids': ['26877806']},
  {'query': '

In [72]:
# Sanity check: every relevant id must refer to a snippet in the collection,
# so this cell should not print anything.
snippet_ids = {s["id"] for s in snippets}

missing_ids = [sid for r in eval_data for sid in r["relevant_ids"] if sid not in snippet_ids]
for sid in missing_ids:
    print(sid)



In [73]:
eval_data_raw = 'staqc-py-eval-raw.jsonl'

# Store the shuffled evaluation records as JSON lines.
with open(eval_data_raw, "w") as f:
    f.writelines(json.dumps(r) + "\n" for r in eval_data)

In [74]:
test_data_raw = 'staqc-py-test-raw.jsonl'
valid_data_raw = 'staqc-py-valid-raw.jsonl'

# Split the shuffled file into a test part (first 2749 lines) and a validation
# part (last 2748 lines). The counts are hard-coded for the 5497 records
# reported for eval_data above (2749 + 2748 = 5497) and must be adapted if
# that number changes.
!head -n 2749 $eval_data_raw > $test_data_raw
!tail -n 2748 $eval_data_raw > $valid_data_raw

### TODO: Manually clean the test data

In [None]:
from IPython.display import Markdown
from IPython.display import clear_output

def display_query_snippet(idx, total, query, snippet):
    """Render a query next to one candidate snippet as a Markdown object.

    Args:
        idx: position of the query, shown as ``idx/total``.
        total: total number of queries.
        query: the query text.
        snippet: dict with at least ``"description"`` and ``"code"`` keys.

    Returns:
        An ``IPython.display.Markdown`` object showing the progress counter,
        the query, the snippet description (newlines shown as a literal
        ``\\n``) and the snippet code in a Python code fence.
    """
    # The previously unused local named `id` (which shadowed the builtin) has
    # been removed.
    descr = snippet["description"].replace('\n', '\\n')
    code = snippet["code"]
    mkdown = f"{idx}/{total}<br>**query**: {query}\n\n**description**: {descr}\n```python\n{code}\n\n\n\n```"
    return Markdown(mkdown)


In [None]:
def filter_eval_data(eval_data, queries_to_skip, shuffle=True, snippets_by_id=None):
    """Interactively confirm which snippets are relevant for each query.

    For every record whose query is not in ``queries_to_skip``, each candidate
    snippet is displayed and the user is asked whether it is relevant:
    ``y``/``Y``/``1`` keeps it, ``n``/``N``/``0`` drops it, ``q``/``Q`` stops
    the generator (the record being labelled at that moment is not yielded),
    and any other input asks again.

    Args:
        eval_data: list of dicts with ``"query"`` and ``"relevant_ids"`` keys.
        queries_to_skip: container of query strings that are already done.
        shuffle: currently unused; kept so existing calls keep working.
        snippets_by_id: optional mapping from snippet id to snippet dict.
            When omitted, it is built from the notebook-level ``snippets`` list.

    Yields:
        A copy of each record that has at least one confirmed snippet, with
        ``"relevant_ids"`` reduced to the confirmed ids.
    """
    if snippets_by_id is None:
        # The notebook-level `id2snippets` maps question ids to lists of
        # snippet ids, so it cannot be used to look up the snippet to display;
        # index the snippet dicts by their own id instead.
        snippets_by_id = {s["id"]: s for s in snippets}
    total = len(eval_data)
    for i, r in enumerate(eval_data):
        if r["query"] in queries_to_skip:
            continue
        relevant_ids = []
        for relevant_id in r["relevant_ids"]:
            is_relevant = None
            # Keep asking until the answer is recognised.
            while is_relevant is None:
                clear_output(wait=True)
                display(display_query_snippet(i + 1, total, r["query"], snippets_by_id[relevant_id]))
                input_str = input("relevant y/n: ")
                if input_str in ["y", "Y", "1"]:
                    is_relevant = True
                elif input_str in ["n", "N", "0"]:
                    is_relevant = False
                elif input_str in ["q", "Q"]:
                    return
            if is_relevant:
                relevant_ids.append(relevant_id)
        if relevant_ids:
            record = dict(r)
            record["relevant_ids"] = relevant_ids
            yield record


In [None]:
from pathlib import Path
import os
from codesearch.data import load_jsonl

test_data_raw_ = load_jsonl(test_data_raw)
test_data_filtered = 'staqc-py-test-cleaned.jsonl'

# Queries that are already labelled and can be skipped. Defaults to "none" so
# that the name is also defined on the very first run, when the output file
# does not exist yet.
queries = set()
if Path(test_data_filtered).exists():
    overwrite = input(f"{test_data_filtered} already exist, do you want to overwrite the file y/n:")
    if overwrite in ["y", "Y"]:
        # Start from scratch.
        os.remove(test_data_filtered)
    else:
        # Resume: skip every query that is already in the output file.
        records = load_jsonl(test_data_filtered)
        queries = set(r["query"] for r in records)

# Append mode, so a resumed session adds to the records labelled earlier.
with open(test_data_filtered, "a") as f:
    for r in filter_eval_data(test_data_raw_, queries):
        f.write(json.dumps(r))
        f.write("\n")

In [18]:
from codesearch.data import load_eval_dataset, load_snippet_collection

# Spot check: look up one specific query in the validation set and print the
# descriptions of its relevant snippets and the code of the first one.
id2snippet = {r["id"]:r for r in load_snippet_collection("staqc-py-cleaned")}
_, query2id = load_eval_dataset("staqc-py-raw-valid")
i = 0
for q in query2id:
    # Skip everything except the query under inspection.
    if q != "How much space does my python program use?": continue
    ids = query2id[q]
    print(q)
    print([id2snippet[id]["description"] for id in ids])
    print([id2snippet[id]["code"] for id in ids][0])

    i += 1
    if i == 1: break

How much space does my python program use?
['Total memory used by Python process', 'Total memory used by Python process']
import os
_proc_status = '/proc/%d/status' % os.getpid()

_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
          'KB': 1024.0, 'MB': 1024.0*1024.0}

def _VmB(VmKey):
    '''Private.
    '''
    global _proc_status, _scale
     # get pseudo file  /proc/<pid>/status
    try:
        t = open(_proc_status)
        v = t.read()
        t.close()
    except:
        return 0.0  # non-Linux?
     # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
    i = v.index(VmKey)
    v = v[i:].split(None, 3)  # whitespace
    if len(v) < 3:
        return 0.0  # invalid format?
     # convert Vm value to bytes
    return float(v[1]) * _scale[v[2]]


def memory(since=0.0):
    '''Return memory usage in bytes.
    '''
    return _VmB('VmSize:') - since


def resident(since=0.0):
    '''Return resident memory usage in bytes.
    '''
    return _VmB('VmRSS:') - since


def stacksize(since=0