In [42]:

import pandas as pd
from pathlib import Path
import yaml
import json
import gzip

# Render floats with three decimal places in every DataFrame displayed below.
pd.set_option("display.float_format", "{:.3f}".format)

In [19]:
def gunzip_json(path: Path):
    """Read a gzip-compressed JSON file and return the decoded document.

    Args:
        path: Location of the gzip-compressed JSON file.

    Returns:
        Whatever ``json.load`` produces for the decompressed text.
    """
    # JSON text is UTF-8; without an explicit encoding, text-mode gzip.open
    # falls back to the platform's locale encoding and can mis-decode
    # non-ASCII content on machines whose default is not UTF-8.
    with gzip.open(path, "rt", encoding="utf-8") as f:
        return json.load(f)

class IgnoreUnknownTagLoader(yaml.SafeLoader):
    """SafeLoader subclass whose extra constructor builds a node from its kind only."""

    def ignore_unknown(self, suffix, node):
        """Construct ``node`` as a mapping, sequence or scalar.

        ``suffix`` is supplied by the multi-constructor call but is not
        consulted: only the node's type decides how it is built.
        """
        if isinstance(node, yaml.MappingNode):
            return self.construct_mapping(node)
        if isinstance(node, yaml.SequenceNode):
            return self.construct_sequence(node)
        # Neither a mapping nor a sequence: build it as a scalar.
        return self.construct_scalar(node)


# Register the handler under the empty tag prefix, which every tag starts with.
IgnoreUnknownTagLoader.add_multi_constructor('', IgnoreUnknownTagLoader.ignore_unknown)

def load_graph_yaml(path: Path):
    """Parse the YAML file at ``path`` using ``IgnoreUnknownTagLoader``.

    Args:
        path: Location of the YAML file.

    Returns:
        The Python object ``yaml.load`` builds for the document.
    """
    # Decode as UTF-8 explicitly; a bare open() would use the platform's
    # locale encoding and could mis-read non-ASCII text on some machines.
    with path.open(encoding="utf-8") as f:
        # The loader derives from yaml.SafeLoader, so this is not the
        # unrestricted yaml.load.
        return yaml.load(f, Loader=IgnoreUnknownTagLoader)
    
# Sort the matches: glob order depends on the filesystem, so an unsorted listing
# would make row order (and every .head() preview below) vary between machines.
yaml_data = [
    load_graph_yaml(p)
    for p in sorted(Path("../tagging_clues/tagging_graphs_oct9_trimmed").glob("*.yaml"))
]

In [44]:
# One record per graph edge, keeping its `prompt_to`, `state` and `username` fields.
tagged_prompts = []
for data in yaml_data:
    for edge in data["edges"]:
        record = dict(
            prompt=edge["prompt_to"],
            state=edge["state"],
            username=edge["username"],
        )
        tagged_prompts.append(record)

tagged_prompts_df = pd.DataFrame(tagged_prompts)
tagged_prompts_df.head()


Unnamed: 0,prompt,state,username
0,"def topScores(lst):\n """"""\n if the strin...",neutral,student14
1,"def topScores(lst):\n """"""\n if the first...",fail,student14
2,"def topScores(lst):\n """"""\n For each lis...",neutral,student15
3,"def topScores(lst):\n """"""\n For each lis...",neutral,student15
4,"def topScores(lst):\n """"""\n For each lis...",neutral,student15


In [37]:
def read_multiple_prompts(path: Path):
    """Summarise every ``*.results.json.gz`` file directly inside ``path``.

    Each matching file is decoded with ``gunzip_json`` and contributes one row.

    Args:
        path: Directory to scan (non-recursive).

    Returns:
        A DataFrame with columns ``prompt``, ``username``, ``n`` (length of
        the file's ``results`` list) and ``c`` (how many of those entries
        have ``status == "OK"``). Rows follow sorted file-path order; when no
        file matches, the frame is empty but still has all four columns.
    """
    columns = ["prompt", "username", "n", "c"]
    rows = []
    # Glob order depends on the filesystem; sort for a reproducible row order.
    for p in sorted(path.glob("*.results.json.gz")):
        data = gunzip_json(p)
        outcomes = data["results"]
        rows.append({
            "prompt": data["prompt"],
            "username": data["username"],
            "n": len(outcomes),
            # bool is an int subclass, so summing the comparisons counts matches.
            "c": sum(r["status"] == "OK" for r in outcomes),
        })
    # Naming the columns keeps the schema stable when nothing matched, so a
    # later merge on "prompt"/"username" does not fail with a KeyError.
    return pd.DataFrame(rows, columns=columns)

# Directory scanned by read_multiple_prompts for *.results.json.gz files.
multiple_results_dir = Path("../tmp/multiple")
multiple_prompts_df = read_multiple_prompts(multiple_results_dir)
multiple_prompts_df.head()

Unnamed: 0,prompt,username,n,c
0,"def topScores(lst):\n """"""\n For each lis...",student15,200,0
1,"def remove_odd(lst):\n """"""\n Remove odd ...",student26,200,0
2,"def convert(lst):\n """"""\n takes a list o...",student0,200,0
3,"def total_bill(grocery_list, sales_tax):\n ...",student63,200,0
4,"def sortBySuccessRate(nominations):\n """"""\n...",student64,200,0


In [45]:
# Join the two frames on (prompt, username), then add success_rate = c / n.
prompts_with_scores = tagged_prompts_df.merge(
    multiple_prompts_df, on=["prompt", "username"]
).assign(success_rate=lambda scored: scored["c"] / scored["n"])

In [50]:
# Mean success_rate per state, with `state` as an ordinary column.
prompts_with_scores.groupby("state")["success_rate"].mean().reset_index()

Unnamed: 0,state,success_rate
0,fail,0.083
1,neutral,0.106
2,success,0.546
