# BT4Py Type4Py Top-1

In [5]:
# Enable the autoreload extension in mode 2, so imported modules are reloaded
# before each cell runs and edits to project code take effect without a restart.
%load_ext autoreload
%autoreload 2

# Show the current directory, then switch to the experiments folder.
# NOTE(review): the absolute path is machine-specific; presumably the cwd matters
# for the `scripts.*` imports and relative artifact paths used later — confirm.
%pwd
%cd /home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/experiments

/home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/experiments


In [6]:
import polars as pl

# Display settings for printed polars tables: longer string cells (300 characters)
# and more rows (50) than the defaults.
pl.Config.set_fmt_str_lengths(300)
pl.Config.set_tbl_rows(n=50)

polars.config.Config

In [7]:
import pathlib

from scripts.common.schemas import TypeCollectionCategory
from scripts.infer.structure import DatasetFolderStructure

# Name of the inference tool under evaluation; it is used to locate the stored
# artifacts and as the prefix of every log line.
tool = "type4pyN1"

# Root folder of the dataset on the local machine.
dataset_root = pathlib.Path(
    "/home/benji/Documents/Uni/heidelberg/05/masterarbeit/datasets/better-types-4-py-dataset"
)
dataset = DatasetFolderStructure(dataset_root)


In [8]:
import logging
from importlib import reload

# Reset the logging module so that re-running this cell starts from a clean state.
logging.shutdown()
reload(logging)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Iterate over a snapshot: removeHandler() mutates logger.handlers, and removing
# entries while iterating the live list would skip every other handler.
for handler in list(logger.handlers):
    logger.removeHandler(handler)

# Single stream handler; every message is prefixed with the evaluated tool's name.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

ch.setFormatter(logging.Formatter(f"[{tool} @ %(levelname)s]: %(message)s"))
logger.addHandler(ch)

logger.info("Hello World!")


[type4pyN1 @ INFO]: Hello World!


# Prediction Metrics for Full Accuracy

In [2]:
# Constants
# Common Type Names
from typet5.model import ModelWrapper
# Only the common type names are needed from the pretrained TypeT5 wrapper, so the
# wrapper is never bound to a name and can be reclaimed once the attribute is read.
common_names = ModelWrapper.load_from_hub("MrVPlusOne/TypeT5-v7").common_type_names

2023-06-26 11:00:57.436176: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [3]:
from scripts.common.schemas import TypeCollectionCategory

import pprint

# Our own analysis scores more data points than these models actually predict for,
# so reuse TypeT5's metric implementation instead.
def typet5_metrics_4_type4py(task: TypeCollectionCategory | str) -> None:
    """Score the stored Type4Py predictions with TypeT5's accuracy metrics.

    For every test-set project that has an inference artifact on disk, the stored
    per-file predictions are parsed with TypeT5's ``Type4PyResponseParser`` and the
    project itself is parsed to obtain the label signatures. The accuracies for
    each of ``AccuracyMetric.default_metrics`` are then printed through
    ``accs_as_table_row``.

    Relies on the notebook globals ``dataset``, ``tool``, ``logger`` and
    ``common_names``.

    Args:
        task: Forwarded to ``InferenceArtifactIO`` to locate the artifact to read;
            either a ``TypeCollectionCategory`` or a plain string such as ``"all"``.

    Returns:
        None. Results are logged and printed.
    """
    from importlib import reload

    import tqdm
    from libcst import helpers as h

    from scripts.common.output import InferenceArtifactIO
    from typet5.experiments import type4py
    from typet5.experiments.typet5 import accs_as_table_row
    from typet5.static_analysis import (
        AccuracyMetric,
        PythonProject,
        SignatureErrorAnalysis,
        SignatureMap,
    )

    # Pick up local edits to the Type4Py helpers without restarting the kernel.
    reload(type4py)

    # Keep only the test-set projects whose artifact exists on disk.
    existing = {}
    for project in dataset.test_set():
        artifact = InferenceArtifactIO(
            artifact_root=pathlib.Path(),
            dataset=dataset,
            repository=project,
            tool_name=tool,
            task=task,
        )
        if artifact.full_location().exists():
            existing[project] = artifact

    projects = []
    # One (project name, module name, parser outcome) triple per predicted file.
    # Storing the names next to the outcome keeps every prediction attributed to
    # the project it came from; pairing the outcomes positionally with
    # `project.modules` is wrong whenever the artifact's files and the parsed
    # modules differ in number or order.
    assignments = []

    for project, artifact in tqdm.tqdm(
        existing.items(), desc=f"Loading labels and predictions from {task}"
    ):
        (type4py_predictions,) = artifact.read()

        parsed_project = PythonProject.parse_from_root(project)
        projects.append(parsed_project)

        for file, predictions in type4py_predictions.items():
            modpkg = h.calculate_module_and_package(repo_root=project, filename=project / file)
            parser = type4py.Type4PyResponseParser(modpkg.name)
            assignments.append(
                (parsed_project.name, modpkg.name, parser.parse({"response": predictions}))
            )

    name2project = {p.name: p for p in projects}

    label_signatures: dict[str, SignatureMap] = {
        project.name: {e.path: e.get_signature() for e in project.all_elems()}
        for project in projects
    }
    pred_signatures: dict[str, SignatureMap] = {n: dict() for n in label_signatures}

    for pname, mname, outcome in assignments:
        if isinstance(outcome, str):
            # A string outcome is an error message rather than a signature map.
            modules = name2project[pname].modules
            module = modules[mname] if mname in modules else None
            # Only warn for non-empty modules; a module that cannot be looked up
            # under this name is reported as well rather than silently dropped.
            if module is None or list(module.all_elements()):
                logger.warning(f"In project {pname} module {mname}, Type4Py errored: {outcome}")
        else:
            pred_signatures[pname].update(outcome)

    eval_result = type4py.Type4PyEvalResult(
        pred_maps=pred_signatures,
        label_maps=label_signatures,
    )

    metrics = AccuracyMetric.default_metrics(common_type_names=common_names)

    n_annots = sum(e.get_signature().n_annots() for p in projects for e in p.all_elems())
    n_labels = sum(e.n_annotated() for lm in eval_result.label_maps.values() for e in lm.values())

    logger.info(f"n_annots: {n_annots}, n_labels: {n_labels}")
    # Guard against an empty evaluation set (no artifacts found for this task).
    ratio = n_labels / n_annots if n_annots else float("nan")
    logger.info(f"Ratio: {ratio}")

    accs = {
        m.name: SignatureErrorAnalysis(
            eval_result.pred_maps,
            eval_result.label_maps,
            m,
            error_on_mismatched_signature=False,
        ).accuracies
        for m in metrics
    }
    accs_as_table_row(accs)

In [22]:
# Run the evaluation for the VARIABLE category.
# NOTE(review): the recorded accuracies are identical to the task="all" run further
# down — confirm that the artifact read here is really task-specific.
typet5_metrics_4_type4py(task=TypeCollectionCategory.VARIABLE)

Loading labels and predictions from VARIABLE: 50it [04:36,  5.54s/it]
[type4pyN1 @ INFO]: n_annots: 30070, n_labels: 16520
[type4pyN1 @ INFO]: Ratio: 0.5493847688726305


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
42.48 & 46.62 & 48.49 & 29.27 & 48.79
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
60.78 & 63.01 & 64.80 & 45.48 & 61.35
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
1.20 & 19.04 & 20.67 & 5.13 & 22.09


In [23]:
# Run the evaluation for the CALLABLE_RETURN category.
# NOTE(review): the recorded accuracies are identical to the CALLABLE_PARAMETER run
# — confirm that the artifact read here is really task-specific.
typet5_metrics_4_type4py(task=TypeCollectionCategory.CALLABLE_RETURN)

Loading labels and predictions from CALLABLE_RETURN: 50it [04:47,  5.75s/it]
[type4pyN1 @ INFO]: n_annots: 30070, n_labels: 16520
[type4pyN1 @ INFO]: Ratio: 0.5493847688726305


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
42.48 & 46.69 & 48.59 & 29.26 & 48.87
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
60.79 & 63.14 & 64.95 & 45.62 & 61.49
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
1.20 & 19.03 & 20.69 & 5.05 & 22.08


In [9]:
# Run the evaluation for the CALLABLE_PARAMETER category.
# NOTE(review): the recorded accuracies are identical to the CALLABLE_RETURN run
# — confirm that the artifact read here is really task-specific.
typet5_metrics_4_type4py(task=TypeCollectionCategory.CALLABLE_PARAMETER)

Loading labels and predictions from CALLABLE_PARAMETER: 50it [05:06,  6.13s/it]
[type4pyN1 @ INFO]: n_annots: 30070, n_labels: 16520
[type4pyN1 @ INFO]: Ratio: 0.5493847688726305


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
42.48 & 46.69 & 48.59 & 29.26 & 48.87
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
60.79 & 63.14 & 64.95 & 45.62 & 61.49
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
1.20 & 19.03 & 20.69 & 5.05 & 22.08


In [11]:
# Run the evaluation on the artifact stored under the task name "all".
# NOTE(review): the recorded accuracies are identical to the VARIABLE run — confirm
# which predictions this artifact actually contains.
typet5_metrics_4_type4py(task="all")

Loading labels and predictions from all: 50it [03:41,  4.44s/it]
[type4pyN1 @ INFO]: n_annots: 30070, n_labels: 16520
[type4pyN1 @ INFO]: Ratio: 0.5493847688726305


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
42.48 & 46.62 & 48.49 & 29.27 & 48.79
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
60.78 & 63.01 & 64.80 & 45.48 & 61.35
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
1.20 & 19.04 & 20.67 & 5.13 & 22.09
