# BT4Py TT5 Top-1

In [1]:
%load_ext autoreload
%autoreload 2

%pwd
%cd /home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/experiments

/home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/experiments


In [2]:
import polars as pl

pl.Config.set_fmt_str_lengths(300)
pl.Config.set_tbl_rows(n=50)

polars.config.Config

In [3]:
import pathlib

from scripts.common.schemas import TypeCollectionCategory
from scripts.infer.structure import DatasetFolderStructure

tool = "TypeT5TopN1"
dataset = DatasetFolderStructure(pathlib.Path(
    "/home/benji/Documents/Uni/heidelberg/05/masterarbeit/datasets/better-types-4-py-dataset"
))


2023-06-26 16:38:53.113191: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#  Prediction Metrics for Full Accuracy

In [8]:
# Because our analysis reviews more datapoints than these models actually regard, reuse TypeT5 metrics instead
import tqdm

from typet5.static_analysis import PythonProject, SignatureMap, AccuracyMetric, SignatureErrorAnalysis
from typet5.experiments.typet5 import accs_as_table_row
from typet5.visualization import pretty_print_dict

from scripts.common.output import InferenceArtifactIO

test_set = dataset.test_set()

projects = dict()
for project in (pbar := tqdm.tqdm(test_set, desc=f"Loading ground truths")):
    pbar.set_postfix({"project": str(project)})
    projects[project.name] = PythonProject.parse_from_root(project)

Loading ground truths: 100%|█| 50/50 [01:39<00:00,  1.99s/it, project=/home/benj


In [11]:
# Constants
# Common Type Names
from typet5.model import ModelWrapper
model = ModelWrapper.load_from_hub("MrVPlusOne/TypeT5-v7")
common_names = model.common_type_names
del model

from scripts.common.schemas import TypeCollectionCategory

# Because our analysis reviews more datapoints than these models actually regard, reuse TypeT5 metrics instead
from scripts.common.output import InferenceArtifactIO


def type_t5_metrics(task: TypeCollectionCategory) -> None:
    assignments = dict()

    for project in (pbar := tqdm.tqdm(test_set, desc=f"Loading predictions for {task}")):
        artifact = InferenceArtifactIO(
            artifact_root=pathlib.Path(),
            dataset=dataset,
            repository=project,
            tool_name=tool,
            task=task
        )
        pbar.set_postfix({"project": artifact.relative_location()})
        
        (tt5_predictions,) = artifact.read()
        assignments[project.name] = {key: pred[0] for key, pred in tt5_predictions.items()}

                
    label_signatures: dict[str, SignatureMap] = {
        project_name: {e.path: e.get_signature() for e in labels.all_elems()}
        for project_name, labels in projects.items()
    }
    # pred_signatures: dict[str, SignatureMap] = {n: dict() for n in label_signatures}
        
    pred_signatures = assignments 
    
    metrics = AccuracyMetric.default_metrics(common_type_names=common_names)
    # acc_metric = AccuracyMetric(common_type_names=ubiq_names)

    n_annots = sum([e.get_signature().n_annots() for _, p in projects.items() for e in p.all_elems()])
    n_labels = sum([e.n_annotated() for lm in label_signatures.values() for e in lm.values()])

    logger.info(f"n_annots: {n_annots}, n_labels: {n_labels}")
    logger.info(f"Ratio: {n_labels / n_annots}")

    accs = {
        m.name: SignatureErrorAnalysis(
            pred_signatures,
            label_signatures,
            m,
            error_on_mismatched_signature=False,
        ).accuracies
        for m in metrics
    }
    accs_as_table_row(accs)
    pretty_print_dict(accs)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [12]:
type_t5_metrics(task=TypeCollectionCategory.CALLABLE_RETURN)

Loading predictions for CALLABLE_RETURN: 100%|█| 50/50 [00:00<00:00, 175.70it/s,
[TypeT5TopN1 @ INFO]: n_annots: 30070, n_labels: 16520
[TypeT5TopN1 @ INFO]: Ratio: 0.5493847688726305


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
91.79 & 91.54 & 93.89 & 79.04 & 94.12
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
94.29 & 94.26 & 96.38 & 83.42 & 95.23
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
87.10 & 87.85 & 90.57 & 72.60 & 92.24
full_acc:
   full_acc: 91.79% (count=14.4k)
   full_acc_by_cat:
      FuncArg: 100.00% (count=8.4k)
      FuncReturn: 80.34% (count=6.0k)
   full_acc_by_simple:
      complex: 81.26% (count=2.6k)
      simple: 94.11% (count=11.8k)
   full_acc_label_size: 1.3824
   full_acc_pred_size: 1.3523
   full_acc_ignored_labels: 0
   n_skipped_types: 2105
full_acc_common:
   full_acc_common: 94.29% (count=9.4k)
   full_acc_common_by_cat:
      FuncArg: 94.25% (count=5.7k)
      FuncReturn: 94.36% (count=3.7k)
   full_acc_common

In [13]:
type_t5_metrics(task=TypeCollectionCategory.CALLABLE_PARAMETER)

Loading predictions for CALLABLE_PARAMETER: 100%|█| 50/50 [00:00<00:00, 173.40it
[TypeT5TopN1 @ INFO]: n_annots: 30070, n_labels: 16520
[TypeT5TopN1 @ INFO]: Ratio: 0.5493847688726305


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
79.14 & 80.54 & 82.50 & 69.32 & 83.64
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
82.56 & 84.24 & 86.76 & 70.15 & 87.46
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
73.33 & 75.64 & 76.92 & 68.17 & 77.40
full_acc:
   full_acc: 79.14% (count=12.3k)
   full_acc_by_cat:
      FuncArg: 69.34% (count=8.4k)
      FuncReturn: 100.00% (count=3.9k)
   full_acc_by_simple:
      complex: 66.99% (count=2.2k)
      simple: 81.74% (count=10.2k)
   full_acc_label_size: 1.391
   full_acc_pred_size: 1.3466
   full_acc_ignored_labels: 0
   n_skipped_types: 4183
full_acc_common:
   full_acc_common: 82.56% (count=7.8k)
   full_acc_common_by_cat:
      FuncArg: 82.55% (count=5.5k)
      FuncReturn: 82.59% (count=2.3k)
   full_acc_common_

In [14]:
type_t5_metrics(task=TypeCollectionCategory.VARIABLE)

Loading predictions for VARIABLE: 100%|█| 50/50 [00:02<00:00, 19.92it/s, project
[TypeT5TopN1 @ INFO]: n_annots: 30070, n_labels: 16520
[TypeT5TopN1 @ INFO]: Ratio: 0.5493847688726305


Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
90.39 & 92.48 & 93.83 & 83.56 & 94.11
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
93.37 & 96.36 & 98.15 & 85.36 & 95.01
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
86.90 & 88.37 & 89.35 & 81.41 & 92.92
full_acc:
   full_acc: 90.39% (count=3.5k)
   full_acc_by_cat:
      FuncArg: 100.00% (count=2.2k)
      FuncReturn: 100.00% (count=496)
      ClassAtribute: 57.25% (count=683)
      GlobalVar: 57.27% (count=110)
   full_acc_by_simple:
      complex: 78.74% (count=649)
      simple: 93.01% (count=2.9k)
   full_acc_label_size: 1.4024
   full_acc_pred_size: 1.3341
   full_acc_ignored_labels: 0
   n_skipped_types: 12994
full_acc_common:
   full_acc_common: 93.37% (count=1.9k)
   full_acc_common_by_cat:
      FuncArg: 91

In [40]:
task=TypeCollectionCategory.CALLABLE_RETURN
project = next(iter(test_set))

artifact = InferenceArtifactIO(
    artifact_root=pathlib.Path("/home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/typet5-evaluation/typet5topn1"),
    dataset=dataset,
    repository=project,
    tool_name=tool,
    task=task
)
tt5_eval, = artifact.read()
#print(tt5_eval.predictions[0])

In [41]:
for metric in AccuracyMetric.default_metrics(common_names):
    accs = tt5_eval.error_analysis(None, metric).accuracies
    accs_as_table_row(accs)
    pretty_print_dict({metric.name: accs})

Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
N/A & N/A & N/A & N/A & N/A
Accuracies on common types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
N/A & N/A & N/A & N/A & N/A
Accuracies on rare types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
N/A & N/A & N/A & N/A & N/A
full_acc:
   full_acc: 61.40% (count=171)
   full_acc_by_cat:
      FuncArg: 54.55% (count=99)
      FuncReturn: 69.70% (count=66)
      ClassAtribute: 66.67% (count=3)
      GlobalVar: 100.00% (count=3)
   full_acc_by_simple:
      complex: 53.85% (count=26)
      simple: 62.76% (count=145)
   full_acc_label_size: 1.4035
   full_acc_pred_size: 1.2865
   full_acc_ignored_labels: 0
Accuracies on all types:
header:  ['full.all', 'calibrated.all', 'calibrated.simple', 'calibrated.complex', 'base.all']
N/A & N/A & N/A & N/A & N/A
Accuracies on common

In [42]:
# Constants
# Common Type Names
from typet5.model import ModelWrapper
model = ModelWrapper.load_from_hub("MrVPlusOne/TypeT5-v7")
common_names = model.common_type_names
del model

from scripts.common.schemas import TypeCollectionCategory

# Because our analysis reviews more datapoints than these models actually regard, reuse TypeT5 metrics instead
from scripts.common.output import InferenceArtifactIO


def type_t5_metrics(task: str) -> None:
    from typet5.function_decoding import EvalResult
    assignments = EvalResult(project_roots=list(), predictions=list(), label_maps=list())

    for project in (pbar := tqdm.tqdm(test_set, desc=f"Loading predictions for {task}")):
        artifact = InferenceArtifactIO(
            artifact_root=pathlib.Path("/home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/typet5-evaluation/typet5topn1"),
            dataset=dataset,
            repository=project,
            tool_name=tool,
            task=task
        )
        pbar.set_postfix({"project": artifact.full_location()})
        
        tt5_predictions: EvalResult
        tt5_predictions, = artifact.read()
        
        assignments.project_roots.extend(tt5_predictions.project_roots)
        assignments.predictions.extend(tt5_predictions.predictions)
        assignments.label_maps.extend(tt5_predictions.label_maps)
    
    metrics = AccuracyMetric.default_metrics(common_names)
    for metric in metrics:
        accs = assignments.error_analysis(None, metric).accuracies
        # accs_as_table_row(accs)
        pretty_print_dict({metric.name: accs})

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [43]:
import tqdm
type_t5_metrics(task=TypeCollectionCategory.CALLABLE_RETURN)

Loading predictions for CALLABLE_RETURN: 100%|█| 50/50 [00:03<00:00, 14.86it/s, project=/home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/typet5-evaluation/typet5topn1/BetterTypes4Py/flopp__GpxTrackPoster/TypeT5TopN1/CALL


full_acc:
   full_acc: 71.40% (count=16.5k)
   full_acc_by_cat:
      FuncArg: 69.29% (count=8.4k)
      FuncReturn: 79.16% (count=6.0k)
      ClassAtribute: 57.58% (count=2.0k)
      GlobalVar: 59.29% (count=113)
   full_acc_by_simple:
      complex: 48.12% (count=2.8k)
      simple: 76.19% (count=13.7k)
   full_acc_label_size: 1.4214
   full_acc_pred_size: 1.314
   full_acc_ignored_labels: 0
full_acc_common:
   full_acc_common: 78.46% (count=10.7k)
   full_acc_common_by_cat:
      FuncArg: 78.11% (count=5.7k)
      FuncReturn: 79.22% (count=3.7k)
      ClassAtribute: 77.16% (count=1.1k)
      GlobalVar: 87.14% (count=70)
   full_acc_common_by_simple:
      complex: 52.32% (count=1.7k)
      simple: 83.27% (count=9.0k)
   full_acc_common_label_size: 1.3744
   full_acc_common_pred_size: 1.2879
   full_acc_common_ignored_labels: 5867
full_acc_rare:
   full_acc_rare: 58.60% (count=5.9k)
   full_acc_rare_by_cat:
      FuncArg: 57.69% (count=3.3k)
      FuncReturn: 59.60% (count=1.8k)
    