In [1]:
# Load the autoreload extension so edits to imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Switch the working directory to the experiments folder; relative paths used later
# (e.g. `ARTIFACT_ROOT = pathlib.Path()`) resolve against it.
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere without editing it.
%pwd
%cd /home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/experiments

/home/benji/Documents/Uni/heidelberg/05/masterarbeit/impls/scripts/experiments


In [2]:
from scripts.common.output import ContextIO, ExtendedDatasetIO, InferredIO
from scripts.common.schemas import ContextSymbolSchema, ExtendedTypeCollectionSchema, InferredSchema
from scripts.common.schemas import TypeCollectionCategory, ContextCategory

2023-06-28 17:09:23.652042: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import tqdm
import pandas as pd
from pandera import typing as pt

In [4]:
import pathlib
from scripts.infer.structure import DatasetFolderStructure

# Handle on the dataset checkout; later cells iterate `dataset.test_set()` to enumerate the repositories to load.
# NOTE(review): absolute, machine-specific path -- must be edited to run on another machine.
dataset = DatasetFolderStructure(pathlib.Path(
    "/home/benji/Documents/Uni/heidelberg/05/masterarbeit/datasets/better-types-4-py-dataset"
))
# An empty Path() is the current working directory, i.e. whatever directory the `%cd` in the first cell selected.
ARTIFACT_ROOT = pathlib.Path()

In [5]:
def load_inference_artifacts(tool: str, task: TypeCollectionCategory | str) -> pd.DataFrame:
    """Read the inference artifacts of ``tool`` for ``task`` across the whole test set.

    Every repository returned by ``dataset.test_set()`` is read through ``InferredIO``,
    tagged with a ``repository`` column holding the project's name, and stripped of all
    rows whose (repository, file, category, qname_ssa) key occurs more than once
    (``keep=False`` removes every copy, not just the extras).

    Args:
        tool: Tool name forwarded to ``InferredIO`` as ``tool_name``.
        task: Task forwarded to ``InferredIO``.

    Returns:
        A single frame with a fresh 0..n-1 index containing the rows of all repositories.
    """
    symbol_key = [
        "repository",
        ContextSymbolSchema.file,
        ContextSymbolSchema.category,
        ContextSymbolSchema.qname_ssa,
    ]

    def read_repository(project) -> pd.DataFrame:
        # One repository's predictions, with ambiguous symbols removed entirely.
        reader = InferredIO(
            artifact_root=ARTIFACT_ROOT,
            dataset=dataset,
            repository=project,
            tool_name=tool,
            task=task,
        )
        tagged = reader.read().assign(repository=project.name)
        return tagged.drop_duplicates(subset=symbol_key, keep=False)

    projects = tqdm.tqdm(dataset.test_set(), desc=f"Loading inference artifacts for {tool} @ {task}")
    return pd.concat([read_repository(project) for project in projects], ignore_index=True)

In [6]:
# Load the context frame of every test-set repository, tag each row with its repository name and
# drop *all* rows whose (repository, file, category, qname_ssa) key is not unique (keep=False),
# so that the 1:1 merges performed later on the same key can validate.
context_artifacts = pd.concat([
    ContextIO(artifact_root=ARTIFACT_ROOT, dataset=dataset, repository=project).read()
        .assign(repository=project.name)
        .drop_duplicates(subset=["repository", ContextSymbolSchema.file, ContextSymbolSchema.category, ContextSymbolSchema.qname_ssa], keep=False)
    for project in tqdm.tqdm(dataset.test_set(), desc="Loading context vectors")
], ignore_index=True)

# Maps "context_category_<value>" to the lower-cased enum member name; used to rename the dummy columns below.
# NOTE(review): assumes ContextSymbolSchema.context_category is literally "context_category" and that the
# column holds values rendering as `category.value` -- otherwise the rename silently matches nothing.
dummified_mapping = {
    f"context_category_{category.value}": category.name.lower()
    for category in ContextCategory
}

# One-hot encode the context category and give the indicator columns readable names.
context_artifacts = pd.get_dummies(context_artifacts, columns=[ContextSymbolSchema.context_category]).rename(columns=dummified_mapping)
# Plain-text preview of the first 20 rows.
print(context_artifacts.head(n=20).to_string())

Loading context vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 90.91it/s]

                  file            category                                      qname  loop  reassigned  nested  flow_control  builtin_source  local_source  import_source                                  qname_ssa                repository  callable_return  callable_parameter  single_target_assign  ann_assign  aug_assign  multi_target_assign  instance_attribute  for_target  with_target
0   tests/test_item.py            VARIABLE                             need_cssselect     0           0       0             0               0             0              0                           need_cssselectλ1  linw1995__data_extractor                0                   0                     1           0           0                    0                   0           0            0
1   tests/test_item.py            VARIABLE                                  need_lxml     0           0       0             0               0             0              0                                need_lxmlλ1  linw199




In [7]:
import pandas as pd

# Load the extended ground-truth frame of every test-set repository, tag each row with its repository
# name and drop *all* rows whose (repository, file, category, qname_ssa) key is not unique (keep=False).
# The trailing fillna(pd.NA) presumably normalises missing annotations to pd.NA -- TODO confirm it is needed.
extended_ground_truths = pd.concat([
    ExtendedDatasetIO(artifact_root=ARTIFACT_ROOT, dataset=dataset, repository=project).read()
        .assign(repository=project.name)
        .drop_duplicates(subset=["repository", ContextSymbolSchema.file, ContextSymbolSchema.category, ContextSymbolSchema.qname_ssa], keep=False)
    for project in tqdm.tqdm(dataset.test_set(), desc="Loading ground truths")
], ignore_index=True).fillna(pd.NA)

Loading ground truths: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 73.93it/s]


In [8]:
# Attach the context features to every ground-truth symbol.
# The inner join keeps only symbols present in both frames; validate="1:1" makes pandas raise if the
# key is duplicated on either side (both inputs were de-duplicated on exactly this key when loaded).
ground_truths_with_context = pd.merge(
    left=extended_ground_truths,
    right=context_artifacts,
    on=["repository", ContextSymbolSchema.file, ContextSymbolSchema.category, ContextSymbolSchema.qname_ssa],
    how="inner",
    validate="1:1"
)

In [9]:
import dataclasses
from scripts.dataset import normalisation

@dataclasses.dataclass
class RegressionArtifacts:
    """Bundle of the five scored frames built by ``create_inputs_for_regression``.

    Each frame carries the merged inference / ground-truth / context columns plus a
    boolean ``yscore`` column that serves as the regression target.
    """

    # All merged rows; yscore = whether the tool produced an annotation at all.
    predictions_made: pd.DataFrame
    # Rows with both a prediction and a raw ground truth; yscore = prediction equals the raw ground truth.
    matching_raw: pd.DataFrame
    # Same rows as matching_raw; yscore = depth-limited prediction equals the depth-limited ground truth.
    matching_depth_limited: pd.DataFrame
    # Rows whose raw ground truth is not typing.Any / Any / None; yscore = adjusted prediction equals the adjusted ground truth.
    matching_adjusted: pd.DataFrame
    # Same rows as matching_adjusted; yscore = base prediction equals the base ground truth.
    matching_base: pd.DataFrame

def create_inputs_for_regression(tool: str, task: TypeCollectionCategory | str) -> RegressionArtifacts:
    """Join a tool's predictions with ground truths + context and score them five ways.

    The inference artifacts of ``tool``/``task`` are inner-joined (1:1) with
    ``ground_truths_with_context``. From the joined frame five frames are derived, each
    with a boolean ``yscore`` column:

    * predictions made -- every joined row; did the tool emit an annotation?
    * raw match -- rows with prediction and raw ground truth; exact equality.
    * depth-limited match -- same rows; equality after ``normalisation.to_limited``.
    * adjusted match -- additionally excluding ground truths ``typing.Any``/``Any``/``None``;
      equality after ``to_limited`` followed by ``to_adjusted``.
    * base match -- same rows as adjusted; equality after ``to_limited`` followed by ``to_base``.

    Ground-truth annotation columns not relevant to a given frame are dropped from it.
    Progress messages and the resulting shapes are printed along the way.

    Args:
        tool: Tool name forwarded to ``load_inference_artifacts``.
        task: Task forwarded to ``load_inference_artifacts``.

    Returns:
        A ``RegressionArtifacts`` holding the five frames.
    """
    # Registers `progress_apply` on pandas objects, which is used below.
    tqdm.tqdm.pandas()

    symbol_key = ["repository", ContextSymbolSchema.file, ContextSymbolSchema.category, ContextSymbolSchema.qname_ssa]
    inferred_col = InferredSchema.anno
    raw_col = ExtendedTypeCollectionSchema.raw_anno
    limited_col = ExtendedTypeCollectionSchema.depth_limited_anno
    adjusted_col = ExtendedTypeCollectionSchema.adjusted_anno
    base_col = ExtendedTypeCollectionSchema.base_anno

    joined = pd.merge(
        left=load_inference_artifacts(tool=tool, task=task),
        right=ground_truths_with_context,
        on=symbol_key,
        how="inner",  # will drop keys for any tasks except 'all', which is exactly as intended
        validate="1:1",
    )

    # --- All samples: was a prediction made at all? ---
    print(f"=== {tool}/{task}: Scoring whether predictions were made ... === ")
    has_prediction = joined[inferred_col].notna()
    predictions_made_df = joined.assign(yscore=has_prediction).drop(
        columns=[raw_col, limited_col, adjusted_col, base_col]
    )

    # --- From here on: only rows with both a prediction and a ground truth label ---
    labelled = joined[has_prediction & joined[raw_col].notna()]

    # Lightly processed annotations compared verbatim
    print(f"=== {tool}/{task}: Scoring where predictions match ground truths exactly ... ===")
    raw_hits = labelled[inferred_col] == labelled[raw_col]
    matching_raw_df = labelled.assign(yscore=raw_hits).drop(columns=[limited_col, adjusted_col, base_col])

    # Depth limiting, as every work except TypeT5 has done
    print(f"=== {tool}/{task}: Scoring where depth limited predictions match depth limited ground truths ... ===")
    limited_predictions = labelled[inferred_col].progress_apply(normalisation.to_limited)
    limited_hits = limited_predictions == labelled[limited_col]
    matching_depth_df = labelled.assign(
        inferred_depth_limited=limited_predictions,
        yscore=limited_hits,
    ).drop(columns=[raw_col, adjusted_col, base_col])

    # Adjusted annotations, derived from the depth limited ones, ignoring typing.Any / None ground truths
    print(f"=== {tool}/{task}: Scoring where adjusted predictions match adjusted ground truths ... ===")
    print(f"=== {tool}/{task}: Dropping Ground Truths with typing.Any, None ... ===")
    informative = labelled[~labelled[raw_col].isin(["typing.Any", "Any", "None"])]

    # Depth limiting is redone on the reduced frame so the series index lines up with it
    limited_predictions = informative[inferred_col].progress_apply(normalisation.to_limited)
    adjusted_predictions = limited_predictions.progress_apply(normalisation.to_adjusted)
    adjusted_hits = adjusted_predictions == informative[adjusted_col]
    matching_adjusted_df = informative.assign(
        inferred_adjusted=adjusted_predictions,
        yscore=adjusted_hits,
    ).drop(columns=[raw_col, limited_col, base_col])

    # Base annotations from the depth limited ones, None and Any excluded, as done in TypeT5
    print(f"{tool}/{task}: Scoring where base predictions match base ground truths ...")
    base_predictions = limited_predictions.progress_apply(normalisation.to_base)
    base_hits = base_predictions == informative[base_col]
    matching_base_df = informative.assign(
        inferred_base=base_predictions,
        yscore=base_hits,
    ).drop(columns=[raw_col, limited_col, adjusted_col])

    # The `=` specifier echoes the expression text, so these variable names are part of the output.
    print(
        f"{predictions_made_df.shape=}",
        f"{matching_raw_df.shape=}",
        f"{matching_depth_df.shape=}",
        f"{matching_adjusted_df.shape=}",
        f"{matching_base_df.shape=}",
        sep="\n",
    )

    return RegressionArtifacts(
        predictions_made=predictions_made_df,
        matching_raw=matching_raw_df,
        matching_depth_limited=matching_depth_df,
        matching_adjusted=matching_adjusted_df,
        matching_base=matching_base_df,
    )
    
    


In [10]:
# Regression inputs: the renamed one-hot context-category columns (values of `dummified_mapping`)
# followed by the remaining context indicator columns.
FEATURE_SET = list(dummified_mapping.values()) + [
    ContextSymbolSchema.loop, ContextSymbolSchema.reassigned, ContextSymbolSchema.nested, ContextSymbolSchema.flow_control, ContextSymbolSchema.import_source, ContextSymbolSchema.builtin_source, ContextSymbolSchema.local_source, 
]

# Same columns plus the "yscore" target column that `create_inputs_for_regression` adds to each frame.
FEATURE_SET_WITH_SCORE = FEATURE_SET + ["yscore"]

In [30]:
import pprint
from sklearn import linear_model

def regression(features, X, y) -> None:
    """Fit an L2-penalised logistic regression and pretty-print its coefficients.

    Args:
        features: Feature names, in the same order as the columns of ``X``.
        X: Feature matrix passed to ``LogisticRegression.fit``.
        y: Target values passed to ``LogisticRegression.fit``.

    Returns:
        None. A ``{feature name: coefficient}`` mapping built from the first row of
        ``coef_`` is written to stdout via ``pprint``; the intercept is not reported.
    """
    model = linear_model.LogisticRegression(penalty="l2", solver="newton-cholesky")
    model.fit(X, y)

    # Pair each feature name with its weight from the first row of coef_.
    weights = {name: weight for name, weight in zip(features, model.coef_[0])}
    pprint.pprint(weights)
    
    

In [23]:
def prediction_made_regression(artifacts: RegressionArtifacts) -> None:
    """Regress "was a prediction made" on the context features and print the coefficients.

    Uses ``artifacts.predictions_made`` restricted to ``FEATURE_SET_WITH_SCORE``; all
    columns are cast to ``int`` so boolean indicators and the boolean ``yscore`` become
    0/1 before fitting.

    The former debug-only column selection (features + file/qname/anno) was removed: its
    result was overwritten immediately and its only observable effect was a possible
    ``KeyError`` on columns the regression does not need.

    Args:
        artifacts: Scored frames produced by ``create_inputs_for_regression``.
    """
    scored = artifacts.predictions_made[FEATURE_SET_WITH_SCORE].astype(int)
    X, y = scored[FEATURE_SET].to_numpy(), scored["yscore"]

    regression(FEATURE_SET, X, y)

In [24]:
def matching_raw_regression(artifacts: RegressionArtifacts) -> None:
    """Regress "prediction equals raw ground truth" on the context features and print the coefficients.

    Uses ``artifacts.matching_raw`` restricted to ``FEATURE_SET_WITH_SCORE``; all columns
    are cast to ``int`` so boolean indicators and the boolean ``yscore`` become 0/1 before
    fitting.

    The former debug-only column selection (features + file/raw_anno/anno) was removed: its
    result was overwritten immediately and its only observable effect was a possible
    ``KeyError`` on columns the regression does not need.

    Args:
        artifacts: Scored frames produced by ``create_inputs_for_regression``.
    """
    scored = artifacts.matching_raw[FEATURE_SET_WITH_SCORE].astype(int)
    X, y = scored[FEATURE_SET].to_numpy(), scored["yscore"]

    regression(FEATURE_SET, X, y)

# TypeT5

In [14]:
# Build the five scored frames for the "TypeT5TopN1" artifacts on the CALLABLE_RETURN task.
type_t5_return = create_inputs_for_regression(tool="TypeT5TopN1", task=TypeCollectionCategory.CALLABLE_RETURN)

Loading inference artifacts for TypeT5TopN1 @ CALLABLE_RETURN: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 133.82it/s]


=== TypeT5TopN1/CALLABLE_RETURN: Scoring whether predictions were made ... === 
=== TypeT5TopN1/CALLABLE_RETURN: Scoring where predictions match ground truths exactly ... ===
=== TypeT5TopN1/CALLABLE_RETURN: Scoring where depth limited predictions match depth limited ground truths ... ===


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4882/4882 [00:00<00:00, 59527.92it/s]


=== TypeT5TopN1/CALLABLE_RETURN: Scoring where adjusted predictions match adjusted ground truths ... ===
=== TypeT5TopN1/CALLABLE_RETURN: Dropping Ground Truths with typing.Any, None ... ===


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2800/2800 [00:00<00:00, 45608.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2800/2800 [00:00<00:00, 49543.13it/s]


TypeT5TopN1/CALLABLE_RETURN: Scoring where base predictions match base ground truths ...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2800/2800 [00:00<00:00, 49109.73it/s]

predictions_made_df.shape=(10987, 27)
matching_raw_df.shape=(4882, 28)
matching_depth_df.shape=(4882, 29)
matching_adjusted_df.shape=(2800, 29)
matching_base_df.shape=(2800, 29)





In [15]:
# Which context features go along with TypeT5 emitting a prediction at all?
# NOTE(review): the saved output of this cell is a bare coefficient array, whereas `regression` as defined
# in this notebook (In [30]) pretty-prints a feature -> coefficient dict. This cell (In [15]) appears to
# have been run against an older definition -- re-run it to refresh the output.
prediction_made_regression(artifacts=type_t5_return)

[[ 8.98472350e-13  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -6.39641387e-02 -3.60268302e+00
  -1.21478225e+00 -6.95137446e-01 -1.42034682e-01 -1.60439769e+00]]


In [16]:
# Which context features go along with TypeT5's prediction matching the raw ground truth exactly?
# NOTE(review): the saved output of this cell is a bare coefficient array, whereas `regression` as defined
# in this notebook (In [30]) pretty-prints a feature -> coefficient dict. This cell (In [16]) appears to
# have been run against an older definition -- re-run it to refresh the output.
matching_raw_regression(artifacts=type_t5_return)

[[ 5.61065950e-14  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -2.15497748e-01 -2.09162998e+00  6.60451657e-01 -2.47879189e-01]]


In [25]:
# Build the five scored frames for the "HiType4PyN1" artifacts on the "all" task.
HiType4PyN1_all = create_inputs_for_regression(tool="HiType4PyN1", task="all")

Loading inference artifacts for HiType4PyN1 @ all: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 89.05it/s]


=== HiType4PyN1/all: Scoring whether predictions were made ... === 
=== HiType4PyN1/all: Scoring where predictions match ground truths exactly ... ===
=== HiType4PyN1/all: Scoring where depth limited predictions match depth limited ground truths ... ===


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14453/14453 [00:00<00:00, 61906.57it/s]


=== HiType4PyN1/all: Scoring where adjusted predictions match adjusted ground truths ... ===
=== HiType4PyN1/all: Dropping Ground Truths with typing.Any, None ... ===


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12116/12116 [00:00<00:00, 55771.29it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12116/12116 [00:00<00:00, 61175.06it/s]


HiType4PyN1/all: Scoring where base predictions match base ground truths ...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12116/12116 [00:00<00:00, 55960.08it/s]

predictions_made_df.shape=(58480, 27)
matching_raw_df.shape=(14453, 28)
matching_depth_df.shape=(14453, 29)
matching_adjusted_df.shape=(12116, 29)
matching_base_df.shape=(12116, 29)





In [31]:
# Which context features go along with HiType4PyN1 emitting a prediction at all?
prediction_made_regression(artifacts=HiType4PyN1_all)

{'ann_assign': 1.48596739784677,
 'aug_assign': -2.962645779752652,
 'builtin_source': 1.5309296095228195,
 'callable_parameter': 2.293107243243585,
 'callable_return': 3.6541024962823005,
 'flow_control': 0.4412666776085077,
 'for_target': -3.5969047853848273,
 'import_source': 1.0964056796797494,
 'instance_attribute': 3.2283371121406845,
 'local_source': 0.6820853419693751,
 'loop': -0.21829452191168996,
 'multi_target_assign': -3.591541185530923,
 'nested': -3.8777453478979926,
 'reassigned': 0.5322803036782469,
 'single_target_assign': 2.507964232802009,
 'with_target': -3.018386731659758}


In [32]:
# Which context features go along with HiType4PyN1's prediction matching the raw ground truth exactly?
# NOTE(review): coefficients of exactly 0.0 presumably belong to features that are constant in the
# labelled subset used here -- verify before interpreting them as "no effect".
matching_raw_regression(artifacts=HiType4PyN1_all)

{'ann_assign': 0.6405012464295174,
 'aug_assign': 0.0,
 'builtin_source': -0.6125494615718191,
 'callable_parameter': 2.116033083121507,
 'callable_return': 2.731149781703476,
 'flow_control': 0.06942834060108434,
 'for_target': 0.0,
 'import_source': -3.190280107607923,
 'instance_attribute': -5.487684111253896,
 'local_source': -3.584661965126871,
 'loop': -0.52557555736792,
 'multi_target_assign': 0.0,
 'nested': -0.08106220110543441,
 'reassigned': -0.10474777577665098,
 'single_target_assign': 0.0,
 'with_target': 0.0}
