In [139]:
# Load IPython's autoreload extension and switch it to mode 2, which re-imports
# modules before code is executed so edits to imported project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [140]:
from scripts.common.output import ContextIO, ExtendedDatasetIO, InferredIO
from scripts.common.schemas import ContextSymbolSchema, ExtendedTypeCollectionSchema, InferredSchema
from scripts.common.schemas import TypeCollectionCategory, ContextCategory

In [141]:
import tqdm
import pandas as pd
from pandera import typing as pt

In [142]:
import pathlib
from scripts.infer.structure import DatasetFolderStructure

# Folder structure of the dataset pool; the cells below iterate its test set.
dataset = DatasetFolderStructure(pathlib.Path(
    "/nfs/data/students/bsparks/mdti4py-dataset-pool/cdt4py"
))
# pathlib never expands "~" by itself, so resolve the home directory once here.
# Otherwise only consumers that happen to expand it on their own would find the
# artifacts, while e.g. Path.exists()/open() would look for a literal "~" folder.
ARTIFACT_ROOT = pathlib.Path("~/mdti4py/datasets").expanduser()

In [143]:
def load_inference_artifacts(tool: str, task: TypeCollectionCategory | str) -> pd.DataFrame:
    """Load and concatenate one tool's inferred annotations for the whole test set.

    For every project in ``dataset.test_set()`` the tool's artifact is read,
    tagged with a ``repository`` column, cells equal to ``"..."`` are replaced
    by ``pd.NA``, and every row whose (repository, file, category, qname_ssa)
    key occurs more than once is dropped (``keep=False`` keeps none of them).
    Projects whose artifact raises ``FileNotFoundError`` are skipped.

    Args:
        tool: Name of the inference tool, forwarded to ``InferredIO``.
        task: Task the tool was run on, forwarded to ``InferredIO``.

    Returns:
        A single frame with a fresh ``RangeIndex`` covering all loaded projects.

    Raises:
        ValueError: If no project yielded an artifact for this tool/task.
    """
    symbol_key = [
        "repository",
        ContextSymbolSchema.file,
        ContextSymbolSchema.category,
        ContextSymbolSchema.qname_ssa,
    ]

    artifacts = []
    for project in tqdm.tqdm(dataset.test_set(), desc=f"Loading inference artifacts for {tool} @ {task}"):
        try:
            artifacts.append(
                InferredIO(artifact_root=ARTIFACT_ROOT, dataset=dataset, repository=project, tool_name=tool, task=task).read()
                .assign(repository=str(dataset.author_repo(project)))
                .replace("...", pd.NA)
                .drop_duplicates(subset=symbol_key, keep=False)
            )
        except FileNotFoundError:
            # Best effort: the tool produced no artifact for this project.
            continue

    if not artifacts:
        # pd.concat([]) would fail with the opaque "No objects to concatenate";
        # keep the exception type but say what is actually missing.
        raise ValueError(
            f"No inference artifacts found for {tool} @ {task} under {ARTIFACT_ROOT}"
        )
    return pd.concat(artifacts, ignore_index=True)

In [144]:
# Context features of every test-set project, tagged with their repository.
# keep=False discards *all* rows sharing a (repository, file, category,
# qname_ssa) key instead of keeping one representative.
context_artifacts = pd.concat(
    [
        ContextIO(artifact_root=ARTIFACT_ROOT, dataset=dataset, repository=project)
        .read()
        .assign(repository=str(dataset.author_repo(project)))
        .drop_duplicates(
            subset=[
                "repository",
                ContextSymbolSchema.file,
                ContextSymbolSchema.category,
                ContextSymbolSchema.qname_ssa,
            ],
            keep=False,
        )
        for project in tqdm.tqdm(dataset.test_set(), desc="Loading context vectors")
    ],
    ignore_index=True,
)

# Maps "context_category_<value>" column names to the lower-cased enum member
# name (presumably the names get_dummies generates below - confirm that the
# schema column is literally called "context_category").
dummified_mapping = dict(
    (f"context_category_{category.value}", category.name.lower())
    for category in ContextCategory
)

# One-hot encode the context category, rename the indicator columns and drop
# the three *_source columns.
context_artifacts = (
    pd.get_dummies(context_artifacts, columns=[ContextSymbolSchema.context_category], dtype=int)
    .rename(columns=dummified_mapping)
    .drop(columns=["builtin_source", "local_source", "import_source"])
)
display(context_artifacts.head(n=20))

Loading context vectors: 100%|███████████████████████████████████████████████████| 1551/1551 [00:10<00:00, 154.12it/s]


Unnamed: 0,file,category,qname,loop,reassigned,nested,flow_control,qname_ssa,repository,callable_return,callable_parameter,single_target_assign,ann_assign,aug_assign,multi_target_assign,instance_attribute,for_target,with_target
0,db.py,VARIABLE,FScore,0,0,0,0,FScoreλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
1,db.py,VARIABLE,YearStat,0,0,0,0,YearStatλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
2,db.py,VARIABLE,Quarter,0,0,0,0,Quarterλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
3,db.py,VARIABLE,FilterOption,0,0,0,0,FilterOptionλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
4,db.py,VARIABLE,RankOption,0,0,0,0,RankOptionλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
5,db.py,VARIABLE,YEAR_STAT,0,0,0,0,YEAR_STATλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
6,db.py,VARIABLE,YEAR_FSCORE,0,0,0,0,YEAR_FSCOREλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
7,db.py,VARIABLE,client,0,0,0,0,clientλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
8,db.py,VARIABLE,db,0,0,0,0,dbλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0
9,db.py,VARIABLE,DIVIDEND_TAX_RATE,0,0,0,0,DIVIDEND_TAX_RATEλ1,0hoo__flask-snowball,0,0,1,0,0,0,0,0,0


In [145]:
import pandas as pd

# Ground-truth annotations of every test-set project, tagged with their
# repository. Rows whose (repository, file, category, qname_ssa) key is not
# unique are dropped entirely (keep=False); missing values become pd.NA.
extended_ground_truths = pd.concat(
    [
        ExtendedDatasetIO(artifact_root=ARTIFACT_ROOT, dataset=dataset, repository=project)
        .read()
        .assign(repository=str(dataset.author_repo(project)))
        .drop_duplicates(
            subset=[
                "repository",
                ContextSymbolSchema.file,
                ContextSymbolSchema.category,
                ContextSymbolSchema.qname_ssa,
            ],
            keep=False,
        )
        for project in tqdm.tqdm(dataset.test_set(), desc="Loading ground truths")
    ],
    ignore_index=True,
).fillna(pd.NA)

Loading ground truths: 100%|█████████████████████████████████████████████████████| 1551/1551 [00:13<00:00, 117.34it/s]


In [146]:
# (rows, columns) of the context features and of the ground truths, shown
# before the two frames are joined.
display(context_artifacts.shape, extended_ground_truths.shape)

(1211154, 18)

(1201979, 9)

In [147]:
# Diagnostic join: an outer merge with indicator=True adds a "_merge" column
# telling, per key, whether it exists in the ground truths (left), the context
# features (right), or both.
ground_truths_with_context = extended_ground_truths.merge(
    right=context_artifacts,
    how="outer",
    on=[
        "repository",
        ContextSymbolSchema.file,
        ContextSymbolSchema.category,
        ContextSymbolSchema.qname,
        ContextSymbolSchema.qname_ssa,
    ],
    indicator=True,
)

In [148]:
# Share of keys found in both frames vs. only in the ground truths (left_only)
# or only in the context features (right_only).
display(ground_truths_with_context["_merge"].value_counts(normalize=True))

_merge
both          0.991550
right_only    0.008011
left_only     0.000439
Name: proportion, dtype: float64

In [149]:
# Working join: keep only symbols that have both a ground truth and context
# features. This overwrites the diagnostic outer merge of the same name.
# NOTE(review): unlike the diagnostic merge, qname is not part of the key here,
# so both frames' qname columns survive with pandas' _x/_y suffixes - confirm
# this is intended.
ground_truths_with_context = extended_ground_truths.merge(
    right=context_artifacts,
    how="inner",
    on=[
        "repository",
        ContextSymbolSchema.file,
        ContextSymbolSchema.category,
        ContextSymbolSchema.qname_ssa,
    ],
)

In [150]:
import dataclasses
from scripts.dataset import normalisation

@dataclasses.dataclass
class RegressionArtifacts:
    predictions_made: pd.DataFrame
    #matching_raw: pd.DataFrame
    #matching_depth_limited: pd.DataFrame
    #matching_adjusted: pd.DataFrame
    #matching_base: pd.DataFrame

def create_inputs_for_regression(tool: str, task: TypeCollectionCategory | str) -> RegressionArtifacts:
    """Build the "was a prediction made?" regression frame for one tool/task.

    Loads the tool's inferred annotations via ``load_inference_artifacts``,
    inner-joins them with the module-level ``ground_truths_with_context`` frame
    and labels every row with ``yscore``: True where the inferred annotation is
    not missing. The four ground-truth annotation columns are dropped and
    ``ann_assign`` is folded into ``single_target_assign``.

    Args:
        tool: Tool name, forwarded to ``load_inference_artifacts``.
        task: Task name, forwarded to ``load_inference_artifacts``.

    Returns:
        A ``RegressionArtifacts`` whose ``predictions_made`` holds that frame.

    Side effects:
        Prints a progress banner and the resulting frame's shape, and calls
        ``tqdm.tqdm.pandas()``.
    """
    # Registers tqdm's progress_apply on pandas objects; within this function
    # only the disabled block further down uses it.
    tqdm.tqdm.pandas()
    inference = load_inference_artifacts(tool=tool, task=task)
    # Inner join: only symbols present on both sides survive. validate="1:1"
    # makes pandas raise if the key is duplicated on either side.
    inference_vs_ground_truth_with_context = pd.merge(
        left=inference,
        right=ground_truths_with_context,
        on=["repository", ContextSymbolSchema.file, ContextSymbolSchema.category, ContextSymbolSchema.qname_ssa],
        how="inner",
        validate="1:1"
    )
    
    ### All samples
    # Find where predictions were made
    print(f"=== {tool}/{task}: Scoring whether predictions were made ... === ")
    # A prediction counts as made when the inferred annotation is not missing.
    predictions_made = inference_vs_ground_truth_with_context[InferredSchema.anno].notna()
    # The ground-truth annotation columns are not used for this target.
    predictions_made_df = inference_vs_ground_truth_with_context.assign(yscore=predictions_made).drop(columns=[
        ExtendedTypeCollectionSchema.raw_anno, ExtendedTypeCollectionSchema.depth_limited_anno, 
        ExtendedTypeCollectionSchema.adjusted_anno, ExtendedTypeCollectionSchema.base_anno,
    ])

    # Merge Single Assign and AnnAssign together, as tools never get to see AnnAssign due to masking
    predictions_made_df["single_target_assign"] += predictions_made_df["ann_assign"]
    predictions_made_df = predictions_made_df.drop(columns=["ann_assign"])
    # Sanity check: no row had both indicators set, so the sum stays 0/1.
    assert not (predictions_made_df["single_target_assign"] > 1).any()
    
    # NOTE(review): everything inside the triple-quoted string below is disabled
    # code. It is a bare string expression that is evaluated and discarded, so
    # none of the matching_* frames are built by this function.
    """
    ### Only where predictions were made and we have a ground truth label
    inference_vs_ground_truth_with_context = inference_vs_ground_truth_with_context[
        inference_vs_ground_truth_with_context[InferredSchema.anno].notna() &
        inference_vs_ground_truth_with_context[ExtendedTypeCollectionSchema.raw_anno].notna()
    ]
    
    # Find where lightly processed annotations line up
    print(f"=== {tool}/{task}: Scoring where predictions match ground truths exactly ... ===")
    matching_raw = inference_vs_ground_truth_with_context[InferredSchema.anno] == inference_vs_ground_truth_with_context[ExtendedTypeCollectionSchema.raw_anno]
    matching_raw_df = inference_vs_ground_truth_with_context.assign(yscore=matching_raw).drop(columns=[
        ExtendedTypeCollectionSchema.depth_limited_anno, ExtendedTypeCollectionSchema.adjusted_anno, ExtendedTypeCollectionSchema.base_anno,
    ])
    
    
    # Apply depth limiting, as every work except TypeT5 has done
    print(f"=== {tool}/{task}: Scoring where depth limited predictions match depth limited ground truths ... ===")
    depth_limited_inferred = inference_vs_ground_truth_with_context[InferredSchema.anno].progress_apply(normalisation.to_limited)
    matching_depth_limited = depth_limited_inferred == inference_vs_ground_truth_with_context[ExtendedTypeCollectionSchema.depth_limited_anno]
    matching_depth_df = inference_vs_ground_truth_with_context.assign(inferred_depth_limited=depth_limited_inferred, yscore=matching_depth_limited).drop(columns=[
        ExtendedTypeCollectionSchema.raw_anno, ExtendedTypeCollectionSchema.adjusted_anno, ExtendedTypeCollectionSchema.base_anno,
    ])

    
    # Convert to adjusted annotations from depth limited annotations, without ground truth having typing.Any and None
    print(f"=== {tool}/{task}: Scoring where adjusted predictions match adjusted ground truths ... ===")
    print(f"=== {tool}/{task}: Dropping Ground Truths with typing.Any, None ... ===")
    inference_vs_ground_truth_with_context = inference_vs_ground_truth_with_context[
         ~inference_vs_ground_truth_with_context[ExtendedTypeCollectionSchema.raw_anno].isin(["typing.Any", "Any", "None"])
    ]
    
    # Recalculate depth limited for shaping reasons
    depth_limited_inferred = inference_vs_ground_truth_with_context[InferredSchema.anno].progress_apply(normalisation.to_limited)
    adjusted_anno = depth_limited_inferred.progress_apply(normalisation.to_adjusted)
    matching_adjusted = adjusted_anno == inference_vs_ground_truth_with_context[ExtendedTypeCollectionSchema.adjusted_anno]
    matching_adjusted_df = inference_vs_ground_truth_with_context.assign(inferred_adjusted=adjusted_anno, yscore=matching_adjusted).drop(columns=[
        ExtendedTypeCollectionSchema.raw_anno, ExtendedTypeCollectionSchema.depth_limited_anno, ExtendedTypeCollectionSchema.base_anno,
    ])
    
    # Convert to base annotation to depth limited annotations, exclude none and any, as done in TypeT5
    print(f"{tool}/{task}: Scoring where base predictions match base ground truths ...")
    base_anno = depth_limited_inferred.progress_apply(normalisation.to_base)
    matching_base = base_anno == inference_vs_ground_truth_with_context[ExtendedTypeCollectionSchema.base_anno]
    matching_base_df = inference_vs_ground_truth_with_context.assign(inferred_base=base_anno, yscore=matching_base).drop(columns=[
        ExtendedTypeCollectionSchema.raw_anno, ExtendedTypeCollectionSchema.depth_limited_anno, ExtendedTypeCollectionSchema.adjusted_anno,
    ])
    
    print(
        f"{predictions_made_df.shape=}", 
        f"{matching_raw_df.shape=}", 
        f"{matching_depth_df.shape=}", 
        f"{matching_adjusted_df.shape=}", 
        f"{matching_base_df.shape=}", 
        sep="\n"
    )"""
    print(f"{predictions_made_df.shape=}")
    
    return RegressionArtifacts(
        predictions_made=predictions_made_df,
    )
    
    


In [151]:
# Regression features: one indicator per context category, followed by the four
# flag columns of the context schema.
FEATURE_SET = [
    *dummified_mapping.values(),
    ContextSymbolSchema.loop,
    ContextSymbolSchema.reassigned,
    ContextSymbolSchema.nested,
    ContextSymbolSchema.flow_control,
]
# create_inputs_for_regression folds ann_assign into single_target_assign and
# drops the column, so it must not be requested as a feature.
FEATURE_SET.remove("ann_assign")

FEATURE_SET_WITH_SCORE = [*FEATURE_SET, "yscore"]

In [152]:
import pprint
from sklearn import linear_model

def regression(features, X, y) -> pd.DataFrame:
    """Fit an L2-penalised logistic regression and report its coefficients.

    Args:
        features: Feature names, paired positionally with the fitted coefficients.
        X: Design matrix passed to ``LogisticRegression.fit``.
        y: Target passed to ``LogisticRegression.fit``.

    Returns:
        A one-row frame with one column per feature holding its coefficient.

    Side effects:
        Pretty-prints the feature -> coefficient mapping.
    """
    model = linear_model.LogisticRegression(penalty="l2", solver="newton-cholesky")
    model.fit(X, y)

    # Pair each feature name with the matching entry of the first coefficient row.
    feature2coef = {}
    for name, weight in zip(features, model.coef_[0]):
        feature2coef[name] = weight
    pprint.pprint(feature2coef)

    single_row = {name: [weight] for name, weight in feature2coef.items()}
    return pd.DataFrame(single_row)

In [153]:
def prediction_made_regression(artifacts: RegressionArtifacts) -> pd.DataFrame:
    """Regress "a prediction was made" on the context features.

    Args:
        artifacts: Regression inputs; only ``predictions_made`` is read. It must
            contain every ``FEATURE_SET`` column plus ``yscore``.

    Returns:
        The one-row coefficient frame produced by ``regression``.
    """
    # The former debug selection copied the whole frame only to be overwritten
    # on the next statement, so it is gone. The boolean yscore and the
    # indicator columns are cast to plain ints before fitting.
    design = artifacts.predictions_made[FEATURE_SET_WITH_SCORE].astype(int)
    X, y = design[FEATURE_SET].to_numpy(), design["yscore"]

    return regression(FEATURE_SET, X, y)

In [154]:
def matching_raw_regression(artifacts: RegressionArtifacts) -> pd.DataFrame:
    """Regress "prediction equals the raw ground truth" on the context features.

    Args:
        artifacts: Regression inputs; must carry a ``matching_raw`` frame that
            contains every ``FEATURE_SET`` column plus ``yscore``.

    Returns:
        The one-row coefficient frame produced by ``regression``, mirroring
        ``prediction_made_regression`` (previously the result was discarded).

    Raises:
        AttributeError: If ``artifacts`` has no ``matching_raw`` frame.
    """
    # The attribute is looked up defensively so a missing frame produces an
    # actionable message instead of a bare attribute lookup failure.
    matching_raw = getattr(artifacts, "matching_raw", None)
    if matching_raw is None:
        raise AttributeError(
            "artifacts has no 'matching_raw' frame; "
            "create_inputs_for_regression currently only builds 'predictions_made'"
        )

    design = matching_raw[FEATURE_SET_WITH_SCORE].astype(int)
    X, y = design[FEATURE_SET].to_numpy(), design["yscore"]

    return regression(FEATURE_SET, X, y)

# TypeT5

In [155]:
# Join TypeT5TopN1's inferred annotations (task "all") with the ground truths
# and context features, and label where a prediction was made.
type_t5 = create_inputs_for_regression(tool="TypeT5TopN1", task="all")

Loading inference artifacts for TypeT5TopN1 @ all: 100%|█████████████████████████| 1551/1551 [00:10<00:00, 150.06it/s]


=== TypeT5TopN1/all: Scoring whether predictions were made ... === 
predictions_made_df.shape=(1174845, 23)


In [156]:
# Fit the "prediction made" logistic regression; the result is a one-row frame
# of per-feature coefficients.
typet5_coeffs = prediction_made_regression(artifacts=type_t5)

{'aug_assign': -2.3865499353745774,
 'callable_parameter': 3.1817187701217877,
 'callable_return': 4.795902008201029,
 'flow_control': -0.8457234026173808,
 'for_target': -2.8753578247329923,
 'instance_attribute': 4.143336029232435,
 'loop': -1.348879839882603,
 'multi_target_assign': -3.9626509130177143,
 'nested': -2.52800969456345,
 'reassigned': -1.5677212407647376,
 'single_target_assign': 1.0716510583923338,
 'with_target': -3.9680491925944796}


# Type4Py

In [157]:
# Join type4pyN1's inferred annotations (task "all") with the ground truths and
# context features, and label where a prediction was made.
type4py = create_inputs_for_regression(tool="type4pyN1", task="all")

Loading inference artifacts for type4pyN1 @ all: 100%|███████████████████████████| 1551/1551 [00:10<00:00, 146.84it/s]


=== type4pyN1/all: Scoring whether predictions were made ... === 
predictions_made_df.shape=(1201447, 23)


In [158]:
# Fit the "prediction made" logistic regression; the result is a one-row frame
# of per-feature coefficients.
type4py_coeffs = prediction_made_regression(artifacts=type4py)

{'aug_assign': -3.5718594611275853,
 'callable_parameter': 2.859435512696114,
 'callable_return': 2.21768352277813,
 'flow_control': -0.8040044663940237,
 'for_target': -4.416799564486529,
 'instance_attribute': 6.777593478654966,
 'loop': 0.3284316105886974,
 'multi_target_assign': -3.9084851360610626,
 'nested': 0.23722451037931022,
 'reassigned': -1.378490218220115,
 'single_target_assign': 3.3731944074120177,
 'with_target': -3.330762761017993}


# Typilus

In [159]:
# Join typilusN1's inferred annotations (task "all") with the ground truths and
# context features, and label where a prediction was made.
typilus = create_inputs_for_regression(tool="typilusN1", task="all")

Loading inference artifacts for typilusN1 @ all: 100%|███████████████████████████| 1551/1551 [00:10<00:00, 145.97it/s]


=== typilusN1/all: Scoring whether predictions were made ... === 
predictions_made_df.shape=(1201399, 23)


In [160]:
# Fit the "prediction made" logistic regression; the result is a one-row frame
# of per-feature coefficients.
typilus_coeffs = prediction_made_regression(artifacts=typilus)

{'aug_assign': -3.0013803543677233,
 'callable_parameter': 2.7062961454900387,
 'callable_return': 3.8744674952626097,
 'flow_control': -1.084221136862369,
 'for_target': -5.347908255707595,
 'instance_attribute': 6.1509188142034565,
 'loop': 1.0624195615165506,
 'multi_target_assign': -4.281213825044946,
 'nested': -0.4638672598142172,
 'reassigned': -4.436911382588121,
 'single_target_assign': 3.4890108030994256,
 'with_target': -3.590190824001904}


# HiTyper

In [161]:
# Join HiTyperNoML's inferred annotations (task "all") with the ground truths
# and context features, and label where a prediction was made.
hityper = create_inputs_for_regression(tool="HiTyperNoML", task="all")

Loading inference artifacts for HiTyperNoML @ all: 100%|█████████████████████████| 1551/1551 [00:10<00:00, 144.91it/s]


=== HiTyperNoML/all: Scoring whether predictions were made ... === 
predictions_made_df.shape=(1193685, 23)


In [162]:
# Fit the "prediction made" logistic regression; the result is a one-row frame
# of per-feature coefficients.
hityper_coeffs = prediction_made_regression(artifacts=hityper)

{'aug_assign': -2.9407768110538197,
 'callable_parameter': 3.207847223613874,
 'callable_return': 5.995557788516962,
 'flow_control': 0.44832911075975496,
 'for_target': -2.602917028528865,
 'instance_attribute': -2.685296190594103,
 'loop': -0.1721123272411888,
 'multi_target_assign': -2.6791575291484566,
 'nested': -2.547257201116788,
 'reassigned': 0.39555761429452607,
 'single_target_assign': 4.38902416281002,
 'with_target': -2.6842816168544004}


# Accumulate Table

In [163]:
# Stack the one-row coefficient frames of all tools. The keys become the outer
# index level; dropping the inner level leaves the tool name as the row label.
coeff_table = pd.concat(
    [typilus_coeffs, type4py_coeffs, typet5_coeffs, hityper_coeffs],
    keys=["Typilus", "Type4Py", "TypeT5", "HiTyperNoML"],
    axis=0,
).reset_index(level=1, drop=True)
display(coeff_table)

Unnamed: 0,callable_return,callable_parameter,single_target_assign,aug_assign,multi_target_assign,instance_attribute,for_target,with_target,loop,reassigned,nested,flow_control
Typilus,3.874467,2.706296,3.489011,-3.00138,-4.281214,6.150919,-5.347908,-3.590191,1.06242,-4.436911,-0.463867,-1.084221
Type4Py,2.217684,2.859436,3.373194,-3.571859,-3.908485,6.777593,-4.4168,-3.330763,0.328432,-1.37849,0.237225,-0.804004
TypeT5,4.795902,3.181719,1.071651,-2.38655,-3.962651,4.143336,-2.875358,-3.968049,-1.34888,-1.567721,-2.52801,-0.845723
HiTyperNoML,5.995558,3.207847,4.389024,-2.940777,-2.679158,-2.685296,-2.602917,-2.684282,-0.172112,0.395558,-2.547257,0.448329


In [164]:
# Export the coefficient table as LaTeX with abbreviated column headers and
# two-decimal formatting.
print(
    coeff_table
    .rename(columns=dict(
        callable_return="Ret",
        callable_parameter="Param",
        single_target_assign="Assgn",
        aug_assign="AugAssgn",
        multi_target_assign="MultAssgn",
        instance_attribute="InstAttr",
        for_target="ForInit",
        with_target="WithInit",
        loop="Loop",
        reassigned="Reassigned",
        nested="Nested",
        flow_control="Flow",
    ))
    .to_latex(float_format="{:.2f}".format)
)

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
 & Ret & Param & Assgn & AugAssgn & MultAssgn & InstAttr & ForInit & WithInit & Loop & Reassigned & Nested & Flow \\
\midrule
Typilus & 3.87 & 2.71 & 3.49 & -3.00 & -4.28 & 6.15 & -5.35 & -3.59 & 1.06 & -4.44 & -0.46 & -1.08 \\
Type4Py & 2.22 & 2.86 & 3.37 & -3.57 & -3.91 & 6.78 & -4.42 & -3.33 & 0.33 & -1.38 & 0.24 & -0.80 \\
TypeT5 & 4.80 & 3.18 & 1.07 & -2.39 & -3.96 & 4.14 & -2.88 & -3.97 & -1.35 & -1.57 & -2.53 & -0.85 \\
HiTyperNoML & 6.00 & 3.21 & 4.39 & -2.94 & -2.68 & -2.69 & -2.60 & -2.68 & -0.17 & 0.40 & -2.55 & 0.45 \\
\bottomrule
\end{tabular}

