In [106]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [107]:
# Trigger an immediate reload of the modules tracked by the autoreload extension.
%autoreload

In [108]:
import os
import pandas as pd
import sys
# Put the directory two levels above the working directory on sys.path
# (once), so that the project packages imported below can be resolved.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [109]:
import scripts
print(scripts.__path__)
import pathlib

from scripts.common.schemas import TypeCollectionCategory
from scripts.infer.structure import DatasetFolderStructure

# Location of the dataset checkout.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
dataset_root_path = pathlib.Path(
    "/nfs/home/bsparks/mdti4py/datasets/better-types-4-py-dataset"
)
dataset = DatasetFolderStructure(dataset_root_path)

# Fail early when the dataset folder is not available.
assert dataset.dataset_root.is_dir(), f"{dataset.dataset_root} not a directory!"

print(dataset)

['/nfs/home/bsparks/mdti4py/scripts/scripts']
BetterTypes4Py @ /nfs/home/bsparks/mdti4py/datasets/better-types-4-py-dataset


In [110]:
import importlib
import experiments.inferred
import experiments.probas

In [111]:
# Folder passed to the loaders below as the root of all stored artifacts.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
artifact_root = pathlib.Path("/nfs/home/bsparks/mdti4py") / "datasets"
assert artifact_root.is_dir(), f"Cannot find {artifact_root=}"

In [None]:
# Reload first so that edits to experiments.inferred take effect.
importlib.reload(experiments.inferred)
groundtruth = experiments.inferred.load_groundtruths(artifact_root, dataset)

# Discard rows whose base annotation is the literal "None" or "Any".
trivial_mask = groundtruth["base_anno"].isin(["None", "Any"])
groundtruth = groundtruth.loc[~trivial_mask]

display(groundtruth.shape, groundtruth.columns)

In [None]:
# Load the "type4pyN1" predictions and attach the "type4py" probabilities.
type4py = experiments.inferred.load_entire_inferred(
    artifact_root, dataset, tool_name="type4pyN1", task="all"
)
type4py_probas = experiments.probas.load_inferred_with_probablities(
    artifact_root, dataset, tool_name="type4py", task="all", inferred=type4py
)

# Blank out every annotation that comes without a probability.
anno_no_prob = type4py_probas["anno"].notna() & type4py_probas["probability"].isna()
type4py_probas.loc[anno_no_prob, "anno"] = pd.NA

In [None]:
# Load the "typilusN1" predictions and attach the "typilus" probabilities.
typilus = experiments.inferred.load_entire_inferred(
    artifact_root, dataset, tool_name="typilusN1", task="all"
)
typilus_probas = experiments.probas.load_inferred_with_probablities(
    artifact_root, dataset, tool_name="typilus", task="all", inferred=typilus
)

# Blank out every annotation that comes without a probability.
anno_no_prob = typilus_probas["anno"].notna() & typilus_probas["probability"].isna()
typilus_probas.loc[anno_no_prob, "anno"] = pd.NA

In [None]:
# Load the "TypeT5TopN1" predictions and attach the "typet5" probabilities.
typet5 = experiments.inferred.load_entire_inferred(
    artifact_root,
    dataset,
    tool_name="TypeT5TopN1",
    task="all",
)
typet5_probas = experiments.probas.load_inferred_with_probablities(
    artifact_root,
    dataset,
    tool_name="typet5",
    task="all",
    inferred=typet5,
)

In [None]:
# Column dropped from every tool's frame before the tools are aligned.
ignore = ["topn"]

# Per-tool column names as they appear after alignment (type4py, typilus, typet5).
prediction_annos = [f"anno_{tool}" for tool in ("type4py", "typilus", "typet5")]
methods = [f"method_{tool}" for tool in ("type4py", "typilus", "typet5")]
probs = [f"probability_{tool}" for tool in ("type4py", "typilus", "typet5")]

In [None]:
# Convert each tool's predictions with typet5_adjusted_form.
type4py_adjusted = experiments.inferred.typet5_adjusted_form(type4py_probas)
typilus_adjusted = experiments.inferred.typet5_adjusted_form(typilus_probas)
typet5_adjusted = experiments.inferred.typet5_adjusted_form(typet5_probas)

# Step 1: outer-join type4py with typilus; overlapping non-key columns get a
# tool suffix.
type4py_typilus_adjusted = type4py_adjusted.drop(columns=ignore).merge(
    typilus_adjusted.drop(columns=ignore),
    how="outer",
    on=["repository", "category", "file", "qname", "qname_ssa"],
    suffixes=("_type4py", "_typilus"),
)

# Step 2: give the typet5 columns their tool suffix by hand and join them in.
typet5_adjusted_renamed = typet5_adjusted.drop(columns=ignore).rename(
    columns={
        "anno": "anno_typet5",
        "probability": "probability_typet5",
        "method": "method_typet5",
    }
)
aligned_adjusted = type4py_typilus_adjusted.merge(
    typet5_adjusted_renamed,
    how="outer",
    on=["repository", "category", "file", "qname", "qname_ssa"],
)

aligned_adjusted.info()

In [None]:
# Convert each tool's predictions with typet5_base_form.
type4py_base = experiments.inferred.typet5_base_form(type4py_probas)
typilus_base = experiments.inferred.typet5_base_form(typilus_probas)
typet5_base = experiments.inferred.typet5_base_form(typet5_probas)

# Step 1: outer-join type4py with typilus; overlapping non-key columns get a
# tool suffix.
type4py_typilus_base = type4py_base.drop(columns=ignore).merge(
    typilus_base.drop(columns=ignore),
    how="outer",
    on=["repository", "category", "file", "qname", "qname_ssa"],
    suffixes=("_type4py", "_typilus"),
)

# Step 2: give the typet5 columns their tool suffix by hand and join them in.
typet5_base_renamed = typet5_base.drop(columns=ignore).rename(
    columns={
        "anno": "anno_typet5",
        "probability": "probability_typet5",
        "method": "method_typet5",
    }
)
aligned_base = type4py_typilus_base.merge(
    typet5_base_renamed,
    how="outer",
    on=["repository", "category", "file", "qname", "qname_ssa"],
)

aligned_base.info()

In [None]:
from scripts.common.schemas import ExtendedTypeCollectionSchema

In [None]:
# Reload so that edits to experiments.inferred are picked up.
importlib.reload(experiments.inferred)
joined_adjusted = experiments.inferred.join_truth_to_preds(
    truth=groundtruth,
    predictions=aligned_adjusted,
    comparable_anno=ExtendedTypeCollectionSchema.adjusted_anno,
    prediction_annos=prediction_annos,
)

# Replace the "<MISSING>" placeholder with pd.NA in the evaluatable frame.
adjusted_eval = experiments.inferred.evaluatable(
    joined_adjusted, clean_annos=prediction_annos
).replace("<MISSING>", pd.NA)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None".
adjusted_eval.info()
adjusted_eval

In [None]:
joined_base = experiments.inferred.join_truth_to_preds(
    truth=groundtruth,
    predictions=aligned_base,
    comparable_anno=ExtendedTypeCollectionSchema.base_anno,
    # Same columns as for the adjusted form; reuse the shared list instead of
    # repeating the literal.
    prediction_annos=prediction_annos,
)

# Replace the "<MISSING>" placeholder with pd.NA in the evaluatable frame.
base_eval = experiments.inferred.evaluatable(
    joined_base, clean_annos=prediction_annos
).replace("<MISSING>", pd.NA)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None".
base_eval.info()
display(base_eval)

# Aggregation

In [None]:
# Prediction columns of the adjusted evaluation frame.
adjusted_predictions = adjusted_eval[prediction_annos]
display(adjusted_predictions)

# Matching probability columns.
adjusted_probabilities = adjusted_eval[probs]
display(adjusted_probabilities)

In [None]:
import experiments.predictions

def _tool_performance(eval_df, tool):
    """Expose ``anno_<tool>`` as ``anno`` and pass the frame to
    ``experiments.predictions.performance(..., total=True)``."""
    as_single_tool = eval_df.rename(columns={f"anno_{tool}": "anno"})
    return experiments.predictions.performance(as_single_tool, total=True)


# Adjusted form
typilus_adjusted_accuracy = _tool_performance(adjusted_eval, "typilus")
type4py_adjusted_accuracy = _tool_performance(adjusted_eval, "type4py")
typet5_adjusted_accuracy = _tool_performance(adjusted_eval, "typet5")

# Base form
typilus_base_accuracy = _tool_performance(base_eval, "typilus")
type4py_base_accuracy = _tool_performance(base_eval, "type4py")
typet5_base_accuracy = _tool_performance(base_eval, "typet5")

In [None]:
# Column shown once for all models in the comparison tables.
common = ["observations"]
# Columns shown separately for every model.
model_specific = ["predictions", "stracc", "relacc"]

# The shared column is taken from the typilus result on the adjusted form.
ground_truth_columns = typilus_adjusted_accuracy.loc[:, common]

In [None]:
# Side-by-side table: shared column first, then one column group per model.
adjusted_baseline = pd.concat(
    [
        ground_truth_columns,
        typilus_adjusted_accuracy[model_specific],
        type4py_adjusted_accuracy[model_specific],
        typet5_adjusted_accuracy[model_specific],
    ],
    keys=["groundtruth", "typilus", "type4py", "typet5"],
    axis=1,
)
display(adjusted_baseline)

# NOTE(review): column_format declares two columns per model while
# model_specific lists three — confirm the table still compiles in LaTeX.
adjusted_baseline_latex = adjusted_baseline.to_latex(
    column_format="|l|c|cc|cc|cc|",
    float_format="%.2f",
    label="tbl:adjusted_ml_perf",
    caption=("Performance of ML models upon adjusted canonical form"),
)
print(adjusted_baseline_latex)

In [None]:
# Side-by-side table for the base form: shared column first, then one column
# group per model.
display(base_baseline := pd.concat(
    [ground_truth_columns, typilus_base_accuracy[model_specific], type4py_base_accuracy[model_specific], typet5_base_accuracy[model_specific]],
    keys=["groundtruth", "typilus", "type4py", "typet5"],
    axis=1
))

# Render the *base* table. The previous version exported adjusted_baseline
# under the base label/caption (copy-paste slip).
print(base_baseline.to_latex(
    label="tbl:base_ml_perf",
    caption=("Performance of ML models upon base canonical form")
))

In [None]:
# Full (unfiltered) performance frames of all three models on the base form,
# grouped by model name.
baseline_base = pd.concat(
    [typilus_base_accuracy, type4py_base_accuracy, typet5_base_accuracy],
    keys=["typilus", "type4py", "typet5"],
    axis=1,
)
display(baseline_base)

# Baseline majority vote

In [None]:
!pip install scikit-learn

In [None]:
import experiments.predictions
import pandas as pd

def hard_majority(df: pd.DataFrame) -> pd.DataFrame:
    """Score a hard (unweighted) majority vote over the model predictions.

    For every row the most frequent value among the ``prediction_annos``
    columns is taken as the ensemble's ``anno``; the frame extended by that
    column is handed to ``experiments.predictions.performance(..., total=True)``
    and its result is returned.
    """
    votes = df[prediction_annos]

    # mode() yields one column per tied mode; column 0 is the first of them.
    winners = votes.mode(axis="columns")[0].rename("anno")

    with_vote = pd.merge(
        left=df,
        right=winners,
        left_index=True,
        right_index=True,
    )
    return experiments.predictions.performance(with_vote, total=True)

# Hard-majority performance on both canonical forms.
adjusted_hard_majority = hard_majority(adjusted_eval)
display(adjusted_hard_majority)

base_hard_majority = hard_majority(base_eval)
display(base_hard_majority)

In [None]:
import collections

def soft_majority_vote(df: pd.DataFrame) -> pd.DataFrame:
    """Score a probability-weighted majority vote over the model predictions.

    Each model votes for its prediction with its probability as weight; the
    prediction with the largest summed weight becomes the row's ``anno``. The
    frame extended by that column is handed to
    ``experiments.predictions.performance(..., total=True)``.

    Side effect: the ``probs`` columns of ``df`` are zero-filled IN PLACE, so
    the caller's frame is modified.
    """
    def soft_majority(row) -> str:
        tally = {}
        for pred_col, prob_col in zip(prediction_annos, probs):
            label = row[pred_col]
            tally[label] = tally.get(label, 0.0) + row[prob_col]

        # Ties resolve to the label that was inserted first.
        return max(tally, key=tally.get)

    # Missing probabilities count as zero weight (mutates the caller's frame).
    df[probs] = df[probs].fillna(0)
    winners = df.apply(soft_majority, axis=1).rename("anno")

    with_vote = pd.merge(
        left=df,
        right=winners,
        left_index=True,
        right_index=True,
    )
    return experiments.predictions.performance(with_vote, total=True)

# Soft-majority performance on both canonical forms.
# Note: soft_majority_vote zero-fills the probability columns of the frame it
# is given, so adjusted_eval / base_eval are modified by these calls.
adjusted_soft_majority = soft_majority_vote(adjusted_eval)
display(adjusted_soft_majority)

base_soft_majority = soft_majority_vote(base_eval)
display(base_soft_majority)

In [None]:
# Columns dropped from the per-procedure voting results before they are shown
# side by side in the tables below.
identical_cols = ["predictions", "observations", "unassigned"]

In [None]:
# Shared statistics, taken from the hard-majority result.
pred_occ = adjusted_hard_majority[["predictions", "observations"]]

# Procedure-specific columns only.
disp_adj_hm = adjusted_hard_majority.drop(columns=identical_cols)
disp_adj_sm = adjusted_soft_majority.drop(columns=identical_cols)

disp_adj_voting = pd.concat(
    [pred_occ, disp_adj_hm, disp_adj_sm],
    keys=["Statistics", "Hard Majority", "Soft Majority"],
    axis=1,
)
display(disp_adj_voting)

# No LaTeX label is set for this table.
disp_adj_voting_latex = disp_adj_voting.to_latex(
    float_format="%.2f",
    caption=("Performance of Voting Procedures upon adjusted canonical form"),
)
print(disp_adj_voting_latex)


In [None]:
# Shared statistics, taken from the hard-majority result on the base form.
# (This rebinds pred_occ, which previously held the adjusted statistics.)
pred_occ = base_hard_majority[["predictions", "observations"]]

# Procedure-specific columns only.
disp_base_hm = base_hard_majority.drop(columns=identical_cols)
disp_base_sm = base_soft_majority.drop(columns=identical_cols)

disp_base_voting = pd.concat(
    [pred_occ, disp_base_hm, disp_base_sm],
    keys=["Statistics", "Hard Majority", "Soft Majority"],
    axis=1,
)
display(disp_base_voting)

disp_base_voting_latex = disp_base_voting.to_latex()
print(disp_base_voting_latex)

In [None]:
# Combined voting table: shared statistics, then adjusted/base results for
# each voting procedure (without the "matches" column).
display(disp_voting := pd.concat(
    [
        pd.concat([pred_occ], keys=["Statistics"], axis=1),
        pd.concat([disp_adj_hm.drop(columns="matches"), disp_base_hm.drop(columns="matches")], keys=["adjusted", "base"], axis=1),
        pd.concat([disp_adj_sm.drop(columns="matches"), disp_base_sm.drop(columns="matches")], keys=["adjusted", "base"], axis=1),
    ],
    keys=["", "Hard Majority", "Soft Majority"], 
    axis=1
))

print(disp_voting.to_latex(
    float_format="%.2f",
    label="tbl:voting_ml_perf",
    # Raw string: "\&" is not a valid Python escape sequence and triggers a
    # warning in a normal string; the resulting text is unchanged.
    caption=(r"Performance of Voting Procedures upon adjusted \& base canonical forms")
))

# Ensemble Learning

# Basheer

In [None]:
import abc, dataclasses, enum

@dataclasses.dataclass(frozen=True)
class BaheerMetadata:
    """Key describing which item a prediction belongs to.

    Frozen (and therefore hashable), because instances are used as dictionary
    keys for the third agent's opinions.
    """
    # Values are filled from the equally named DataFrame columns in `baheer`
    # ("category" for `category`).
    repository: str
    file: str
    category: TypeCollectionCategory
    qname_ssa: str


@dataclasses.dataclass(frozen=True)
class BaheerOpinion:
    """One agent's prediction together with the probability it assigns to it."""
    # May also hold a missing-value placeholder (pd.NA or "<MISSING>").
    prediction: str
    # Compared against the agent's threshold to derive its Confidence.
    probability: float


@dataclasses.dataclass(frozen=True)
class BaheerResult:
    """Outcome of applying one conflict-resolution strategy to one item."""
    # Name of the applied strategy: "submit", "force", "delegate", "ignore"
    # or "negotiate".
    strategy: str
    # Tag of the agent whose opinion was adopted ("IGNORED" for Ignoring).
    agent_tag: str
    # The item the decision was made for.
    metadata: BaheerMetadata
    # The adopted opinion.
    opinion: BaheerOpinion


class Confidence(enum.IntEnum):
    """Two-level confidence classification of a probability."""
    High = enum.auto()
    Low = enum.auto()

    @staticmethod
    def make(prob: float, threshold: float) -> "Confidence":
        """Return ``High`` when ``prob`` reaches ``threshold``, else ``Low``.

        The comparison is inclusive (``prob >= threshold``); a NaN probability
        compares false and therefore yields ``Low``.
        """
        return Confidence.High if prob >= threshold else Confidence.Low

class Strategy(abc.ABC):
    """Base class of the conflict-resolution strategies between two agents.

    Stores the tags of both agents; subclasses decide in ``apply`` whose
    opinion ends up in the result.
    """
    def __init__(self, agent1_tag: str, agent2_tag: str) -> None:
        self.agent1_tag = agent1_tag
        self.agent2_tag = agent2_tag

    # Marked abstract (like BaheerResolver.resolve) so that the bare base
    # class cannot be instantiated and silently return None from apply().
    @abc.abstractmethod
    def apply(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        """Resolve the two opinions on ``metadata`` into a single result.

        Args:
            metadata: the item both opinions refer to.
            agent1: opinion of the agent tagged ``self.agent1_tag``.
            agent2: opinion of the agent tagged ``self.agent2_tag``.

        Returns:
            The result naming the strategy, the chosen agent and its opinion.
        """
        ...


class BaheerResolver(abc.ABC):
    """Base class of the resolvers that pick a strategy per item.

    Holds the tags of the two competing agents and the probability threshold
    of each, which subclasses use to classify an opinion's Confidence.
    """
    def __init__(self, agent1_tag: str, agent1_threshold: float, agent2_tag: str, agent2_threshold: float) -> None:
        self.agent1_tag = agent1_tag
        self.agent2_tag = agent2_tag

        self.agent1_threshold = agent1_threshold
        self.agent2_threshold = agent2_threshold

    # Signature aligned with the concrete resolvers and with the call in
    # `baheer`, which pass a metadata object and two opinions rather than four
    # loose prediction/probability values.
    @abc.abstractmethod
    def resolve(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        """Choose and apply a strategy for the two opinions on ``metadata``.

        Args:
            metadata: the item both opinions refer to.
            agent1: opinion of the first agent.
            agent2: opinion of the second agent.

        Returns:
            The result produced by the selected strategy.
        """
        ...

In [None]:
class Submitting(Strategy):
    """Agent 1 yields to Agent 2: the result carries Agent 2's tag and opinion."""
    def apply(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        winner_tag, winner_opinion = self.agent2_tag, agent2
        return BaheerResult(
            strategy="submit",
            agent_tag=winner_tag,
            metadata=metadata,
            opinion=winner_opinion,
        )

class Forcing(Strategy):
    """Agent 1 overrules Agent 2: the result carries Agent 1's tag and opinion."""
    def apply(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        winner_tag, winner_opinion = self.agent1_tag, agent1
        return BaheerResult(
            strategy="force",
            agent_tag=winner_tag,
            metadata=metadata,
            opinion=winner_opinion,
        )

class Delegation(Strategy):
    """Hand the decision to a third agent.

    The third agent's opinions are looked up by metadata; when it has no
    opinion on an item, a placeholder opinion (``pd.NA`` with probability
    ``-inf``) is returned instead.
    """
    def __init__(self, agent1_tag: str, agent2_tag: str, agent3_tag: str, agent3_opinions: dict[BaheerMetadata, BaheerOpinion]) -> None:
        super().__init__(agent1_tag, agent2_tag)
        self.agent3_tag = agent3_tag
        self.agent3_opinions = agent3_opinions

    def apply(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        """Return the third agent's opinion on ``metadata`` (agent1/agent2 are not consulted)."""
        # The fallback must be a BaheerOpinion as well: the earlier tuple
        # fallback, unpacked positionally into BaheerResult, clashed with the
        # keyword arguments and never supplied `metadata`.
        no_opinion = BaheerOpinion(prediction=pd.NA, probability=float("-inf"))
        opinion = self.agent3_opinions.get(metadata, no_opinion)
        return BaheerResult(
            strategy="delegate",
            agent_tag=self.agent3_tag,
            metadata=metadata,
            opinion=opinion,
        )
    

class Ignoring(Strategy):
    """Adopt neither opinion: the result is tagged "IGNORED" and carries a
    placeholder opinion (``pd.NA`` with probability ``-inf``)."""
    def apply(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        placeholder = BaheerOpinion(prediction=pd.NA, probability=float("-inf"))
        return BaheerResult(
            strategy="ignore",
            agent_tag="IGNORED",
            metadata=metadata,
            opinion=placeholder,
        )

class Negotiation(Strategy):
    """Adopt the opinion with the strictly higher probability; on a tie
    Agent 2's opinion is used."""
    def apply(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        if agent1.probability > agent2.probability:
            winner_tag, winner_opinion = self.agent1_tag, agent1
        else:
            winner_tag, winner_opinion = self.agent2_tag, agent2
        return BaheerResult(
            strategy="negotiate",
            agent_tag=winner_tag,
            metadata=metadata,
            opinion=winner_opinion,
        )

In [None]:
class BaheerStrongConflictResolver(BaheerResolver):
    def __init__(self, agent1_tag: str, agent1_threshold: float, str, agent2_tag: str, agent2_threshold: float, agent3_opinions: dict[BaheerMetadata, BaheerOpinion]) -> None:
        super().__init__(agent1_tag, agent1_threshold, agent2_tag, agent2_threshold)
        self.agent3_opinions = agent3_opinions

    def resolve(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion, agent3: pd.DataFrame) -> BaheerResult:
        match Confidence.make(agent1.probability, self.agent1_threshold), Confidence.make(agent2.probability, self.agent2_threshold):
            case (Confidence.High, Confidence.High) | (Confidence.Low, Confidence.Low):
                return Delegation(self.agent1_tag, self.agent2_tag, self.agent3_opinions).apply(metadata, agent1, agent2)

            case (Confidence.High, Confidence.Low):
                return Forcing(self.agent1_tag, self.agent2_tag).apply(metadata, agent1, agent2)
        
            case (Confidence.Low, Confidence.High):
                return Forcing(self.agent2_tag, self.agent1_tag).apply(metadata, agent2, agent1)
        

In [None]:
class BaheerWeakConflictResolver(BaheerResolver):
    def resolve(self, metadata: BaheerMetadata, agent1: BaheerOpinion, agent2: BaheerOpinion) -> BaheerResult:
        match Confidence.make(agent1.probability, self.agent1_threshold), Confidence.make(agent2.probability, self.agent2_threshold):
            case (Confidence.Low, Confidence.Low):
                return Ignoring(self.agent1_tag, self.agent2_tag).apply(metadata, agent1, agent2)

            case (Confidence.High, Confidence.High):
                return Negotiation(self.agent1_tag, self.agent2_tag).apply(metadata, agent1, agent2)

            case (Confidence.High, Confidence.Low):
                return Submitting(self.agent1_tag, self.agent2_tag).apply(metadata, agent1, agent2)

            case (Confidence.Low, Confidence.High):
                return Submitting(self.agent2_tag, self.agent1_tag).apply(metadata, agent2, agent1)

In [None]:
def baheer(
    agent1: pd.DataFrame, agent1_tag: str, agent1_threshold: float,
    agent2: pd.DataFrame, agent2_tag: str, agent2_threshold: float,
    agent3: pd.DataFrame, agent3_tag: str
) -> pd.DataFrame:
    """Combine the predictions of two agents, with a third as arbiter.

    Args:
        agent1, agent2, agent3: one frame per agent with the key columns
            ``repository, file, category, qname, qname_ssa`` plus ``pred`` and
            ``probability``. All three must have the same number of rows.
            The frames are not modified.
        agent1_tag, agent2_tag, agent3_tag: names recorded in the ``agent``
            column of the result.
        agent1_threshold, agent2_threshold: probability from which an agent's
            opinion counts as confident.

    Returns:
        One row per aligned item with the columns ``repository, file,
        category, qname, qname_ssa, agent, pred, probability, strategy,
        method`` (``method`` is the name of the resolver class used).
    """
    assert agent1.shape[0] == agent2.shape[0] == agent3.shape[0]

    # NOTE(review): despite its name this is the share of rows on which both
    # agents AGREE (==), so the strong-conflict resolver is selected when the
    # agents mostly agree. Confirm whether `!=` was intended; left unchanged
    # because it decides which resolver every run goes through.
    conflict_level = (agent1.pred == agent2.pred).sum() / len(agent1.pred)

    print(conflict_level)

    # Fill the gaps on copies (assign) instead of writing into the caller's frames.
    agent1, agent2 = (
        agent.assign(
            pred=agent.pred.fillna("<MISSING>"),
            probability=agent.probability.fillna(0.0),
        )
        for agent in (agent1, agent2)
    )

    if conflict_level >= 0.5:
        # Opinions of the third agent, keyed by item. All six selected columns
        # are unpacked (the earlier version unpacked only five and referenced
        # an undefined `repository`).
        negotiator: dict[BaheerMetadata, BaheerOpinion] = {
            BaheerMetadata(repository, file, cat, qname_ssa): BaheerOpinion(pred, prob)
            for repository, file, cat, qname_ssa, pred, prob in agent3[["repository", "file", "category", "qname_ssa", "pred", "probability"]].itertuples(index=False)
            if pd.notna(pred)
        }
        resolver = BaheerStrongConflictResolver(agent1_tag, agent1_threshold, agent2_tag, agent2_threshold, agent3_tag, negotiator)
    else:
        resolver = BaheerWeakConflictResolver(agent1_tag, agent1_threshold, agent2_tag, agent2_threshold)

    method = type(resolver).__qualname__

    key_columns = ["repository", "file", "category", "qname", "qname_ssa"]
    aligned = pd.merge(
        left=agent1,
        right=agent2,
        on=key_columns,
        how="outer",
        suffixes=("_agent1", "_agent2")
    )

    opinion_columns = ["pred_agent1", "probability_agent1", "pred_agent2", "probability_agent2"]
    resolutions = []

    for row in aligned[key_columns + opinion_columns].itertuples(index=False):
        repository, file, cat, qname, qname_ssa, agent1_pred, agent1_prob, agent2_pred, agent2_prob = row
        metadata = BaheerMetadata(repository, file, cat, qname_ssa)
        agent1_opinion = BaheerOpinion(agent1_pred, agent1_prob)
        agent2_opinion = BaheerOpinion(agent2_pred, agent2_prob)

        resolution = resolver.resolve(metadata, agent1_opinion, agent2_opinion)
        resolutions.append((
            resolution.metadata.repository, resolution.metadata.file, resolution.metadata.category,
            qname, qname_ssa,
            resolution.agent_tag, resolution.opinion.prediction, resolution.opinion.probability, resolution.strategy
        ))

    df_resolutions = pd.DataFrame(
        resolutions,
        columns=key_columns + ["agent", "pred", "probability", "strategy"],
    ).assign(method=method)
    return df_resolutions
    

In [None]:
# Test basheer implementation
# assert BaheerStrongConflict(agent1_tag="father", agent1_threshold=0.6, agent2_tag="mother"

In [None]:
import itertools, pandas as pd

def baheer_frontend(model_predictions: pd.DataFrame, agent1_tag: str, agent1_threshold: float, agent2_tag: str, agent2_threshold: float, agent3_tag: str) -> pd.DataFrame:
    """Split an aligned prediction frame into per-agent frames and run `baheer`.

    Args:
        model_predictions: frame holding ``gt_anno`` plus, for every agent tag,
            the columns ``anno_<tag>`` and ``probability_<tag>``.
        agent1_tag, agent2_tag, agent3_tag: tags selecting each agent's columns.
        agent1_threshold, agent2_threshold: confidence thresholds forwarded to
            `baheer`.

    Returns:
        The frame produced by `baheer`; it has as many rows as
        ``model_predictions`` (asserted). The return annotation previously
        claimed ``None``.
    """
    # One rename mapping per agent: its own columns become "pred"/"probability".
    mappings = [
        {f"anno_{tag}": "pred", f"probability_{tag}": "probability"}
        for tag in (agent1_tag, agent2_tag, agent3_tag)
    ]
    without_truth = model_predictions.drop(columns=["gt_anno"])

    def agent_view(position: int) -> pd.DataFrame:
        """Frame for the agent at ``position``: own columns renamed, the other agents' columns dropped."""
        foreign_columns = [
            column
            for index, mapping in enumerate(mappings) if index != position
            for column in mapping
        ]
        return without_truth.rename(columns=mappings[position]).drop(columns=foreign_columns)

    agent1, agent2, agent3 = (agent_view(position) for position in range(3))

    b = baheer(
        agent1, agent1_tag, agent1_threshold,
        agent2, agent2_tag, agent2_threshold,
        agent3, agent3_tag
    )
    assert len(b) == len(model_predictions)

    return b

    #experiments.predictions.performance(
    

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.preprocessing import label_binarize, LabelEncoder

# One panel per canonical form, one curve per model.
precision_recall_fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
panels = ((adjusted_eval, ax1), (base_eval, ax2))

for model in ("type4py", "typilus", "typet5"):
    for perf_eval, ax in panels:
        # A prediction is a positive when it equals the ground truth; the
        # model's probability serves as the score.
        is_correct = perf_eval[f"anno_{model}"] == perf_eval["gt_anno"]
        scores = perf_eval[f"probability_{model}"]
        PrecisionRecallDisplay.from_predictions(
            y_true=is_correct,
            y_pred=scores,
            name=model,
            ax=ax,
        )

ax1.set_title("Adjusted")
ax2.set_title("Base")

In [None]:
%%capture

import numpy as np

# Results collected by baheer_experiment(): one tuple per evaluated configuration.
configs = []

# Tags of the models that take the agent roles in the experiment.
agents = ["type4py", "typilus", "typet5"]
# agent_thresholds = dict(zip(agents, [
def baheer_experiment():
    """Evaluate `baheer_frontend` on ``adjusted_eval`` over a grid of setups.

    Iterates over every assignment of the three ``agents`` to the agent roles
    and over threshold pairs drawn (with replacement, unordered) from 21 evenly
    spaced values in [0, 1]. For each setup the performance is computed and
    appended to the global ``configs`` list as
    ``(agent1_tag, agent2_tag, agent3_tag, agent1_threshold, agent2_threshold, performance)``.
    """
    join_keys = ["repository", "file", "category", "qname", "qname_ssa"]
    thresholds = np.linspace(0, 1, 21).tolist()

    for agent1_tag, agent2_tag, agent3_tag in itertools.permutations(agents, r=3):
        for agent1_threshold, agent2_threshold in itertools.combinations_with_replacement(thresholds, r=2):
            resolved = baheer_frontend(
                adjusted_eval,
                agent1_tag, agent1_threshold,
                agent2_tag, agent2_threshold,
                agent3_tag,
            )
            adjusted_basheer = resolved.rename(columns={"pred": "anno"})
            assert len(adjusted_basheer) == len(adjusted_eval)

            # Re-attach the evaluation columns to the ensemble's decisions.
            perf = pd.merge(left=adjusted_basheer, right=adjusted_eval, on=join_keys)

            p = experiments.predictions.performance(perf)
            configs.append((agent1_tag, agent2_tag, agent3_tag, agent1_threshold, agent2_threshold, p))

            print("\n\n")

In [None]:
if False:
    # Disabled: tabulate the collected experiment configurations, best first.
    # NOTE(review): `configs` stays empty unless baheer_experiment() is called.
    config_columns = ["agent1", "agent2", "agent3", "agent1_threshold", "agent2_threshold", "performance"]
    config_df = pd.DataFrame(configs, columns=config_columns)
    config_ranked = config_df.sort_values(by=["performance"], ascending=False)
    display(config_ranked)