In [5]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm

# ---------------------------------------------------
# 🔹 1. Lade und kombiniere die Daten
# ---------------------------------------------------
dfs = []

# Dateipfade zu deinen drei CSV-Dateien
paths = {
    "zeroshot": "/home/phisch/multimodal/test_results/model_scores_zero-shot.csv",
    "linear_probe": "/home/phisch/multimodal/test_results/model_scores_linear_probe.csv",
    "retrieval": "/home/phisch/multimodal/test_results/model_scores_retrieval.csv"
}

# Funktion zum Einlesen
def load_and_filter(path, task_name, method_note, frac):
    df = pd.read_csv(path)
    df = df[(df["method_notes"] == method_note) & (df["dataset_fraction"] == frac)]
    df = df[df["metric"] == "Top1Accuracy"]
    df["task"] = task_name
    return df[["model_name", "dataset", "score", "task"]].rename(columns={"model_name": "model", "score": "accuracy"})

# Lade alle drei Teilmengen
dfs.append(load_and_filter(paths["zeroshot"], "zeroshot", "1_templates", "1-aug"))
dfs.append(load_and_filter(paths["linear_probe"], "linear_probe", "last_image_layer", "1-aug"))

# Retrieval braucht Sonderbehandlung
retrieval = pd.read_csv(paths["retrieval"])
retrieval = retrieval[retrieval["model"].str.contains("1-aug")]
retrieval_subset = retrieval[["model", "dataset", "text_retrieval_recall@1"]].copy()
retrieval_subset["accuracy"] = retrieval_subset["text_retrieval_recall@1"]
retrieval_subset["task"] = "retrieval"
retrieval_subset = retrieval_subset.rename(columns={"model": "model", "dataset": "dataset"})
dfs.append(retrieval_subset[["model", "dataset", "accuracy", "task"]])

# Verbinde alle Aufgaben
df_all = pd.concat(dfs, ignore_index=True)

In [8]:
df_all

Unnamed: 0,model,dataset,accuracy,task,clip,itm,simclr,mlm
0,SimCLR,ImageNet,0.001280,zeroshot,False,False,True,False
1,SimCLR,Caltech101,0.007492,zeroshot,False,False,True,False
2,SimCLR,Caltech256,0.002491,zeroshot,False,False,True,False
3,SimCLR,CIFAR10,0.100000,zeroshot,False,False,True,False
4,SimCLR,CIFAR100,0.013140,zeroshot,False,False,True,False
...,...,...,...,...,...,...,...,...
359,SimCLR-ITM_1-aug,mscoco_captions,0.000200,retrieval,False,True,True,False
360,ITM-MLM_1-aug,flickr30k,0.000000,retrieval,False,True,False,True
361,CLIP-ITM_1-aug,flickr30k,0.436000,retrieval,True,True,False,False
362,SimCLR-ITM-MLM_1-aug,mscoco_captions,0.000400,retrieval,False,True,True,True


In [7]:
# ---------------------------------------------------
# 🔹 2. Extrahiere Loss-Komponenten
# ---------------------------------------------------
def extract_losses(model_name):
    losses = {
        "clip": "clip" in model_name.lower(),
        "itm": "itm" in model_name.lower(),
        "simclr": "simclr" in model_name.lower(),
        "mlm": "mlm" in model_name.lower()
    }
    return pd.Series(losses)

loss_features = df_all["model"].apply(extract_losses)
df_all = pd.concat([df_all, loss_features], axis=1)

In [10]:

# ---------------------------------------------------
# 🔹 3. Modelliere mit Mixed Effects
# ---------------------------------------------------
# Konvertiere zu Kategorien
df_all["dataset"] = df_all["dataset"].astype("category")
df_all["task"] = df_all["task"].astype("category")

# Modellformel: Accuracy ~ SimCLR + ITM + MLM + (1 | Dataset) + (1 | Task)
formula = "accuracy ~ clip + itm + simclr + mlm"
model = mixedlm(formula, data=df_all, groups=df_all["dataset"])
result = model.fit()

In [11]:
# ---------------------------------------------------
# 🔹 4. Ergebnisse anzeigen
# ---------------------------------------------------
print(result.summary())

          Mixed Linear Model Regression Results
Model:             MixedLM  Dependent Variable:  accuracy
No. Observations:  364      Method:              REML    
No. Groups:        17       Scale:               0.0461  
Min. group size:   13       Log-Likelihood:      10.2672 
Max. group size:   26       Converged:           Yes     
Mean group size:   21.4                                  
---------------------------------------------------------
               Coef.  Std.Err.   z    P>|z| [0.025 0.975]
---------------------------------------------------------
Intercept       0.196    0.053  3.657 0.000  0.091  0.300
clip[T.True]    0.240    0.025  9.740 0.000  0.192  0.289
itm[T.True]    -0.006    0.023 -0.260 0.795 -0.051  0.039
simclr[T.True]  0.008    0.025  0.311 0.756 -0.041  0.056
mlm[T.True]     0.016    0.023  0.694 0.488 -0.029  0.061
Group Var       0.028    0.051                           



In [12]:
import pandas as pd
import patsy
from statsmodels.regression.mixed_linear_model import MixedLM
import itertools

# --- Hilfsfunktion: Loss-Namen extrahieren ---
def extract_losses(model_name):
    model_name = model_name.lower()
    return {
        "clip": int("clip" in model_name),
        "itm": int("itm" in model_name),
        "simclr": int("simclr" in model_name),
        "mlm": int("mlm" in model_name),
    }


# --- CSVs laden ---
df_zero = pd.read_csv("/home/phisch/multimodal/test_results/model_scores_zero-shot.csv")
df_linear = pd.read_csv("/home/phisch/multimodal/test_results/model_scores_linear_probe.csv")
df_retrieval = pd.read_csv("/home/phisch/multimodal/test_results/model_scores_retrieval.csv")

# --- Filter anwenden ---
df_zero = df_zero[(df_zero["method_notes"] == "1_templates") & (df_zero["dataset_fraction"] == "1-aug")]
df_linear = df_linear[(df_linear["method_notes"] == "last_image_layer") & (df_linear["dataset_fraction"] == "1-aug")]
df_retrieval = df_retrieval[df_retrieval["model"].str.contains("1-aug")]

# --- Relevante Spalten umbenennen ---
df_zero["task"] = "zero_shot"
df_linear["task"] = "linear_probe"
df_retrieval["task"] = "retrieval"

df_zero["accuracy"] = df_zero["score"]
df_linear["accuracy"] = df_linear["score"]
df_retrieval["accuracy"] = df_retrieval["text_retrieval_recall@1"]

df_zero["model_str"] = df_zero["model_name"]
df_linear["model_str"] = df_linear["model_name"]
df_retrieval["model_str"] = df_retrieval["model"]

df_zero["dataset"] = df_zero["dataset"].astype(str)
df_linear["dataset"] = df_linear["dataset"].astype(str)
df_retrieval["dataset"] = df_retrieval["dataset"].astype(str)

# --- Nur notwendige Spalten behalten ---
columns_needed = ["accuracy", "task", "dataset", "model_str"]
df_all = pd.concat([df_zero[columns_needed], df_linear[columns_needed], df_retrieval[columns_needed]])

# --- Loss Features hinzufügen ---
loss_columns = ["clip", "itm", "simclr", "mlm"]
loss_df = df_all["model_str"].apply(extract_losses).apply(pd.Series)
df_all = pd.concat([df_all, loss_df], axis=1)

# --- Interaktionen, die in den Daten vorkommen ---
def get_existing_interactions(df, loss_names):
    interactions = []
    for i in range(2, len(loss_names)+1):
        for combo in itertools.combinations(loss_names, i):
            if (df[list(combo)].sum(axis=1) == len(combo)).any():
                interactions.append(":".join(combo))
    return interactions

# Interaktionen generieren
interactions = get_existing_interactions(df_all, loss_columns)
main_effects = loss_columns
formula = "accuracy ~ " + " + ".join(main_effects + interactions)

print("Verwendete Formel:", formula)

Verwendete Formel: accuracy ~ clip + itm + simclr + mlm + clip:itm + clip:simclr + clip:mlm + itm:simclr + itm:mlm + simclr:mlm + clip:itm:simclr + clip:itm:mlm + clip:simclr:mlm + itm:simclr:mlm + clip:itm:simclr:mlm


In [13]:
# --- Modell anpassen ---
model = MixedLM.from_formula(formula, groups=df_all["dataset"], data=df_all)
result = model.fit()
print(result.summary())

LinAlgError: Singular matrix

In [14]:
print(df_all['dataset'].value_counts())

dataset
OxfordIIITPet        78
Caltech256           78
CIFAR10              78
CIFAR100             78
DTD                  78
FGVCAircraft         78
Food101              78
Caltech101           78
Places365            78
StanfordCars         78
STL10                78
ImageNet-100-0.01    39
ImageNet             39
ImageNet-100-0.1     39
mscoco_captions      13
flickr8k             13
flickr30k            13
Name: count, dtype: int64


In [18]:
df_all.describe()

Unnamed: 0,accuracy,clip,itm,simclr,mlm
count,1014.0,1014.0,1014.0,1014.0,1014.0
mean,0.471504,0.615385,0.538462,0.615385,0.538462
std,0.336467,0.486744,0.498765,0.486744,0.498765
min,0.0,0.0,0.0,0.0,0.0
25%,0.138505,0.0,0.0,0.0,0.0
50%,0.465088,1.0,1.0,1.0,1.0
75%,0.787975,1.0,1.0,1.0,1.0
max,0.998875,1.0,1.0,1.0,1.0


In [15]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.heatmap(df_all[['probe_acc', 'retrieval_acc', 'zeroshot_acc']].corr(), annot=True)
plt.show()


KeyError: "None of [Index(['probe_acc', 'retrieval_acc', 'zeroshot_acc'], dtype='object')] are in the [columns]"