In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

# 1. Load the spreadsheet
df = pd.read_excel("wyniki_obrazow2.xlsx")          # 1,500 records

# 2. utworzenie kategorii + rzeczywisty typ zdjęcia
def sprawdz_kategorie(fname, y):
    """Build the stratification key '<category>_<label>' for one image.

    The category is the text between the first and the second underscore of
    the file name, lower-cased. The label ``y`` is appended verbatim, so with
    0/1 labels the result looks like 'animal_1' or 'food_0'.

    Args:
        fname: Image file name of the form '<prefix>_<category>_<rest>'.
        y: Class label appended after the category.

    Returns:
        The key as a string, e.g. 'animal_1'.

    Raises:
        ValueError: If ``fname`` does not contain a non-empty prefix and a
            non-empty category, each followed by an underscore.
    """
    dopasowanie = re.match(r"^[^_]+_([^_]+)_", fname)
    if dopasowanie is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(
            f"Nazwa pliku {fname!r} nie pasuje do wzorca '<prefiks>_<kategoria>_...'"
        )
    cat = dopasowanie.group(1).lower()
    return f"{cat}_{y}"

# Stratification key for every row, built from the file name and the czy_ai label
strat_key = df.apply(lambda r: sprawdz_kategorie(r["nazwa_obrazu"], r["czy_ai"]), axis=1)

# 3. 60% - training set / 40% - held out for the second split
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.40, random_state=42)
train_idx, temp_idx = next(split1.split(df, strat_key))

train = df.iloc[train_idx].reset_index(drop=True)
temp  = df.iloc[temp_idx].reset_index(drop=True)
# Same positional selection and index reset as `temp`, so the key stays aligned with it
temp_key = strat_key.iloc[temp_idx].reset_index(drop=True)

# 4. the remaining 40%  ->  20% - validation set / 20% - test set
split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
val_idx, test_idx = next(split2.split(temp, temp_key))

validation = temp.iloc[val_idx].reset_index(drop=True)
test       = temp.iloc[test_idx].reset_index(drop=True)

print(len(train), len(validation), len(test))   # 900 300 300

900 300 300


In [2]:
def weryfikacja_podzialu(name: str, frame: pd.DataFrame) -> pd.DataFrame:
    """
    Count images per category and class (real / AI) in one data split.

    The category is taken from the 'nazwa_obrazu' column (text between the
    first and second underscore, lower-cased). The resulting table has one
    row per category, the columns 'real' (czy_ai == 0) and 'AI' (czy_ai == 1)
    and a total column. The table is printed with a header containing
    ``name`` and the number of rows of ``frame``, displayed, and returned.
    """
    # Category of every image, derived from its file name.
    kategorie_obrazow = (
        frame["nazwa_obrazu"].str.extract(r"^[^_]+_([^_]+)_")[0].str.lower()
    )
    z_kategoria = frame.assign(kategoria=kategorie_obrazow)

    # Cross-tabulate category x label by counting file names.
    liczebnosci = z_kategoria.pivot_table(
        index="kategoria",
        columns="czy_ai",
        values="nazwa_obrazu",
        aggfunc="count",
    )
    out = liczebnosci.rename(columns={0: "real", 1: "AI"})
    out = out.fillna(0).astype(int)  # empty cells mean "no such images"
    out["Łącznie"] = out["AI"] + out["real"]

    print(f"\n=== {name.upper()} ({len(frame)} obrazów) ===")
    display(out)  # rich table rendering inside Jupyter
    return out

# Per-split count tables (category x real/AI)
train_tbl = weryfikacja_podzialu("Zbiór treningowy", train)
val_tbl   = weryfikacja_podzialu("Zbiór walidacyjny", validation)
test_tbl  = weryfikacja_podzialu("Zbiór testowy", test)

# Sanity check: element-wise sum of the three per-split tables
total_tbl = train_tbl.add(val_tbl, fill_value=0).add(test_tbl, fill_value=0)
print("\n=== SUMA GLOBALNA ===")
display(total_tbl)


=== ZBIÓR TRENINGOWY (900 obrazów) ===


czy_ai,real,AI,Łącznie
kategoria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
animal,150,150,300
building,150,150,300
food,150,150,300



=== ZBIÓR WALIDACYJNY (300 obrazów) ===


czy_ai,real,AI,Łącznie
kategoria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
animal,50,50,100
building,50,50,100
food,50,50,100



=== ZBIÓR TESTOWY (300 obrazów) ===


czy_ai,real,AI,Łącznie
kategoria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
animal,50,50,100
building,50,50,100
food,50,50,100



=== SUMA GLOBALNA ===


czy_ai,real,AI,Łącznie
kategoria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
animal,250,250,500
building,250,250,500
food,250,250,500


### 2. WAGI GLOBALNE KAŻDEGO Z MODELI BAZOWYCH

In [3]:
# Model dictionary: display name -> score column in the data
modele = {
    "AI Image Detector":  "ai_image_detector",
    "The Detector AI":    "the_detector_ai",
    "Ai Detect Content":  "ai_detect_content"
}

# Candidate decision thresholds to test
# NOTE(review): np.arange with a float step may return values carrying
# floating-point noise rather than exact multiples of 0.05 - confirm this is acceptable.
thresholds = np.arange(0.3, 0.71, 0.05)

# For each model, find the best threshold on the validation set
best = {}
for name, col in modele.items():
    # Fallback if no threshold reaches F1 > 0: T = 0.5 with F1 = 0
    best_T, best_f1 = 0.5, 0
    for T in thresholds:
        preds = (validation[col] >= T).astype(int)
        f1 = f1_score(validation["czy_ai"], preds)
        # Strict '>' keeps the lowest threshold among ties (thresholds ascend)
        if f1 > best_f1:
            best_f1, best_T = f1, T
    best[name] = {"T": best_T, "F1": best_f1}

# Global weights proportional to each model's best F1, normalised by their sum
f1_vals = np.array([best[name]["F1"] for name in modele])
w_global = f1_vals / f1_vals.sum()

# Summary table
table = pd.DataFrame({
    "Model bazowy": list(modele.keys()),
    "Best T":       [best[name]["T"] for name in modele],
    "F1_score":     [round(best[name]["F1"],3) for name in modele],
    "w_i":          [round(w,3) for w in w_global]
})
table

Unnamed: 0,Model bazowy,Best T,F1_score,w_i
0,AI Image Detector,0.3,0.619,0.309
1,The Detector AI,0.7,0.732,0.365
2,Ai Detect Content,0.3,0.652,0.326


### 3. WAGI LOKALNE KAŻDEJ KATEGORII DLA KAŻDEGO Z MODELI BAZOWYCH

In [4]:
# List of image categories
kategorie = ["building", "animal", "food"]

# Extract the image category from the file name (text between the first and
# second underscore, lower-cased). This adds a column to `validation` in place.
validation['kategoria'] = validation['nazwa_obrazu'] \
    .str.extract(r'^[^_]+_([^_]+)_', expand=False) \
    .str.lower()

wyniki = []

# Loop over the categories
for kat in kategorie:
    # Validation subset for this category
    sub = validation[validation["kategoria"] == kat]
    y_true = sub["czy_ai"].values
    
    # For each model, search for the threshold T with the highest F1
    best = {}
    for name, col in modele.items():
        # Starting at -1 means the first threshold is always accepted, even with F1 == 0
        best_T, best_f1 = 0.5, -1
        scores = sub[col].values
        
        for T in thresholds:
            preds = (scores >= T).astype(int)
            f1 = f1_score(y_true, preds)
            # Strict '>' keeps the earliest threshold among ties
            if f1 > best_f1:
                best_f1, best_T = f1, T
        
        best[name] = {"Prog T": best_T, "F1_score": best_f1}
    
    # Local weights: each model's best F1 divided by the sum over models
    f1_vals = np.array([best[m]["F1_score"] for m in modele])
    w_cat   = f1_vals / f1_vals.sum()
    
    # Append this category's rows to the results.
    # NOTE(review): threshold, F1 and weight are rounded to 3 decimals before
    # being stored; k1/k2 read 'Prog T' and 'w_i' from this table, so they
    # work with the rounded values (rounded weights need not sum to exactly 1).
    for i, name in enumerate(modele):
        wyniki.append({
            "Kategoria":    kat,
            "Model bazowy": name,
            "Prog T":      round(best[name]["Prog T"], 3),
            "F1_score":    round(best[name]["F1_score"], 3),
            "w_i":         round(w_cat[i], 3)
        })

# Collect the results into a pandas DataFrame
df_wagi_per_kat = pd.DataFrame(wyniki)

# Index by (category, model) so rows can be looked up as .loc[(category, model), column]
df_wagi_per_kat = df_wagi_per_kat.set_index(["Kategoria", "Model bazowy"])

print(df_wagi_per_kat)

                             Prog T  F1_score    w_i
Kategoria Model bazowy                              
building  AI Image Detector    0.30     0.623  0.302
          The Detector AI      0.70     0.784  0.380
          Ai Detect Content    0.30     0.658  0.318
animal    AI Image Detector    0.30     0.614  0.300
          The Detector AI      0.65     0.717  0.350
          Ai Detect Content    0.70     0.718  0.351
food      AI Image Detector    0.30     0.620  0.313
          The Detector AI      0.30     0.707  0.357
          Ai Detect Content    0.30     0.652  0.329


### 4. OCENA POSZCZEGÓLNYCH MODELI NA CAŁYM ZBIORZE

In [5]:
def ewaluacja_modelu(y_true, y_scores, threshold=0.5):
    """Evaluate one model's scores against the true labels at a given threshold.

    Args:
        y_true: True labels (1 = AI, 0 = real).
        y_scores: Model scores (e.g. [0.95, 0.1, 0.7, ...]) in a container
            that supports element-wise ``>=`` and ``.astype`` (NumPy array or
            pandas Series).
        threshold: Decision threshold; a score >= threshold is predicted as AI.

    Returns:
        dict with the threshold ("Prog T"), accuracy ("Dokladnosc"),
        precision ("Precyzja"), recall ("Czulosc"), F1 ("F1_score") and the
        confusion-matrix counts "TN", "FP", "FN", "TP".
    """
    # Binarise the scores into 0/1 (real/AI) predictions.
    y_pred = (y_scores >= threshold).astype(int)

    # Compute the confusion matrix once. labels=[0, 1] keeps it 2x2 even when
    # only one class occurs, so the four counts are always available.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return {
        "Prog T": threshold,
        "Dokladnosc": accuracy_score(y_true, y_pred),
        "Precyzja": precision_score(y_true, y_pred, zero_division=0),
        "Czulosc": recall_score(y_true, y_pred, zero_division=0),
        "F1_score": f1_score(y_true, y_pred, zero_division=0),
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "TP": tp,
    }

# Candidate decision thresholds
thresholds = np.arange(0.3, 0.71, 0.05)

# Results are collected in a list (one dict per model/threshold pair)
all_results = []

# Labels of the whole data set (all splits together)
y_true = df["czy_ai"].values

for model_name, col in modele.items():
    scores = df[col].values
    for T in thresholds:
        res = ewaluacja_modelu(y_true, scores, threshold=T)
        res["model"] = model_name
        all_results.append(res)

# Convert to a pandas DataFrame
results_df = pd.DataFrame(all_results)

# For each model keep the row whose threshold gives the highest F1
best_per_model = results_df.loc[results_df.groupby("model")["F1_score"].idxmax()] \
    .sort_values("model") \
    .reset_index(drop=True)

print(best_per_model[[
    "model", "Prog T", "Dokladnosc", "Precyzja", "Czulosc", "F1_score", "TN","FP","FN","TP"
]])


               model  Prog T  Dokladnosc  Precyzja   Czulosc  F1_score   TN  \
0  AI Image Detector    0.30    0.493333  0.496006  0.828000  0.620380  119   
1  Ai Detect Content    0.35    0.538000  0.521806  0.909333  0.663102  125   
2    The Detector AI    0.70    0.722000  0.659636  0.917333  0.767429  395   

    FP   FN   TP  
0  631  129  621  
1  625   68  682  
2  355   62  688  


In [6]:
def ewaluacja_modelu(y_true, y_scores, threshold=0.5):
    """Evaluate one model's scores against the true labels at a given threshold.

    Args:
        y_true: True labels (1 = AI, 0 = real).
        y_scores: Model scores (e.g. [0.95, 0.1, 0.7, ...]) in a container
            that supports element-wise ``>=`` and ``.astype`` (NumPy array or
            pandas Series).
        threshold: Decision threshold; a score >= threshold is predicted as
            AI. Defaults to 0.5.

    Returns:
        dict with the threshold ("Prog T"), accuracy ("Dokladnosc"),
        precision ("Precyzja"), recall ("Czulosc"), F1 ("F1_score") and the
        confusion-matrix counts "TN", "FP", "FN", "TP".
    """
    # Use the threshold argument itself (not a literal 0.5), so the reported
    # "Prog T" always matches the threshold that produced the predictions.
    y_pred = (y_scores >= threshold).astype(int)

    # Compute the confusion matrix once. labels=[0, 1] keeps it 2x2 even when
    # only one class occurs, so the four counts are always available.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return {
        "Prog T": threshold,
        "Dokladnosc": accuracy_score(y_true, y_pred),
        "Precyzja": precision_score(y_true, y_pred, zero_division=0),
        "Czulosc": recall_score(y_true, y_pred, zero_division=0),
        "F1_score": f1_score(y_true, y_pred, zero_division=0),
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "TP": tp,
    }

# Results are collected in a list (one dict per model)
all_results = []

# Labels of the whole data set (all splits together)
y_true = df["czy_ai"].values

for model_name, col in modele.items():
    scores = df[col].values
    # No threshold is passed, so the function's default (0.5) applies
    res = ewaluacja_modelu(y_true, scores)
    res["model"] = model_name
    all_results.append(res)

# Convert to a pandas DataFrame
results_df = pd.DataFrame(all_results)

# There is one row per model here, so idxmax simply selects that row;
# the result is then sorted by model name
best_per_model = results_df.loc[results_df.groupby("model")["F1_score"].idxmax()] \
    .sort_values("model") \
    .reset_index(drop=True)

print(best_per_model[[
    "model", "Prog T", "Dokladnosc", "Precyzja", "Czulosc", "F1_score", "TN","FP","FN","TP"
]])


               model  Prog T  Dokladnosc  Precyzja   Czulosc  F1_score   TN  \
0  AI Image Detector     0.5    0.490000  0.489712  0.476000  0.482759  378   
1  Ai Detect Content     0.5    0.541333  0.524603  0.881333  0.657711  151   
2    The Detector AI     0.5    0.710667  0.648218  0.921333  0.761013  375   

    FP   FN   TP  
0  372  393  357  
1  599   89  661  
2  375   59  691  


In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the data (rebinds the global `df` to a freshly read frame)
df = pd.read_excel("wyniki_obrazow2.xlsx")

# Extract the category: second '_'-separated token of the file name (case is kept as-is)
df['category'] = df['nazwa_obrazu'].str.split('_').str[1]

def evaluate_model(y_true, y_scores, threshold=0.5):
    """Compute classification metrics for one model at a decision threshold.

    Args:
        y_true: True labels (1 = AI, 0 = real).
        y_scores: Model scores in a container supporting element-wise ``>=``
            and ``.astype`` (NumPy array or pandas Series).
        threshold: A score >= threshold is predicted as class 1.

    Returns:
        dict with "threshold", "accuracy", "precision", "recall", "f1_score"
        and the confusion-matrix counts "tn", "fp", "fn", "tp".
    """
    y_pred = (y_scores >= threshold).astype(int)
    # labels=[0, 1] guarantees a 2x2 matrix, so the four-way unpack cannot
    # fail when a subset contains (or is predicted as) a single class only.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        "threshold": threshold,
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0),
        "tn": tn, "fp": fp, "fn": fn, "tp": tp
    }

# Models (display name -> score column) and the threshold range
models = {
    "AI Image Detector": "ai_image_detector",
    "The Detector AI":   "the_detector_ai",
    "AI Detect Content": "ai_detect_content"
}
thresholds = np.arange(0.3, 0.71, 0.05)

all_results = []

# Loop over categories and models; every threshold is evaluated for each pair
for cat in ["building", "animal", "food"]:
    sub = df[df['category'] == cat]
    y_true = sub['czy_ai'].values
    for model_name, col in models.items():
        scores = sub[col].values
        for T in thresholds:
            res = evaluate_model(y_true, scores, threshold=T)
            res.update({"model": model_name, "category": cat})
            all_results.append(res)

# Convert to a pandas DataFrame
results_df = pd.DataFrame(all_results)

# For each (category, model) pair keep the threshold with the highest f1_score
idx = results_df.groupby(["category", "model"])["f1_score"].idxmax()
best_df = results_df.loc[idx] \
    .sort_values(["category", "model"]) \
    .reset_index(drop=True)

best_df

Unnamed: 0,threshold,accuracy,precision,recall,f1_score,tn,fp,fn,tp,model,category
0,0.6,0.65,0.608069,0.844,0.706868,114,136,39,211,AI Detect Content,animal
1,0.3,0.488,0.492788,0.82,0.615616,39,211,45,205,AI Image Detector,animal
2,0.7,0.738,0.700337,0.832,0.760512,161,89,42,208,The Detector AI,animal
3,0.35,0.524,0.513636,0.904,0.655072,36,214,24,226,AI Detect Content,building
4,0.3,0.494,0.496503,0.852,0.627393,34,216,37,213,AI Image Detector,building
5,0.7,0.764,0.683333,0.984,0.806557,136,114,4,246,The Detector AI,building
6,0.3,0.526,0.515222,0.88,0.649926,43,207,30,220,AI Detect Content,food
7,0.3,0.498,0.498771,0.812,0.61796,46,204,47,203,AI Image Detector,food
8,0.7,0.664,0.606218,0.936,0.735849,98,152,16,234,The Detector AI,food


In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the data (rebinds the global `df` to a freshly read frame)
df = pd.read_excel("wyniki_obrazow2.xlsx")

# Extract the category: second '_'-separated token of the file name (case is kept as-is)
df['category'] = df['nazwa_obrazu'].str.split('_').str[1]

def evaluate_model(y_true, y_scores, threshold=0.5):
    """Compute classification metrics for one model at a decision threshold.

    Args:
        y_true: True labels (1 = AI, 0 = real).
        y_scores: Model scores in a container supporting element-wise ``>=``
            and ``.astype`` (NumPy array or pandas Series).
        threshold: A score >= threshold is predicted as class 1.

    Returns:
        dict with "threshold", "accuracy", "precision", "recall", "f1_score"
        and the confusion-matrix counts "tn", "fp", "fn", "tp".
    """
    y_pred = (y_scores >= threshold).astype(int)
    # labels=[0, 1] guarantees a 2x2 matrix, so the four-way unpack cannot
    # fail when a subset contains (or is predicted as) a single class only.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        "threshold": threshold,
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0),
        "tn": tn, "fp": fp, "fn": fn, "tp": tp
    }

# Models (display name -> score column)
models = {
    "AI Image Detector": "ai_image_detector",
    "The Detector AI":   "the_detector_ai",
    "AI Detect Content": "ai_detect_content"
}

all_results = []

# Loop over categories and models
for cat in ["building", "animal", "food"]:
    sub = df[df['category'] == cat]
    y_true = sub['czy_ai'].values
    for model_name, col in models.items():
        scores = sub[col].values
        # No threshold is passed, so the function's default (0.5) applies
        res = evaluate_model(y_true, scores)
        res.update({"model": model_name, "category": cat})
        all_results.append(res)

# Convert to a pandas DataFrame
results_df = pd.DataFrame(all_results)

# There is one row per (category, model) pair here, so idxmax simply selects
# that row; the result is then sorted by category and model
idx = results_df.groupby(["category", "model"])["f1_score"].idxmax()
best_df = results_df.loc[idx] \
    .sort_values(["category", "model"]) \
    .reset_index(drop=True)
best_df

Unnamed: 0,threshold,accuracy,precision,recall,f1_score,tn,fp,fn,tp,model,category
0,0.5,0.578,0.544828,0.948,0.691971,52,198,13,237,AI Detect Content,animal
1,0.5,0.502,0.501931,0.52,0.510806,121,129,120,130,AI Image Detector,animal
2,0.5,0.712,0.66879,0.84,0.744681,146,104,40,210,The Detector AI,animal
3,0.5,0.52,0.511682,0.876,0.646018,41,209,31,219,AI Detect Content,building
4,0.5,0.48,0.479339,0.464,0.471545,124,126,134,116,AI Image Detector,building
5,0.5,0.758,0.677686,0.984,0.80261,133,117,4,246,The Detector AI,building
6,0.5,0.526,0.516373,0.82,0.633694,58,192,45,205,AI Detect Content,food
7,0.5,0.488,0.486842,0.444,0.464435,133,117,139,111,AI Image Detector,food
8,0.5,0.662,0.604113,0.94,0.735524,96,154,15,235,The Detector AI,food


### 5. KOMITET MODELI

In [9]:
# Mapping: score column in the data -> display name of the base model.
# NOTE: this rebinds `modele` with keys and values swapped relative to the
# earlier cells (display name -> column). k1/k2 iterate it as (column, name),
# so the earlier cells must not be re-run after this one without redefining it.
modele = {
    'ai_image_detector': 'AI Image Detector',
    'the_detector_ai':   'The Detector AI',
    'ai_detect_content': 'Ai Detect Content'
}

# Add the image-category column to TRAIN/VALIDATION/TEST.
# The loop variable is deliberately not named `df`: that would rebind the
# global `df` (the full data set) to the last split. The category is
# lower-cased to match the keys of the per-category weights table.
for czesc in (train, validation, test):
    czesc['kategoria'] = czesc['nazwa_obrazu'].str.split('_').str[1].str.lower()

In [10]:
def k1(df, df_wagi):
    """
    Committee K1 - majority vote of the base models.

    For every row of ``df``, each base model listed in the global ``modele``
    mapping (score column -> model name) casts a vote: 1 when the row's score
    in that column is >= the threshold stored in ``df_wagi`` at
    ``(row['kategoria'], model name), 'Prog T'``, otherwise 0.
    The row is labelled 1 (AI) when at least 2 votes are 1, else 0.

    Returns a pd.Series of 0/1 decisions that shares ``df``'s index.
    """
    def _decyzja(wiersz):
        # Count the models whose score reaches their per-category threshold.
        kategoria = wiersz['kategoria']
        glosy_za = 0
        for kolumna, nazwa_modelu in modele.items():
            prog_lokalny = df_wagi.loc[(kategoria, nazwa_modelu), 'Prog T']
            glosy_za += int(wiersz[kolumna] >= prog_lokalny)
        return int(glosy_za >= 2)

    decyzje = [_decyzja(wiersz) for _, wiersz in df.iterrows()]
    return pd.Series(decyzje, index=df.index)



# K1 (majority vote) predictions for the test set, stored as a new column
test['pred_k1'] = k1(test, df_wagi_per_kat)

In [11]:
def k2(df, df_wagi, prog=0.5):
    """
    Committee K2 - weighted sum of the base models' scores.

    For every row of ``df``, the score in each column listed in the global
    ``modele`` mapping (score column -> model name) is multiplied by the
    weight stored in ``df_wagi`` at ``(row['kategoria'], model name), 'w_i'``
    and the products are added up. The row is labelled 1 when that sum is
    >= ``prog``, otherwise 0.

    Returns a pd.Series of 0/1 decisions that shares ``df``'s index.
    """
    def _decyzja(wiersz):
        kategoria = wiersz['kategoria']
        # Accumulate in the mapping's order, starting from 0.
        suma_wazona = 0
        for kolumna, nazwa_modelu in modele.items():
            waga = df_wagi.loc[(kategoria, nazwa_modelu), 'w_i']
            suma_wazona = suma_wazona + waga * wiersz[kolumna]
        return int(suma_wazona >= prog)

    decyzje = [_decyzja(wiersz) for _, wiersz in df.iterrows()]
    return pd.Series(decyzje, index=df.index)

# Apply to the test set:
test['pred_k2'] = k2(test, df_wagi_per_kat, prog=0.5)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Inputs of the meta-classifier: the three base models' score columns
feature_columns = ['ai_image_detector', 'the_detector_ai', 'ai_detect_content']
# Candidate values of the regularisation parameter C
param_grid       = {'C': [0.01, 0.1, 1, 10, 100]}

# 1. Tune C with a 5-fold cross-validated grid search (F1 scoring) on the validation set
grid = GridSearchCV(
    LogisticRegression(penalty='l2', solver='liblinear', max_iter=2000),
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1
)
grid.fit(validation[feature_columns], validation['czy_ai'])
best_C = grid.best_params_['C']

# 2. Final fit on the training set, using the C selected above
meta_clf = LogisticRegression(penalty='l2', C=best_C, solver='liblinear', max_iter=2000)
meta_clf.fit(train[feature_columns], train['czy_ai'])

# 3. Predictions on the test set
test['pred_k3'] = meta_clf.predict(test[feature_columns])

In [20]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, confusion_matrix
)
from statsmodels.stats.contingency_tables import mcnemar

# 1) Build a dict: method name -> 0/1 predictions on the test set.
#    The base models are thresholded with the per-category 'Prog T' from
#    df_wagi_per_kat; the committees reuse the prediction columns computed earlier.
methods = {
    'AI Image Detector':   test.apply(lambda r: int(r['ai_image_detector']  >= df_wagi_per_kat.loc[(r['kategoria'],'AI Image Detector'),   'Prog T']), axis=1),
    'The Detector AI':     test.apply(lambda r: int(r['the_detector_ai']    >= df_wagi_per_kat.loc[(r['kategoria'],'The Detector AI'),     'Prog T']), axis=1),
    'Ai Detect Content':   test.apply(lambda r: int(r['ai_detect_content'] >= df_wagi_per_kat.loc[(r['kategoria'],'Ai Detect Content'),   'Prog T']), axis=1),
    'K1 Majority Vote':    test['pred_k1'],
    'K2 Weighted Avg':     test['pred_k2'],
    'K3 Stacking':         test['pred_k3']
}

# True labels of the test set
y_true = test['czy_ai']

# 2) Compute the metrics for every method and assemble them into a table
rows = []
for name, preds in methods.items():
    tn, fp, fn, tp = confusion_matrix(y_true, preds).ravel()
    rows.append({
        'Metoda': name,
        'Dokładność': accuracy_score(y_true, preds),
        'Precyzja': precision_score(y_true, preds, zero_division=0),
        'Czułość': recall_score(y_true, preds, zero_division=0),
        'F1-score': f1_score(y_true, preds, zero_division=0),
        'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp
    })
# One row per method, best F1 first
df_metrics = pd.DataFrame(rows).set_index('Metoda').sort_values('F1-score', ascending=False)
df_metrics

Unnamed: 0_level_0,Dokładność,Precyzja,Czułość,F1-score,TN,FP,FN,TP
Metoda,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
K2 Weighted Avg,0.74,0.668224,0.953333,0.785714,79,71,7,143
The Detector AI,0.74,0.673077,0.933333,0.782123,82,68,10,140
K3 Stacking,0.756667,0.710383,0.866667,0.780781,97,53,20,130
K1 Majority Vote,0.61,0.565217,0.953333,0.709677,40,110,7,143
Ai Detect Content,0.583333,0.557604,0.806667,0.659401,54,96,29,121
AI Image Detector,0.49,0.49434,0.873333,0.631325,16,134,19,131


In [13]:
import pickle
# Save the fitted meta-classifier to disk
with open("meta_clf.pkl","wb") as f:
    pickle.dump(meta_clf, f)

# Save the per-category thresholds/weights table to disk
df_wagi_per_kat.to_pickle("df_wagi_per_kat.pkl")