In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pickle

# Location of the raw training-label CSV.
csv_path = r"C:\Users\Admin\Documents\rsna-pneumonia-detection-challenge\stage_2_train_labels.csv"

# Read the label table into memory.
data = pd.read_csv(csv_path)

# Positive rows in the raw table, i.e. before collapsing to one row per patientId.
num_target_1_before = data["Target"].eq(1).sum()
print(f"Number of rows where Target is 1 before grouping: {num_target_1_before}")

# Collapse to one row per patientId, keeping the maximum Target seen for that id,
# and write the collapsed table to the current working directory.
data_grouped = (
    data.groupby("patientId")["Target"]
    .max()
    .reset_index()
)
data_grouped.to_csv("grouped_data.csv", index=False)

# Positive rows once every patientId appears exactly once.
num_target_1_after = data_grouped["Target"].eq(1).sum()
print(f"Number of rows where Target is 1 after grouping: {num_target_1_after}")

# Seeded, shuffled 5-fold stratified split over the grouped rows.
# Each element of `splits` is a (train_idx, val_idx) pair as yielded by skf.split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
splits = list(skf.split(data_grouped.index, data_grouped["Target"]))

# Persist the fold index pairs so later runs can reuse the exact same folds.
path = r"G:\Meine Ablage\Universität\Master Thesis\Pneumonia\training\splits\splits_balanced_fix.pkl"
with open(path, "wb") as f:
    pickle.dump(splits, f)

print(f"KFold splits saved to {path}")

# Row count per Target value in the grouped table.
class_counts = data_grouped["Target"].value_counts()

# Report both classes; .get(..., 0) covers a class that is absent from the counts.
print(
    "Class distribution:",
    f"  No Lung Opacity (Target = 0): {class_counts.get(0, 0)}",
    f"  Lung Opacity (Target = 1): {class_counts.get(1, 0)}",
    sep="\n",
)



Number of rows where Target is 1 before grouping: 9555
Number of rows where Target is 1 after grouping: 6012
KFold splits saved to G:\Meine Ablage\Universität\Master Thesis\Pneumonia\training\splits\splits_balanced_fix.pkl
Class distribution:
  No Lung Opacity (Target = 0): 20672
  Lung Opacity (Target = 1): 6012


In [3]:
import pickle

# Path to your old split file
path = r"G:\Meine Ablage\Universität\Master Thesis\Pneumonia\training\splits\splits_balanced_fix.pkl"

# Load the pickle file.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use it
# on split files you produced yourself.
with open(path, "rb") as f:
    old_splits = pickle.load(f)

# Number of leading indices to preview per array. The printed labels are derived
# from this value so the label and the slice can no longer disagree.
PREVIEW_COUNT = 20

# Print details of the splits: a short preview of each index array plus its size.
for fold, (train_idx, val_idx) in enumerate(old_splits):
    print(f"Fold {fold + 1}:")
    print(f"  Train indices (first {PREVIEW_COUNT}): {train_idx[:PREVIEW_COUNT]}")
    print(f"  Validation indices (first {PREVIEW_COUNT}): {val_idx[:PREVIEW_COUNT]}")
    print(f"  Total train: {len(train_idx)}, Total val: {len(val_idx)}\n")

Fold 1:
  Train indices (first 10): [ 0  1  2  3  4  5  8  9 10 11 12 13 15 16 17 18 19 20 21 23]
  Validation indices (first 10): [ 6  7 14 22 24 26 30 35 38 42 43 46 54 57 58 62 75 87 95 98]
  Total train: 21347, Total val: 5337

Fold 2:
  Train indices (first 10): [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
  Validation indices (first 10): [ 20  25  29  47  48  51  56  61  68  70  71  80 107 108 115 124 127 130
 134 142]
  Total train: 21347, Total val: 5337

Fold 3:
  Train indices (first 10): [ 0  2  4  5  6  7  9 10 12 13 14 15 19 20 21 22 23 24 25 26]
  Validation indices (first 10): [ 1  3  8 11 16 17 18 28 31 40 44 49 52 60 64 65 66 69 81 84]
  Total train: 21347, Total val: 5337

Fold 4:
  Train indices (first 10): [ 0  1  3  4  6  7  8  9 11 13 14 16 17 18 20 21 22 24 25 26]
  Validation indices (first 10): [ 2  5 10 12 15 19 23 33 36 50 55 59 63 72 74 77 79 83 89 90]
  Total train: 21347, Total val: 5337

Fold 5:
  Train indices (first 10): [ 1  2  3  5  6

In [8]:
import pickle
import pandas as pd

csv_path = r"C:\Users\Admin\Documents\rsna-pneumonia-detection-challenge\stage_2_train_labels.csv"

# Read the raw label table.
data = pd.read_csv(csv_path)

# Collapse to one row per patientId, keeping the maximum Target seen for that id.
data_grouped = (
    data.groupby("patientId")["Target"]
    .max()
    .reset_index()
)

# Load the stored splits; each element is unpacked below as (train_idx, val_idx).
path = r"C:\Users\Admin\Downloads\splits_balanced_fix.pkl"
with open(path, "rb") as f:
    splits = pickle.load(f)

# Only the label column is needed for counting, so select it once up front.
targets = data_grouped["Target"]

# For every split, count the rows of each class on the training and validation side.
# The stored indices are applied positionally (iloc) to the grouped table.
for i, (train_idx, val_idx) in enumerate(splits):
    train_counts = targets.iloc[train_idx].value_counts()
    val_counts = targets.iloc[val_idx].value_counts()

    print(f"\nSplit {i+1}:")
    print(f"  Training set - No Lung Opacity (Target=0): {train_counts.get(0, 0)}, Lung Opacity (Target=1): {train_counts.get(1, 0)}")
    print(f"  Validation set - No Lung Opacity (Target=0): {val_counts.get(0, 0)}, Lung Opacity (Target=1): {val_counts.get(1, 0)}")



Split 1:
  Training set - No Lung Opacity (Target=0): 16537, Lung Opacity (Target=1): 4810
  Validation set - No Lung Opacity (Target=0): 4135, Lung Opacity (Target=1): 1202

Split 2:
  Training set - No Lung Opacity (Target=0): 16537, Lung Opacity (Target=1): 4810
  Validation set - No Lung Opacity (Target=0): 4135, Lung Opacity (Target=1): 1202

Split 3:
  Training set - No Lung Opacity (Target=0): 16538, Lung Opacity (Target=1): 4809
  Validation set - No Lung Opacity (Target=0): 4134, Lung Opacity (Target=1): 1203

Split 4:
  Training set - No Lung Opacity (Target=0): 16538, Lung Opacity (Target=1): 4809
  Validation set - No Lung Opacity (Target=0): 4134, Lung Opacity (Target=1): 1203

Split 5:
  Training set - No Lung Opacity (Target=0): 16538, Lung Opacity (Target=1): 4810
  Validation set - No Lung Opacity (Target=0): 4134, Lung Opacity (Target=1): 1202


In [2]:
import pandas as pd

# Read the previously exported one-row-per-patientId label table.
csv_path = r"G:\Meine Ablage\Universität\Master Thesis\Pneumonia\training\grouped_data.csv"
data_grouped = pd.read_csv(csv_path)

# Tally how many rows carry each Target value.
target_counts = data_grouped["Target"].value_counts()

# Emit the header and both class counts; .get(..., 0) covers an absent class.
print(
    "Label distribution in grouped_data.csv:",
    f"No Lung Opacity (Target=0): {target_counts.get(0, 0)}",
    f"Lung Opacity (Target=1): {target_counts.get(1, 0)}",
    sep="\n",
)

Label distribution in grouped_data.csv:
No Lung Opacity (Target=0): 20672
Lung Opacity (Target=1): 6012
