In [17]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

df_missing_gender_unk = df[
    (df['gender'].isna() | (df['gender'] == "")) & 
    df.apply(lambda row: row.astype(str).str.contains("UNK", case=False).any(), axis=1)
]

print(df_missing_gender_unk)

# Step 1: Identify rows that contain 'UNK' in any column
df_with_unk = df[df.apply(lambda row: row.astype(str).str.contains("UNK", case=False).any(), axis=1)]

# Step 2: From those rows, filter only the ones where 'gender' is NOT empty
df_with_unk_valid_gender = df_with_unk[df_with_unk['gender'].notna() & (df_with_unk['gender'] != "")]

# Step 3: Count and display results
valid_count = len(df_with_unk_valid_gender)

print(f"Number of rows that contain 'UNK' but have non-empty gender: {valid_count}")

# Step 1: Remove rows where 'gender' is missing or empty
df_filtered = df[df['gender'].notna() & (df['gender'] != "")]

# Step 2: Remove rows where any column contains 'UNK'
df_filtered = df_filtered[~df_filtered.apply(lambda row: row.astype(str).str.contains("UNK", case=False).any(), axis=1)]

# Step 3: Get the count of valid rows
valid_row_count = len(df_filtered)

print(f"Number of rows with non-empty gender and no 'UNK': {valid_row_count}")

# Step 4: Count rows where 'gender' is empty, but do NOT contain 'UNK' anywhere else
df_missing_gender = df[df['gender'].isna() | (df['gender'] == "")]
df_missing_gender_clean = df_missing_gender[~df_missing_gender.apply(lambda row: row.astype(str).str.contains("UNK", case=False).any(), axis=1)]

print(f"Number of rows with empty gender and no 'UNK': {len(df_missing_gender_clean)}")

# Step 1: Remove rows where 'gender' is missing or empty
df_filtered = df[df['pesticide'].notna() & (df['pesticide'] != "")]

# Step 2: Remove rows where any column contains 'UNK'
df_filtered = df_filtered[~df_filtered.apply(lambda row: row.astype(str).str.contains("UNK", case=False).any(), axis=1)]

# Step 3: Get the count of valid rows
valid_row_count = len(df_filtered)

print(f"Number of rows with non-empty gender and no 'UNK': {valid_row_count}")

### **Step 1: Inspect Basic Information** ###
print(df.info())  # Overview of dataset structure
print("\nMissing Values Count:\n", df.isnull().sum())  # Count missing data for each feature

### **Step 2: Summary Statistics for Numerical Features** ###
print("\nSummary Statistics:\n", df.describe())  # View numerical feature distributions

### **Step 3: Distribution of Categorical Features** ###
binary_cols = ['smoke', 'drink', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
print("\nCategorical Feature Counts:")
for col in binary_cols:
    print(f"{col}:\n{df[col].value_counts(dropna=False)}\n")

### **Step 4: Check Target Variable (`biopsed`) Distribution** ###
print("\nTarget Variable Distribution:\n", df['biopsed'].value_counts(normalize=True))

### **Step 5: Convert `biopsed` to Numeric for Correlation Analysis** ###
df['biopsed'] = df['biopsed'].astype(int)  # Convert boolean to integer (1 = True, 0 = False)

### **Step 6: Analyze Correlations (Only Numeric Features)** ###
numeric_cols = df.select_dtypes(include=[np.number]).columns
print("\nFeature Correlations:\n", df[numeric_cols].corr()['biopsed'].sort_values(ascending=False))

### **Step 7: Missing Data Analysis** ###
print("\nMissing Value Counts:")
print(df.isnull().sum())

# Check how many rows have missing values in key columns
missing_cols = ['smoke', 'drink', 'pesticide', 'gender', 'skin_cancer_history', 'cancer_history',
                'has_piped_water', 'has_sewage_system', 'fitspatrick', 'diameter_1', 'diameter_2']
missing_rows = df[missing_cols].isnull().any(axis=1).sum()
print(f"\nTotal rows with missing values in key columns: {missing_rows}")

# Inspect smoking data separately to confirm how missing values behave
print("\nSmoking Data Distribution:\n", df['smoke'].value_counts(dropna=False))


Empty DataFrame
Columns: [patient_id, lesion_id, smoke, drink, background_father, background_mother, age, pesticide, gender, skin_cancer_history, cancer_history, has_piped_water, has_sewage_system, fitspatrick, region, diameter_1, diameter_2, diagnostic, itch, grew, hurt, changed, bleed, elevation, img_id, biopsed]
Index: []

[0 rows x 26 columns]
Number of rows that contain 'UNK' but have non-empty gender: 484
Number of rows with non-empty gender and no 'UNK': 1010
Number of rows with empty gender and no 'UNK': 804
Number of rows with non-empty gender and no 'UNK': 1010
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2298 entries, 0 to 2297
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   patient_id           2298 non-null   object 
 1   lesion_id            2298 non-null   int64  
 2   smoke                1494 non-null   object 
 3   drink                1494 non-null   object 
 4   background

In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load dataset
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

print(f"Total rows with missing 'smoke': {df['smoke'].isnull().sum()}")
print(f"Total rows with missing 'smoke' and 'biopsed' is FALSE: {df[df['smoke'].isnull() & (df['biopsed'] == 0)].shape[0]}")

### **DATA PREPROCESSING** ###

# List of categorical features
binary_cols = ['smoke', 'drink', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
numeric_cols = ['diameter_1', 'diameter_2', 'age']

# Convert 'UNK' to NaN, then replace True/False with 1/0
df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float)

# Fill missing values in binary features with 0.5 (for initial test)
df[binary_cols] = df[binary_cols].fillna(0.5)

# Fill missing values for numerical features using MEDIAN (to prevent outliers from impacting the data)
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))

# Convert `biopsed` to numeric (1 = True, 0 = False)
df['biopsed'] = df['biopsed'].astype(int)

# Select features
features = binary_cols + numeric_cols
X = df[features].copy()
y = df['biopsed']

# Standardize ONLY numerical features (`diameter_1`, `diameter_2`, `age`)
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

### **TRAINING LOGISTIC REGRESSION MODEL** ###
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate Accuracy, Precision, and Recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred, zero_division=1)

print(f"\nModel Accuracy (0.5 for missing binary features): {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

### **FEATURE IMPORTANCE** ###
feature_importance = np.exp(model.coef_[0])  # Convert to odds ratios
print("\nFeature Importance (Odds Ratios):")
for feature, importance in zip(X.columns, feature_importance):
    print(f"{feature}: {importance:.2f}")

### **CROSS-VALIDATION** ###
kf = KFold(n_splits=5, shuffle=True, random_state=None)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

print(f"\nCross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")

### **MISSING DATA ANALYSIS** ###
print("\nFinal Missing Value Counts (After Handling):")
print(df.isnull().sum())

# Check total rows with missing values in key columns
missing_cols = ['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 'has_piped_water', 'has_sewage_system', 
                'fitspatrick', 'diameter_1', 'diameter_2']
missing_rows = df[missing_cols].isnull().any(axis=1).sum()

print(f"\nTotal rows with missing values in key columns: {missing_rows}")

# Inspect smoking data distribution after imputation
print("\nSmoking Data Distribution (After Imputation):\n", df['smoke'].value_counts())


Total rows with missing 'smoke': 804
Total rows with missing 'smoke' and 'biopsed' is FALSE: 804

Model Accuracy (0.5 for missing binary features): 0.87
Precision: 0.92
Recall: 0.85

Feature Importance (Odds Ratios):
smoke: 0.09
drink: 0.37
itch: 1.24
grew: 1.74
hurt: 6.18
changed: 20.05
bleed: 4.71
elevation: 2.78
diameter_1: 1.19
diameter_2: 1.08
age: 1.41

Cross-Validation Accuracy Scores: [0.86086957 0.85434783 0.83478261 0.87799564 0.83224401]
Mean Accuracy: 0.85

Final Missing Value Counts (After Handling):
patient_id               0
lesion_id                0
smoke                    0
drink                    0
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1               0
diameter_2               0
diagnostic            

  df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float)


### Splitting the dataset to try to fix the bias of the 804 rows that miss values, and are all biopsed FALSE, which probably would cause issues for our training

In [94]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load dataset (adjust file path as needed)
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

# ================================
# STEP 1: Preprocess Important Features
# ================================

# Define important binary features (we use these for the model)
binary_cols = ['smoke', 'drink', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan})  # Convert to numeric
df[binary_cols] = df[binary_cols].astype(float).fillna(0.5)  # Impute missing values with 0.5

# Process the diameter features
diameter_cols = ['diameter_1', 'diameter_2']
df[diameter_cols] = df[diameter_cols].apply(lambda col: col.fillna(col.median()))  # Median imputation
df['diameter'] = df[diameter_cols].mean(axis=1)  # Create averaged diameter feature

# Convert target variable 'biopsed' to numeric (True → 1, False → 0)
df['biopsed'] = df['biopsed'].astype(int)

# ================================
# STEP 2: Use Complete Cases for Modeling
# ================================

# Define key columns that must be present
cols_to_clean = ['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                 'has_piped_water', 'has_sewage_system', 'fitspatrick', 
                 'diameter_1', 'diameter_2']

# Drop rows missing values in key columns
df_clean = df.dropna(subset=cols_to_clean)

# Feature matrix & target variable
features = binary_cols + ['diameter']
X = df_clean[features].copy()
y = df_clean['biopsed']

# ================================
# STEP 3: Standardize Numerical Features
# ================================

scaler = StandardScaler()
X[['diameter']] = scaler.fit_transform(X[['diameter']])

# ================================
# STEP 4: Training and Evaluation
# ================================

model = LogisticRegression()
model.fit(X, y)

# Model Predictions
y_pred = model.predict(X)

# Accuracy Score
accuracy = accuracy_score(y, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Precision Score
precision = precision_score(y, y_pred, zero_division=1)
print(f"Precision: {precision:.2f}")

# Recall Score
recall = recall_score(y, y_pred, zero_division=1)
print(f"Recall: {recall:.2f}")

# Perform Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=None)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print(f"Cross-Validation Mean Accuracy: {cv_scores.mean():.2f}")

# ================================
# STEP 5: Feature Importance (Odds Ratios)
# ================================

odds_ratios = np.exp(model.coef_[0])
print("\nFeature Importance (Odds Ratios):")
for feat, ratio in zip(features, odds_ratios):
    print(f"{feat}: {ratio:.2f}")

# ================================
# STEP 6: Train–Test Split & Evaluation on Hold-Out Set
# ================================

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
model.fit(X_train, y_train)

# Predictions on test set
y_pred_test = model.predict(X_test)

# Accuracy Score
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Precision Score (on test set)
precision_test = precision_score(y_test, y_pred_test, zero_division=1)
print(f"Test Precision: {precision_test:.2f}")

# Recall Score (on test set)
recall_test = recall_score(y_test, y_pred_test, zero_division=1)
print(f"Test Recall: {recall_test:.2f}")

# ================================
# STEP 7: Additional Missing Value Check
# ================================

print("\nMissing Value Counts in Original DataFrame:")
print(df.isnull().sum())

Model Accuracy: 0.90
Precision: 0.90
Recall: 1.00
Cross-Validation Mean Accuracy: 0.89

Feature Importance (Odds Ratios):
smoke: 2.11
drink: 0.80
itch: 1.10
grew: 1.73
hurt: 4.98
changed: 18.92
bleed: 2.45
elevation: 2.72
diameter: 1.24
Test Accuracy: 0.88
Test Precision: 0.88
Test Recall: 1.00

Missing Value Counts in Original DataFrame:
patient_id               0
lesion_id                0
smoke                    0
drink                    0
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1               0
diameter_2               0
diagnostic               0
itch                     0
grew                     0
hurt                     0
changed                  0
bleed                    0
elevation                0
img_id     

  df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan})  # Convert to numeric


# **Refining the Model by Selecting Key Features**

### **Selected Features**
Based on feature importance analysis, we focus on high-impact predictors:

| Feature   | Odds Ratio (First Model) | Odds Ratio (Full Model) |
|-----------|-------------------------|-------------------------|
| **Changed**   | 23.85 | 17.89 |
| **Hurt**      | 7.86  | 5.08  |
| **Bleed**     | 4.06  | 2.44  |
| **Elevation** | 2.94  | 2.89  |
| **Diameter**  | 1.24  | 1.26  |
| **Age**       | 1.34  | 0.82  |

### **Next Steps**
1️⃣ **Re-train the model using only the above features**  
2️⃣ **Compare accuracy with previous versions**  
3️⃣ **Assess if simplifying the model improves generalization**


### We split the data into two models before, because for the same 804 entries , we had a lot of missing values. But now that we keep the important features , only diameter has missing values so we ll fix that by using the median for the missing entries

In [65]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load dataset (adjust file path as needed)
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

# ================================
# STEP 1: Preprocess Important Features
# ================================

# Define important binary features (we use these for the model)
important_binary = ['changed', 'hurt', 'bleed', 'elevation']
# Replace string values: 'True' becomes 1, 'False' becomes 0, and 'UNK' becomes NaN.
df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})
# Ensure they are floats, then fill missing values with 0.5
df[important_binary] = df[important_binary].astype(float).fillna(0.5)

# Process the diameter features.
diameter_cols = ['diameter_1', 'diameter_2']
# Impute missing values with the median
df[diameter_cols] = df[diameter_cols].apply(lambda col: col.fillna(col.median()))
# Create a new combined 'diameter' feature as the average
df['diameter'] = df[diameter_cols].mean(axis=1)

# Convert the target variable 'biopsed' to numeric (True → 1, False → 0)
df['biopsed'] = df['biopsed'].astype(int)

# ================================
# STEP 2: Define Feature Set and Prepare Data
# ================================

# Using the four binary features + the combined diameter feature
features = important_binary + ['diameter']

# Feature matrix and target variable
X = df[features].copy()
y = df['biopsed']

# Standardize the continuous feature "diameter"
scaler = StandardScaler()
X[['diameter']] = scaler.fit_transform(X[['diameter']])

# ================================
# STEP 3: Train/Test Split (80/20)
# ================================

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ================================
# STEP 4: Training and Evaluation
# ================================

model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Accuracy Score
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Precision Score
precision = precision_score(y_test, y_pred, zero_division=1)
print("Precision:", precision)

# Recall Score
recall = recall_score(y_test, y_pred, zero_division=1)
print("Recall:", recall)

# Perform cross-validation on the entire dataset:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print("Cross-Validation Mean Accuracy:", cv_scores.mean())

# ================================
# STEP 5: Feature Importance (Odds Ratios)
# ================================

odds_ratios = np.exp(model.coef_[0])
print("\nFeature Importance (Odds Ratios):")
for feat, ratio in zip(features, odds_ratios):
    print(f"{feat}: {ratio:.2f}")


Test Accuracy: 0.8173913043478261
Precision: 0.9090909090909091
Recall: 0.7801418439716312
Cross-Validation Mean Accuracy: 0.8046045278014589

Feature Importance (Odds Ratios):
changed: 27.39
hurt: 6.71
bleed: 5.52
elevation: 3.28
diameter: 1.31


  df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})


### And a final training using only the 4 features that seem to be the most important

In [60]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

# ---------------------------
# Load dataset (update the path as needed)
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")
# ---------------------------

# STEP 1: Preprocessing for the Important Binary Features
important_binary = ['changed', 'hurt', 'bleed', 'elevation']

# Replace string entries (True → 1, False → 0, 'UNK' → NaN)
df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})
df[important_binary] = df[important_binary].astype(float).fillna(0.5)  # Impute missing values with 0.5

# Convert target variable 'biopsed' to numeric (True → 1, False → 0)
df['biopsed'] = df['biopsed'].astype(int)

# STEP 2: Define Features & Prepare Dataset
features = important_binary
X = df[features].copy()
y = df['biopsed']

# STEP 3: Train–Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# STEP 4: Train the Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# STEP 5: Evaluate the Model on the Hold-Out Test Set
y_pred = model.predict(X_test)

# Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Precision & Recall
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred, zero_division=1)
print("Precision:", precision)
print("Recall:", recall)

# Also run cross-validation on the entire dataset:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print("Cross-Validation Mean Accuracy:", cv_scores.mean())

# STEP 6: Feature Importance (Odds Ratios)
odds_ratios = np.exp(model.coef_[0])
print("\nFeature Importance (Odds Ratios):")
for feat, ratio in zip(features, odds_ratios):
    print(f"{feat}: {ratio:.2f}")

# ---------------------------
# Additional analysis (optional): Check missingness in the original DataFrame
print("\nMissing Value Counts in Original Data:")
print(df.isnull().sum())


Test Accuracy: 0.8043478260869565
Precision: 0.9137931034482759
Recall: 0.75177304964539
Cross-Validation Mean Accuracy: 0.7932982854977741

Feature Importance (Odds Ratios):
changed: 27.76
hurt: 6.97
bleed: 5.76
elevation: 3.18

Missing Value Counts in Original Data:
patient_id               0
lesion_id                0
smoke                  804
drink                  804
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1             804
diameter_2             804
diagnostic               0
itch                     0
grew                     0
hurt                     0
changed                  0
bleed                    0
elevation                0
img_id                   0
biopsed                  0
dtype: int64


  df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})


### GROUPKFOLD

In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load dataset
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

### **DATA PREPROCESSING** ###
binary_cols = ['changed', 'hurt', 'bleed', 'elevation']
numeric_cols = ['diameter_1', 'diameter_2', 'age']

# Convert categorical columns to numeric (True/False → 1/0, 'UNK' → NaN)
df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float)

# Handle missing values:
df[binary_cols] = df[binary_cols].fillna(0.5)  # Neutral value for missing binary features
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))  # Use median for numeric features

# Convert target variable 'biopsed' to numeric (1 = True, 0 = False)
df['biopsed'] = df['biopsed'].astype(int)

# Select features for the model
X = df[binary_cols + numeric_cols].copy()
y = df['biopsed']
groups = df['patient_id']  # Patient IDs for grouping

# Standardize numerical features (KNN is distance-based and sensitive to scale)
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

### **Ensure Patients Stay in the Same Fold**
# Initialize GroupKFold with 5 splits
gkf = GroupKFold(n_splits=5)
accuracy_scores = []
precision_scores = []
recall_scores = []

# Perform cross-validation with grouped patients
for i, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups)):  # FIX: Correct grouping
    # Get train/test patients
    train_patients = groups.iloc[train_idx].unique()
    test_patients = groups.iloc[test_idx].unique()

    print(f"\nFold {i + 1}")
    print(f"Train Patients ({len(train_patients)}): {train_patients}")
    print(f"Test Patients ({len(test_patients)}): {test_patients}")

    # Check if any patient appears in both sets
    repeated_patients = set(train_patients) & set(test_patients)
    if repeated_patients:
        print(f"WARNING: These patients appear in both training and testing: {repeated_patients}")

    # Filter dataset so all rows belonging to the selected patients stay in the same fold
    train_mask = df['patient_id'].isin(train_patients)
    test_mask = df['patient_id'].isin(test_patients)
    
    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Train KNN classifier
    knn = KNeighborsClassifier(n_neighbors=5)  # Adjust K as needed
    knn.fit(X_train, y_train)

    # Predict on test set
    y_pred = knn.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=1)
    recall = recall_score(y_test, y_pred, zero_division=1)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)

    print(f"Fold Accuracy: {accuracy:.4f}, Fold Precision: {precision:.4f}, Fold Recall: {recall:.4f}")

# Print final average scores across folds
print(f"\nAverage Accuracy across folds: {sum(accuracy_scores) / len(accuracy_scores):.4f}")
print(f"Average Precision across folds: {sum(precision_scores) / len(precision_scores):.4f}")
print(f"Average Recall across folds: {sum(recall_scores) / len(recall_scores):.4f}")


  df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float)



Fold 1
Train Patients (1098): ['PAT_1516' 'PAT_1545' 'PAT_1989' ... 'PAT_273' 'PAT_491' 'PAT_1714']
Test Patients (275): ['PAT_46' 'PAT_778' 'PAT_1995' 'PAT_705' 'PAT_2140' 'PAT_1653' 'PAT_134'
 'PAT_1453' 'PAT_1803' 'PAT_682' 'PAT_680' 'PAT_53' 'PAT_26' 'PAT_544'
 'PAT_2025' 'PAT_1094' 'PAT_369' 'PAT_981' 'PAT_409' 'PAT_1573' 'PAT_1109'
 'PAT_1400' 'PAT_549' 'PAT_847' 'PAT_999' 'PAT_2154' 'PAT_207' 'PAT_514'
 'PAT_1286' 'PAT_330' 'PAT_1062' 'PAT_2076' 'PAT_280' 'PAT_1411' 'PAT_34'
 'PAT_1702' 'PAT_398' 'PAT_953' 'PAT_478' 'PAT_663' 'PAT_1509' 'PAT_782'
 'PAT_257' 'PAT_397' 'PAT_185' 'PAT_1333' 'PAT_1027' 'PAT_375' 'PAT_495'
 'PAT_587' 'PAT_364' 'PAT_39' 'PAT_59' 'PAT_338' 'PAT_1324' 'PAT_672'
 'PAT_832' 'PAT_1364' 'PAT_192' 'PAT_831' 'PAT_291' 'PAT_346' 'PAT_372'
 'PAT_462' 'PAT_513' 'PAT_1031' 'PAT_1729' 'PAT_510' 'PAT_1875' 'PAT_806'
 'PAT_307' 'PAT_1042' 'PAT_1141' 'PAT_1476' 'PAT_766' 'PAT_1639'
 'PAT_1329' 'PAT_652' 'PAT_2061' 'PAT_1374' 'PAT_235' 'PAT_490' 'PAT_171'
 'PAT_1350'