In [None]:
!pip install /kaggle/input/pyradd/pyradiomics/*.whl
!yes | dpkg -i --force-depends /kaggle/input/pyvips-python-and-deb-package/linux_packages/archives/*.deb
!pip install pyvips -f /kaggle/input/pyvips-python-and-deb-package/python_packages/ --no-index
!pip list | grep pyvips
!cp -r /kaggle/input/yyyyyyyyyyyy/pyradiomics-3.0.1 /kaggle/working
!pip install --no-index --find-links /kaggle/working/pyradiomics-3.0.1/ -r requirements.txt
%cd /kaggle/working/pyradiomics-3.0.1
!python setup.py build_ext --inplace
import pyvips
from radiomics import featureextractor
from radiomics import imageoperations

In [None]:
import os
import gc
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras.layers as l
from PIL import Image
from tqdm.auto import tqdm
import tifffile

In [None]:
# Disable the decompression bomb limit
Image.MAX_IMAGE_PIXELS = None

In [None]:
df = pd.read_csv("/kaggle/input/mayo-clinic-strip-ai/test.csv")
# df = pd.read_csv("/kaggle/input/mayo-clinic-strip-ai/other.csv")

In [None]:
tooBig = []

In [None]:
def get_files_by_size(folder_path):
    """
    Collect every file below a folder, largest first.

    Args:
        folder_path: Directory that is walked recursively.

    Returns:
        A list of (file_path, file_size) tuples ordered by decreasing size.
        Files whose size cannot be read (OSError) are left out.
    """
    sized_entries = []
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            full_path = os.path.join(current_dir, name)
            try:
                sized_entries.append((full_path, os.path.getsize(full_path)))
            except OSError:
                # Unreadable entries (e.g. permission problems) are skipped on purpose.
                continue
    # sorted() is stable, so equally sized files keep their walk order.
    return sorted(sized_entries, key=lambda entry: entry[1], reverse=True)

# Example usage
folder_path = "/kaggle/input/mayo-clinic-strip-ai/test"
sorted_files = get_files_by_size(folder_path)

In [None]:
# NOTE: np.array on a list of (path, size) tuples produces a string array, so the
# sizes are stored as text from here on; only column 0 (the paths) is read afterwards.
sorted_files = np.array(sorted_files)

In [None]:
def tile(img, sz=128, N=16):
    """Cut an image into square tiles and keep the N darkest ones.

    The image is padded with 255 on both sides of each spatial axis so that its
    height and width become multiples of ``sz``. It is then split into
    ``sz`` x ``sz`` tiles. If fewer than ``N`` tiles exist, all-255 tiles are
    appended. Tiles are ranked by the sum of their pixel values (smallest sum
    first) and the first ``N`` are returned.

    Args:
        img: Array of shape (height, width, channels). Any channel count is
            accepted (the original implementation only worked for 3).
        sz: Edge length of a tile in pixels.
        N: Number of tiles to return.

    Returns:
        Array of shape (N, sz, sz, channels), ordered from the smallest to the
        largest pixel sum.
    """
    height, width, channels = img.shape
    pad0, pad1 = (sz - height % sz) % sz, (sz - width % sz) % sz
    img = np.pad(
        img,
        [[pad0 // 2, pad0 - pad0 // 2], [pad1 // 2, pad1 - pad1 // 2], [0, 0]],
        constant_values=255,
    )
    # (rows, sz, cols, sz, C) -> (rows, cols, sz, sz, C) -> (rows*cols, sz, sz, C)
    img = img.reshape(img.shape[0] // sz, sz, img.shape[1] // sz, sz, channels)
    img = img.transpose(0, 2, 1, 3, 4).reshape(-1, sz, sz, channels)
    if len(img) < N:
        # Not enough tiles: top up with blank (all-255) tiles.
        img = np.pad(img, [[0, N - len(img)], [0, 0], [0, 0], [0, 0]], constant_values=255)
    # Smallest pixel sum == darkest tile.
    idxs = np.argsort(img.reshape(img.shape[0], -1).sum(-1))[:N]
    return img[idxs]

def save_dataset(
    df: np.ndarray,
    N=16,
    max_size=20000,
    crop_size=1024,
    image_dir='../input/mayo-clinic-strip-ai/train',
    out_dir='./train_data',
):
    """Tile every slide in ``df`` and write the tiles to ``out_dir`` as JPEGs.

    For each path a pyvips thumbnail is requested with ``max_size`` as both
    width and height, converted to a NumPy array and passed to ``tile``. Each
    returned tile is written as ``{out_dir}/{image_id}_{idx}.jpg`` at JPEG
    quality 100, where ``image_id`` is the file name without its extension.

    Args:
        df: Sequence of image file paths (despite the name, not a DataFrame).
        N: Number of tiles kept per image (forwarded to ``tile``).
        max_size: Width and height passed to ``pyvips.Image.thumbnail``.
        crop_size: Tile edge length in pixels (forwarded to ``tile`` as ``sz``).
        image_dir: Not read by this function; kept so existing calls that pass
            it keep working.
        out_dir: Directory that receives the JPEG tiles; created if missing.
    """
    os.makedirs(out_dir, exist_ok=True)

    for i, image_path in tqdm(enumerate(df), total=len(df)):
        image_id = os.path.splitext(os.path.basename(image_path))[0]
        print(f"[{i+1}/{len(df)}] image_id: {image_id}")

        image = pyvips.Image.thumbnail(image_path, max_size, height=max_size)
        tiles = tile(np.array(image), sz=crop_size, N=N)

        for idx, img in enumerate(tiles):
            # The array is treated as RGB here; cv2.imwrite expects BGR channel order.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            cv2.imwrite(f"{out_dir}/{image_id}_{idx}.jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 100])

        # Drop the large arrays before loading the next slide.
        del image, tiles
        gc.collect()

In [None]:
# Tile every test slide (paths taken from sorted_files, largest file first) into
# 16 JPEG crops of 1024x1024 pixels under /kaggle/working/test.
# NOTE: image_dir is accepted by save_dataset but never read by it.
save_dataset(
    sorted_files[:,0],
    N=16, 
    max_size=20000,
    crop_size=1024, 
    image_dir="/kaggle/input/mayo-clinic-strip-ai/test", 
    out_dir=f'/kaggle/working/test'
)
# Alternative run on the "other" image set (disabled):
# save_dataset(
#     df,
#     N=16, 
#     max_size=20000,
#     crop_size=1024, 
#     image_dir="/kaggle/input/mayo-clinic-strip-ai/other", 
#     out_dir=f'/kaggle/working/test'
# )

In [None]:
from sklearn.model_selection import train_test_split
import sys
import zipfile
from IPython.display import FileLink
from pathlib import Path
import albumentations as A
import SimpleITK as sitk
import csv
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
# NOTE: this rebinds the name `Image`, which an earlier cell imported from PIL,
# to IPython.display.Image for the rest of the notebook.
from IPython.display import Image, display
import matplotlib.cm as cm
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
train_data_df = pd.read_csv("/kaggle/input/pyradiomcs-features/pyradiomics_features.csv")

In [None]:
train_data_df = train_data_df.sample(frac=1, random_state=69)

In [None]:
# for column in train_data_df.columns:
#     train_data_df[column] = train_data_df[column].fillna(0)
    
train_data_df = train_data_df.replace(np.nan, 0)

In [None]:
train_data_df = train_data_df.reset_index(drop=True)

In [None]:
result_df = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/train.csv')

In [None]:
def _weighted_mc_log_loss(y_true, y_pred, epsilon=1e-15):
    class_cnt = [sum(int(val == cl) for val in y_true) for cl in range(2)]
    w = [0.5 for _ in range(2)]
    return -sum(
        w[cl] * sum(
            (y == cl) / class_cnt[cl] * np.log(max(min(y_hat, 1 - epsilon), epsilon))
            for y, y_hat in zip(y_true, y_pred[:, cl])
        )
        for cl in range(2)
    ) / sum(w[cl] for cl in range(2))

In [None]:
# Balance the two classes by randomly discarding rows from the majority class.
# Label encoding used in this notebook: 0 = CE, 1 = LAA.

# Fixed seed so the discarded rows (and therefore the trained model) are the
# same on every run; 69 matches the seed used for the shuffle above.
BALANCE_SEED = 69

labels = train_data_df['Label']
num_CE = int((labels == 0).sum())
num_LAA = int((labels == 1).sum())

# Positive: surplus of CE rows; negative: surplus of LAA rows.
count_diff = num_CE - num_LAA

if count_diff != 0:
    majority_label = 0 if count_diff > 0 else 1
    surplus_rows = train_data_df[labels == majority_label].sample(
        abs(count_diff), random_state=BALANCE_SEED
    )
    train_data_df = train_data_df.drop(surplus_rows.index)

# Write the balanced DataFrame to a new CSV file
train_data_df.to_csv("modified_train_data.csv", index=False)

print("Number of records with 'CE' and 'LAA' labels are now equal.")
print("New size of train_data_df:", len(train_data_df))



In [None]:
#train_df, test_df = train_test_split(result_df, test_size=0.2, random_state=42, stratify = result_df['label'])

In [None]:
"""# Function to extract prefix
def extract_prefix(image_id):
    return '_'.join(image_id.split('_')[:-1])


# Initialize DataFrames to store the results
train_big_df = pd.DataFrame()
test_big_df = pd.DataFrame()

# Extract the 'Image_ID' column
image_id_column = train_data_df['Image_ID']

# Remove non-numeric data from all columns except 'Image_ID'
for col in train_data_df.columns:
    if col != 'Image_ID':
        train_data_df[col] = pd.to_numeric(train_data_df[col], errors='coerce')

# Add back the 'Image_ID' column
train_data_df['Image_ID'] = image_id_column

# Split the data into train and test based on the prefix of 'Image_ID'
train_mask = train_data_df['Image_ID'].apply(lambda x: extract_prefix(x) in train_df['image_id'].values)
train_big_chunk_df = train_data_df[train_mask]
test_big_chunk_df = train_data_df[~train_mask]

# Concatenate DataFrames for the current chunk with the main DataFrames
train_big_df = pd.concat([train_big_df, train_big_chunk_df], ignore_index=True)
test_big_df = pd.concat([test_big_df, test_big_chunk_df], ignore_index=True)

# Optional: Drop duplicate rows from the resulting DataFrames
train_big_df = train_big_df.drop_duplicates()
test_big_df = test_big_df.drop_duplicates()

# Optional: Reset the index of the resulting DataFrames
train_big_df.reset_index(drop=True, inplace=True)
test_big_df.reset_index(drop=True, inplace=True)"""

**Run this cell if you want to train on a selected subset of the features only.**

In [None]:
# Work on the (balanced) training table.
train_big_df = train_data_df

# Candidate features: everything after the first two columns, restricted to
# columns in which every value can be parsed as a number.
features = train_big_df.iloc[:, 2:]
is_fully_numeric = features.apply(lambda column: pd.to_numeric(column, errors='coerce').notnull().all())
numeric_cols = features.columns[is_fully_numeric]
features = features[numeric_cols]

# 'Label' is the target column
target_column = 'Label'
class_labels = train_big_df[target_column].unique()

# Mann-Whitney U statistic for every feature and every unordered pair of labels.
u_test_results = {}
for feature_name in features.columns:
    per_pair = {}
    for first_pos, first_label in enumerate(class_labels):
        for second_label in class_labels[first_pos + 1:]:
            first_values = train_big_df.loc[train_big_df[target_column] == first_label, feature_name]
            second_values = train_big_df.loc[train_big_df[target_column] == second_label, feature_name]
            u_statistic = mannwhitneyu(first_values, second_values)[0]  # p-value not needed here
            per_pair[f"{first_label} vs {second_label}"] = {'statistic': u_statistic}
    u_test_results[feature_name] = per_pair

u_test_results_df = pd.DataFrame(u_test_results)

# Every tested feature is kept: no p-value filtering happens in this cell.
# The target column is placed first.
selected_features_list = ['Label'] + u_test_results_df.columns.tolist()

# Reduce the training table to the target plus the numeric features.
train_big_df = train_big_df[selected_features_list]

In [None]:
def calculate_p_value(feature, target, data):
    """Mann-Whitney U test p-value for one feature between label 0 and label 1.

    Args:
        feature: Name of the feature column in ``data``.
        target: Name of the label column in ``data``.
        data: DataFrame holding both columns.

    Returns:
        The p-value reported by ``scipy.stats.mannwhitneyu`` for the feature
        values of rows with label 0 versus rows with label 1.
    """
    labels = data[target]
    label0_values = data.loc[labels == 0, feature]
    label1_values = data.loc[labels == 1, feature]
    _statistic, p_value = mannwhitneyu(label0_values, label1_values)
    return p_value

# Target variable
target_variable = 'Label'

# Significance threshold applied to the Mann-Whitney U p-values
threshold = 0.07

# Columns that are identifiers or targets rather than features. They are
# excluded by name: skipping "the first two columns" by position would drop a
# real feature, because train_big_df is laid out as ['Label', feature, ...].
non_feature_columns = {target_variable, 'Image_ID'}

# p-value of every feature column (label 0 vs label 1)
p_values = {
    feature: calculate_p_value(feature, target_variable, train_big_df)
    for feature in train_big_df.columns
    if feature not in non_feature_columns
}

# Features whose p-value is at or below the threshold, in column order
significant_features = [feature for feature, p_value in p_values.items() if p_value <= threshold]

In [None]:
train_red = train_big_df.drop(columns=[col for col in train_big_df.columns if col not in significant_features and col not in ["Image_ID", "Label"]])

train_red

In [None]:
#test_red = test_big_df.drop(columns=[col for col in test_big_df.columns if col not in significant_features and col not in ["Image_ID", "Label"]])

#test_red

In [None]:
y_train = train_red['Label']
X_train = train_red.drop('Label', axis=1)

In [None]:
#y_test = test_red['Label']
#X_test = test_red.drop('Label', axis=1)

Gradient boosting classifier (scikit-learn `GradientBoostingClassifier`, not the XGBoost library)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
# NOTE: despite the variable name, this is scikit-learn's GradientBoostingClassifier,
# not a model from the xgboost library.
XGB = GradientBoostingClassifier(n_estimators=200, loss='log_loss', max_depth=4, ccp_alpha=0.0007).fit(X_train, y_train)
# The accuracy and the report below are computed on the training data itself;
# the held-out evaluation is commented out.
print(XGB.score(X_train, y_train))
print(classification_report(y_train, XGB.predict(X_train), target_names=['CE', 'LAA']))
#print(XGB.score(X_test, y_test))
#print(classification_report(y_test, XGB.predict(X_test),  target_names=['CE', 'LAA']))

In [None]:
_weighted_mc_log_loss(y_train, XGB.predict_proba(X_train))

# Prediction

In [None]:
def parse_images(folder):
    """List the tile JPEGs in a folder together with their ids.

    Tile files are expected to be named ``<part1>_<part2>_<instance>.jpg``,
    e.g. ``006388_0_3.jpg``. Entries that do not end in ``.jpg`` (for example
    a ``.ipynb_checkpoints`` directory) are ignored instead of breaking the
    integer parse.

    Args:
        folder: Directory containing the tile images.

    Returns:
        DataFrame with the columns ``image_path`` (full path), ``image_id``
        (first two underscore-separated name components, e.g. ``006388_0``) and
        ``instance_id`` (last component as int), sorted by image_id and then
        instance_id, with a fresh 0..n-1 index.

    Raises:
        ValueError: If a ``.jpg`` file name does not end in an integer component.
    """
    records = []
    for name in os.listdir(folder):
        stem, ext = os.path.splitext(name)
        if ext.lower() != '.jpg':
            continue
        parts = stem.split('_')
        records.append({
            'image_path': os.path.join(folder, name),
            'image_id': '_'.join(parts[:2]),
            'instance_id': int(parts[-1]),
        })

    # Explicit columns keep the frame well-formed even when no tile was found.
    df = pd.DataFrame(records, columns=['image_path', 'image_id', 'instance_id'])
    return df.sort_values(['image_id', 'instance_id']).reset_index(drop=True)

def merge_image_info(image_df, info_df):
    """Attach the per-image metadata in ``info_df`` to every row of ``image_df``.

    A left join on ``image_id`` is used, so all rows of ``image_df`` are kept;
    rows without a match get NaN in the metadata columns. The result carries a
    fresh 0..n-1 index.
    """
    joined = pd.merge(image_df, info_df, how='left', on='image_id')
    return joined.reset_index(drop=True)

In [None]:
test_csv = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/test.csv')
# test_csv = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/other.csv')

In [None]:
df = merge_image_info(parse_images('/kaggle/working/test'), test_csv)

In [None]:
df

In [None]:
# Initialize PyRadiomics feature extractor (constructed without arguments)
extractor = featureextractor.RadiomicsFeatureExtractor()

# HSV bounds handed to cv2.inRange as (lower, upper) per channel.
# Yellow / orange / brown ranges: combined into the RBC mask.
lower_yellow = np.array([20, 100, 100])
upper_yellow = np.array([30, 255, 255])
lower_orange = np.array([5, 100, 100])
upper_orange = np.array([20, 255, 255])
lower_brown = np.array([10, 50, 20])
upper_brown = np.array([40, 255, 255])

# Blue / purple ranges: combined into the WBC mask.
lower_blue = np.array([90, 50, 50])
lower_purple = np.array([120, 50, 50])
upper_purple = np.array([150, 255, 255])
lower_dark_purple = np.array([100, 50, 50])
upper_dark_purple = np.array([120, 255, 255])
# NOTE(review): the upper hue of 340 is larger than any 8-bit value, so this
# range presumably only bounds the hue from below - confirm this is intended.
l_blu5 = np.array([67, 50, 50])
u_blu5 = np.array([340, 100, 255])

# The bounds below are not referenced by the extraction loop; they are kept only
# so the names stay defined.
l_blu = np.array([0, 41, 38])
u_blu = np.array([36, 100, 100])
l_blu2 = np.array([0, 0, 0])
u_blu2 = np.array([36, 100, 10])
l_blu3 = np.array([300, 50, 50])
u_blu3 = np.array([320, 100, 100])
l_blu4 = np.array([240, 50, 50])
u_blu4 = np.array([180, 100, 100])

# Lower and upper threshold values for white in HSV (treated as background)
lower_white = np.array([0, 0, 200])
upper_white = np.array([179, 30, 255])


def _tissue_masks(hsv_image):
    """Return the RBC, WBC and fibrin/platelet masks (uint8, values 0/1) of one HSV tile."""
    # RBC: union of the orange, yellow and brown ranges (0/255 while combining).
    rbc_255 = cv2.inRange(hsv_image, lower_orange, upper_orange)
    rbc_255 = cv2.bitwise_or(rbc_255, cv2.inRange(hsv_image, lower_yellow, upper_yellow))
    rbc_255 = cv2.bitwise_or(rbc_255, cv2.inRange(hsv_image, lower_brown, upper_brown))

    # WBC: union of the blue-to-purple, purple, dark-purple and l_blu5/u_blu5 ranges.
    wbc_255 = cv2.inRange(hsv_image, lower_blue, upper_purple)
    wbc_255 = cv2.bitwise_or(wbc_255, cv2.inRange(hsv_image, lower_purple, upper_purple))
    wbc_255 = cv2.bitwise_or(wbc_255, cv2.inRange(hsv_image, lower_dark_purple, upper_dark_purple))
    wbc_255 = cv2.bitwise_or(wbc_255, cv2.inRange(hsv_image, l_blu5, u_blu5))

    # Fibrin/platelets: every pixel that is neither RBC, nor WBC, nor white.
    white_255 = cv2.inRange(hsv_image, lower_white, upper_white)
    fp_255 = cv2.bitwise_and(cv2.bitwise_not(rbc_255), cv2.bitwise_not(wbc_255))
    fp_255 = cv2.bitwise_and(fp_255, cv2.bitwise_not(white_255))

    # Masks are passed to the extractor with 1 marking the region of interest.
    return tuple((mask > 0).astype(np.uint8) for mask in (rbc_255, wbc_255, fp_255))


# One dict of PyRadiomics features per successfully processed tile
pyradiomics_data = []

for i in range(len(df)):
    try:
        stained_image = cv2.imread(df["image_path"][i])
        if stained_image is None:
            # cv2.imread reports an unreadable file by returning None instead of
            # raising; turn that into an error handled like any other tile failure.
            raise IOError(f"could not read image file {df['image_path'][i]}")

        hsv_image = cv2.cvtColor(stained_image, cv2.COLOR_BGR2HSV)
        rbc_mask, wbc_mask, fibrin_platelets_mask = _tissue_masks(hsv_image)

        # NOTE(review): cv2.imread returns BGR data, yet the grayscale conversion
        # uses COLOR_RGB2GRAY. Left unchanged because the training features were
        # presumably extracted the same way - confirm before changing.
        original_image_sitk = sitk.GetImageFromArray(cv2.cvtColor(stained_image, cv2.COLOR_RGB2GRAY))

        # Same key order as before: Image_ID, then RBC_*, WBC_*, FP_* features.
        feature_row = {'Image_ID': f"{df['image_id'][i]}_{df['instance_id'][i]}"}
        for prefix, mask in (("RBC", rbc_mask), ("WBC", wbc_mask), ("FP", fibrin_platelets_mask)):
            mask_features = extractor.execute(original_image_sitk, sitk.GetImageFromArray(mask))
            feature_row.update({f"{prefix}_{k}": v for k, v in mask_features.items()})

        pyradiomics_data.append(feature_row)
    except Exception as e:
        # A failing tile is reported and skipped so the remaining tiles are still processed.
        print(f"{i})Error extracting features for tile {df['instance_id'][i]} of image {df['image_id'][i]}: {e}")
        continue

# Convert list of dictionaries to DataFrame
pyradiomics_df = pd.DataFrame(pyradiomics_data)
pyradiomics_df.to_csv('/kaggle/working/pyrad.csv')
print("All PyRadiomics features saved successfully.")

In [None]:
pyradiomics_df = pd.read_csv('/kaggle/working/pyrad.csv')

In [None]:
pyradiomics_df = pyradiomics_df.drop(columns=[col for col in pyradiomics_df.columns if col not in significant_features and col not in ["Image_ID", "Label"]])

In [None]:
# Tile identifiers, kept aside for grouping the predictions per patient later.
ids = pyradiomics_df['Image_ID']
# Select the model inputs by name, in the order of significant_features (the
# order of the training columns), so the scaler and the classifier always see
# the same feature layout. A feature missing from the extracted test features
# becomes an all-NaN column instead of silently shifting the columns.
X_test = pyradiomics_df.reindex(columns=significant_features)

In [None]:
def extract_prefix(image_id):
    """Drop the last two underscore-separated components of an id.

    Example: ``"006388_0_3"`` -> ``"006388"``. Ids with fewer than three
    components yield an empty string.
    """
    components = image_id.split('_')
    kept = components[:-2]
    return '_'.join(kept)

# for i in range(len(ids)):
#     ids.iloc[i] = extract_prefix(ids.iloc[i])

ids = ids.apply(extract_prefix)


In [None]:
X_test = X_test.replace(np.nan, 0)

In [None]:
X_test = scaler.transform(X_test)

In [None]:
preds = np.array(XGB.predict_proba(X_test))

In [None]:
ids = ids.to_numpy()

In [None]:
# Build the per-tile prediction table column by column. Concatenating the string
# ids with the probabilities into one array would turn every value into a generic
# Python object, so CE and LAA would lose their float dtype before averaging.
preds = pd.DataFrame({'patient_id': ids, 'CE': preds[:, 0], 'LAA': preds[:, 1]})

In [None]:
preds = preds.groupby('patient_id').mean()

In [None]:
mce = preds['CE'].mean()
mlaa = preds['LAA'].mean()

In [None]:
# Give every image listed in tooBig the mean prediction (mce, mlaa), keyed by the
# image name with its last underscore-separated component removed.
# NOTE(review): tooBig is initialised as an empty list and no visible cell appends
# to it, so this loop currently does nothing.
for img in tooBig:
    preds.loc['_'.join(img.split('_')[:-1])] = [mce, mlaa]

In [None]:
# Replace NaN probabilities with the mean CE / LAA prediction (mce, mlaa), not with 0.5
preds.fillna({'CE': mce, 'LAA': mlaa}, inplace=True)

In [None]:
invalid_values_mask = (preds['CE'] > 1) | (preds['CE'] < 0) | (preds['LAA'] > 1) | (preds['LAA'] < 0)
preds.loc[invalid_values_mask, ['CE', 'LAA']] = [mce, mlaa]

In [None]:
folder_path = '/kaggle/input/mayo-clinic-strip-ai/test'
files = os.listdir(folder_path)
file_names = [os.path.splitext(file)[0] for file in files]
modified_list = ['_'.join(item.split('_')[:-1]) for item in file_names]

In [None]:
pred_ids = preds.index
for patient_id in modified_list:
    # Check if patient_id is not in the DataFrame index
    if patient_id  not in pred_ids:
        preds.loc[patient_id] = [mce, mlaa]
        #print("0")

In [None]:
preds = preds.reset_index()
preds = preds.sort_values(by='patient_id')

In [None]:
preds = preds.drop_duplicates(subset='patient_id', keep='first')

In [None]:
preds = preds.reset_index(drop=True)

In [None]:
preds.to_csv('/kaggle/working/submission.csv',index=False)