In [8]:
import pandas as pd
import numpy as np

In [9]:
# Attach the label columns from the labelled training table to one split file
# and write the result next to it as '<prefix>_<middle>_<suffix>_update.csv'.
df_src = pd.read_csv('sorted_training_dataset_with_labels.csv')

# The split file to update is '<prefix>_<middle>_<suffix>.csv'.
prefix = 'src'
middle = 'test'
suffix = 4
# Deliberately not named `dir`: that would shadow the builtin dir() for the
# rest of the kernel session.
dest_csv_path = f'{prefix}_{middle}_{suffix}.csv'
df_dest = pd.read_csv(dest_csv_path)

# Label columns copied from the source table into the destination.
column_list = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']

# Criteria (key) columns used to match rows:
# Src: filename, study_instance_uid
# Dest: instance_name, bag_name
key_columns = ['instance_name', 'bag_name']

# Rename the key columns in df_src so they match df_dest
df_src = df_src.rename(columns={'filename': 'instance_name', 'study_instance_uid': 'bag_name'})

# Keep only the keys and labels. Exact duplicate rows carry no extra
# information and would multiply destination rows in the merge, so drop them.
labels = df_src[key_columns + column_list].drop_duplicates()

# Left merge: every destination row is kept; rows without a match get NaN labels.
# validate='many_to_one' raises pandas.errors.MergeError if a key still maps to
# several (conflicting) label rows, instead of silently duplicating instances.
df_merged = pd.merge(df_dest, labels,
                     on=key_columns,
                     how='left',
                     validate='many_to_one')

# Update df_dest with the merged data
df_dest = df_merged

# Save the updated DataFrame to a new CSV file
df_dest.to_csv(f'{prefix}_{middle}_{suffix}_update.csv', index=False)

In [10]:
# column_list = ['any', 'extradural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']
# src_column_list = ['patient_ICH', 'patient_EDH', 'patient_IPH', 'patient_IVH', 'patient_SAH', 'patient_SDH']
#
# df_src = 'cq500.csv'
# prefix = 'cq500'
# middle = 'test'
# suffix = 2
# dir = f'{prefix}_{middle}_{suffix}.csv'
# df_dest = pd.read_csv(dir)
# # Remove '-' in the bag_name column
# df_dest['bag_name'] = df_dest['bag_name'].str.replace('-', '')
#
# df_src = pd.read_csv(df_src)
# df_src = df_src.rename(columns={'name':'bag_name'})
#
# df_merged = pd.merge(df_dest, df_src[['bag_name'] + src_column_list],
#                         on=['bag_name'],
#                         how='left')
#
# df_dest = df_merged
# df_dest.to_csv(f'{prefix}_{middle}_{suffix}_update.csv', index=False)

In [11]:
# Number of instances (rows) per bag, ordered from the largest bag down.
df = (
    df_dest
    .groupby('bag_name')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
# Show the ten largest bags.
print(df.head(10))

          bag_name  count
53   ID_fea37ba57c     57
118  ID_ff8e97229a     57
108  ID_ff6f2428a1     52
64   ID_fecc8e4431     50
106  ID_ff5d2ae4ee     48
36   ID_fe6badda38     48
147  ID_fffc71b58c     46
35   ID_fe6ad33d25     46
30   ID_fe597795f5     46
79   ID_fefaf4fb24     45


In [12]:
NUM_INSTANCES = 28  # maximum number of instances kept per bag
MIN_INSTANCES = 10  # bags with fewer instances than this are dropped entirely


def downsample_bag(group):
    """Filter and evenly downsample the rows of a single bag.

    Args:
        group (pandas.DataFrame): the rows of one bag, in the order they
            should be sampled from.

    Returns:
        pandas.DataFrame:
            * an empty frame with the same columns as ``group`` when the bag
              has fewer than ``MIN_INSTANCES`` rows (the bag is discarded);
            * ``group`` unchanged when it has at most ``NUM_INSTANCES`` rows;
            * otherwise exactly ``NUM_INSTANCES`` rows taken at evenly spaced
              positions, always starting with the first row.
    """
    n_instances = len(group)

    # First remove small bags before processing. An empty slice (rather than a
    # bare pd.DataFrame()) keeps the column schema, so the combined result
    # still has its columns even if every bag is filtered out.
    if n_instances < MIN_INSTANCES:
        return group.iloc[0:0]

    # Bags that already fit need no downsampling
    if n_instances <= NUM_INSTANCES:
        return group

    # Build the positions from an integer range scaled by the step. A float
    # step passed straight to np.arange can yield one element too many because
    # of rounding in the length computation, which would exceed NUM_INSTANCES.
    # Since step > 1 here, the rounded positions are strictly increasing and
    # the largest one is at most n_instances - 1.
    step = n_instances / NUM_INSTANCES
    positions = np.round(np.arange(NUM_INSTANCES) * step).astype(int)

    return group.iloc[positions]

# Apply the two-stage filtering (drop small bags, then downsample large ones)
# to every bag. The columns are selected explicitly, grouping key included, so
# that `bag_name` is present in the frame handed to downsample_bag and in the
# result without relying on apply() operating on the grouping columns, which
# newer pandas versions deprecate (and warn about).
downsampled_df = (
    df_dest.groupby('bag_name', group_keys=False)[list(df_dest.columns)]
    .apply(downsample_bag)
    .reset_index(drop=True)
)
# Persist the downsampled split next to the original file
downsampled_df.to_csv(f'{prefix}_{middle}_{suffix}_redundancy.csv', index=False)

  .apply(downsample_bag)


In [13]:
# Instances per bag after downsampling, ordered from the largest bag down.
df = (
    downsampled_df
    .groupby('bag_name')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
# Report the smallest and largest bag sizes that remain.
print(f"Min count: {df['count'].min()}")
print(f"Max count: {df['count'].max()}")

Min count: 24
Max count: 28


In [14]:
# import os
# import shutil
# import pandas as pd
#
# def copy_files_from_df(df, filename_column, src_dir, dest_dir):
#     """
#     Copy files listed in a DataFrame from a source directory (including subfolders) to a destination directory.
#
#     Args:
#     df (pandas.DataFrame): DataFrame containing filenames.
#     filename_column (str): Name of the column in df that contains filenames.
#     src_dir (str): Path to the source directory containing the files (including subfolders).
#     dest_dir (str): Path to the destination directory where files will be copied.
#     """
#     # Create the destination directory if it doesn't exist
#     os.makedirs(dest_dir, exist_ok=True)
#
#     # Iterate through the filenames in the DataFrame
#     for filename in df[filename_column]:
#         # Search for the file in the source directory and its subfolders
#         for root, dirs, files in os.walk(src_dir):
#             if filename in files:
#                 src_path = os.path.join(root, filename)
#                 dest_path = os.path.join(dest_dir, filename)
#
#                 # Create subdirectories in the destination if needed
#                 os.makedirs(os.path.dirname(dest_path), exist_ok=True)
#
#                 # Copy the file
#                 shutil.copy2(src_path, dest_path)
#                 print(f"Copied: {src_path} -> {dest_path}")
#                 break
#         else:
#             print(f"File not found: {filename}")
#
# # Example usage
# df = pd.read_csv('testing_example.csv')
# src_dir = '../../src-ich-mil/'
# dest_dir = 'sa_test'
#
# # The filenames are read from the 'instance_name' column
# copy_files_from_df(df, 'instance_name', src_dir, dest_dir)
