In [None]:
#@title Define if we are on Colab and mount drive { display-mode: "form" }
# Shared settings dictionary that the cells below read from and write to.
run_params = {}
try:
  # google.colab is only importable inside a Colab runtime.
  from google.colab import drive
  drive.mount('/content/gdrive')
  run_params['IN_COLAB'] = True
except Exception:
  # Import or mount failed: treat the session as non-Colab.
  # `Exception` (instead of a bare `except`) lets KeyboardInterrupt and
  # SystemExit propagate so the cell can still be interrupted.
  run_params['IN_COLAB'] = False

In [None]:
#@title (COLAB ONLY) Clone GitHub repo { display-mode: "form" }

# Only fetch the project sources when run_params flags a Colab runtime.
if run_params['IN_COLAB']:
  # Shell escape: clone the repository into the current working directory.
  # NOTE(review): re-running this cell after the %cd below would clone again
  # inside the checkout — confirm it is only meant to run once per session.
  !git clone https://github.com/lluissalord/radiology_ai.git

  # Switch the notebook's working directory to the cloned repository.
  %cd radiology_ai

In [None]:
#@title Setup environment and Colab general variables { display-mode: "form" }
%%capture
%run colab_pip_setup.ipynb

In [None]:
import os

In [None]:
from organize.data_files import *
from organize.relation import *
from organize.templates import *
from organize.utils import *

# Group names handed to organize_folders / generate_template / modify_template.
groups = ['Daniel_Berlanga', 'David_Codina', 'Federico_Yanez', 'Guillem_Navarro', 'Jose_Colombo', 'Raul_Franco', 'Hugo_Munoz', 'Tomas_Maio']
# Passed as `subgroup_length` to the organize/template helpers.
subgroup_length = 20
# Built with os.path.join (like the other PATH_PREFIX paths in this notebook)
# so it also works when PATH_PREFIX has no trailing separator.
relation_filepath = os.path.join(run_params['PATH_PREFIX'], 'relation.csv')
# Prefix forwarded as `filename_prefix` to the organize/template helpers.
filename_prefix = 'IMG_'
# Forwarded as `copy=` to organize_folders.
copy = True

# Forwarded as `able_overwrite=` to generate_template.
able_overwrite = False

# NOTE(review): the factor 6 is not explained anywhere in this notebook —
# confirm what it stands for before changing groups or subgroup_length.
run_params['NUM_SAMPLES_ORGANISE'] = 6 * subgroup_length * len(groups)

In [None]:
# Load the labels table; its first column becomes the index.
metadata_labels_path = os.path.join(run_params['PATH_PREFIX'], 'metadata_labels.csv')
if not os.path.exists(metadata_labels_path):
    # The rest of this cell and the merge cell dereference metadata_labels,
    # so fail here with a clear message instead of an AttributeError on None.
    raise FileNotFoundError(f"Labels file not found: {metadata_labels_path}")
metadata_labels = pd.read_csv(metadata_labels_path, index_col=0)

# Define which column to use as the prediction
pred_col = 'Final_pred' if 'Final_pred' in metadata_labels.columns else 'Pred'

# Set the manually labelled with Prob = 1
metadata_labels.loc[metadata_labels['Prob'].isnull(), 'Prob'] = 1

# Earlier selection rule, kept for reference (metadata_labels_sent is now built in the merge cell):
# select the NUM_SAMPLES_ORGANISE files with highest Prob that are labelled as `ap` and metadata said so too
# metadata_labels_sent = metadata_labels[(metadata_labels['Label'] == 'ap') & (metadata_labels[pred_col] == 'ap')].sort_values('Prob', ascending=False).iloc[:run_params['NUM_SAMPLES_ORGANISE']]

In [None]:
# Read all the sources
# os.path.join keeps this consistent with the other PATH_PREFIX paths and
# works whether or not PATH_PREFIX ends with a separator.
metadata_save_path = os.path.join(run_params['PATH_PREFIX'], 'metadata_raw.csv')
metadata_df = pd.read_csv(metadata_save_path)
df_all = pd.read_excel(
    os.path.join(run_params['PATH_PREFIX'], 'all.xlsx'),
    dtype={'ID': 'string', 'Target': 'string'},
    engine='openpyxl',
)
relation_df = open_name_relation_file(relation_filepath, sep=',')

# Filter metadata to only sent images fulfilling the condition:
# institution name contains 'coslada' or 'cugat', OR accession number starts with '885'.
# `na=False` makes missing institution names count as "no match" directly,
# instead of casting NaN to True and masking it out again with notnull().
institution = metadata_df.InstitutionName.str.lower()
from_known_institution = (
    institution.str.contains('coslada', na=False)
    | institution.str.contains('cugat', na=False)
)
has_885_accession = metadata_df.AccessionNumber.astype('str').str.startswith('885')
# Explicit grouping: the accession rule is an alternative to the institution rule.
filter_metadata_df = metadata_df[from_known_institution | has_885_accession]

# Merge all the sources
df_1 = metadata_labels.copy()
# Re-key the labels by bare file name with the last 4 characters dropped.
# NOTE(review): assumes a 3-letter extension on the label index — confirm.
df_1.index = pd.Series(metadata_labels.index, name='fname').apply(lambda x: Path(x).name[:-4])
# Marker frame: one row per file that passed the filter above.
df_2 = pd.DataFrame(index=filter_metadata_df.fname.apply(lambda x: Path(x).name))
df_2['check_condition'] = True
df_merge = df_1.merge(df_2, how='left', right_index=True, left_index=True)
relation_df.index = relation_df.Original_Filename
df_merge = df_merge.merge(relation_df, how='left', right_index=True, left_index=True)
# Annotator = second-to-last component of the relation Path; non-string
# (missing) paths are passed through unchanged.
df_merge['annotator'] = df_merge.apply(
    lambda row: row['Path'].split('/')[-2] if type(row['Path']) is str else row['Path'],
    axis=1,
)
# NOTE(review): 'ID' must already be a column here (df_all, which also has an
# 'ID', is only merged further down) — confirm it comes from relation_df.
df_merge['n_annotator'] = df_merge.groupby('ID')['annotator'].transform('count')
df_all['check_sent'] = True
# df_merge = df_merge.merge(df_all[~df_all['ID'].duplicated()], how='left', left_on='Filename', right_on='ID')
df_merge = df_merge.merge(df_all, how='left', left_on='Filename', right_on='ID')
# Turn these two columns into booleans: True wherever a value was present.
df_merge['Incorrect_image'] = df_merge['Incorrect_image'].notnull()
df_merge['Not_enough_quality'] = df_merge['Not_enough_quality'].notnull()
# The column-based merge above resets the index; restore the original one.
# This raises if any merge duplicated rows (length mismatch).
df_merge.index = metadata_labels.index

# check_condition holds True or NaN after the left merge, so compare against
# True explicitly; computed once and reused for the pivot and the selection.
condition_met = df_merge['check_condition'] == True

# Resulting pivot table on current data
display(
    df_merge[condition_met].pivot_table(
      # index=['Label', 'Raw_pred', 'Pred', pred_col],
      index=['Raw_pred'],
      values=['Prob', 'check_condition', 'check_sent', 'Target','Incorrect_image'],
      aggfunc={'Prob': 'mean', 'check_condition': 'sum', 'check_sent': 'sum', 'Target':['count',lambda x: (x != '0').sum(), lambda x: dict(x.value_counts()[x.value_counts().index != '0'])], 'Incorrect_image':'sum'})
)
# Condition to be sent: prediction contains 'ap' and the file passed the
# metadata filter, ordered by decreasing Prob.
predicted_ap = df_merge[pred_col].str.contains('ap', na=False)
metadata_labels_sent = df_merge[predicted_ap & condition_met].sort_values('Prob', ascending=False)#.iloc[:run_params['NUM_SAMPLES_ORGANISE']]

In [None]:
# Distribute equally the files on RAW_FOLDER which are contained on metadata_labels (labelled as `ap`).
# Returns the updated relation table and the number of newly organised files.

relation_df, num_new_files = organize_folders(
    run_params['RAW_FOLDER'],
    run_params['ORGANIZE_FOLDER'],
    relation_filepath,
    reset=False,
    groups=groups,
    subgroup_length=subgroup_length,
    filename_prefix=filename_prefix,
    force_extension='.dcm',
    copy=copy,
    metadata_labels=metadata_labels_sent,
    debug=False,
)

In [None]:
from fastai.data.all import *

relation_df = open_name_relation_file(relation_filepath, sep=',')

# Modify DICOM metadata of all the new files to rename patient info as the DICOM filename
# dicom_files = get_files(ORGANIZE_FOLDER, extensions='.dcm')

# dicom_files = L(list(relation_df[-num_new_files:].apply(lambda row: Path(os.path.join(row.Path, row.Filename) + '.dcm'), axis=1).values))
# rename_patient(dicom_files)

# Only blocks numbered above this id are processed below.
last_block_id = 452
prefix = 'IMG_'

# The block id is the last component of each relation Path, read as an integer.
block_ids = relation_df["Path"].str.split('/').str[-1].astype(int)
relations_to_rename_patient = relation_df[block_ids > last_block_id]
# One '<Path>/<Filename>.dcm' entry per selected relation row.
dicom_files = L([
    Path(os.path.join(folder, stem) + '.dcm')
    for folder, stem in zip(relations_to_rename_patient['Path'], relations_to_rename_patient['Filename'])
])
rename_patient(dicom_files)

In [None]:
# Generate all the missing templates on each folder (Excel output, overwrite controlled by able_overwrite)

generate_template(
    run_params['ORGANIZE_FOLDER'],
    groups,
    subgroup_length,
    filename_prefix=filename_prefix,
    excel=True,
    able_overwrite=able_overwrite,
)

In [None]:
# Apply a transformation to the templates: drop their 'Side' column.
modify_template(
    run_params['ORGANIZE_FOLDER'],
    lambda df: df.drop('Side', axis=1),
    groups,
    subgroup_length,
    excel=True,
)

In [None]:
# Single-folder alternative, kept for reference:
# move_blocks(run_params['ORGANIZE_FOLDER'], new_folder='Test_participant', blocks=[31,32,30,29], relation_filepath=relation_filepath, template_extension='xlsx')
# Hand the listed blocks over to the two test-participant folders.
move_distribute_blocks(
    run_params['ORGANIZE_FOLDER'],
    new_folders=['Test_participant', 'Test_participant_2'],
    blocks=[31,32,30,29],
    relation_filepath=relation_filepath,
    template_extension='xlsx',
)

In [None]:
# Reload the relation file that the move/copy below is based on
relation_df = open_name_relation_file(
    relation_filepath,
    sep=',',
)

# Copy (not move) according to the relation file, with to_raw disabled.
move_relation(
    relation_filepath,
    copy=True,
    to_raw=False,
)