In [1]:
import pandas as pd
import re as re

In [7]:
# Load the per-hospital tables and stack them into one frame.
df_uhw = pd.read_csv('data_tables/uhw_data.csv')
df_medway = pd.read_csv('data_tables/MFT_data.csv')
# Drop the LUS table's 'Read Error' column when it is present.
df_lus = pd.read_csv('data_tables/LUS_data.csv').drop(columns=['Read Error'], errors='ignore')

df_all = pd.concat([df_uhw, df_medway, df_lus], ignore_index=True)

# Discard helper columns that should not reach the combined table. Each is
# dropped only if it exists: 'neglect', and 'Unnamed: 0' (presumably a stray
# index column picked up from one of the CSVs -- TODO confirm).
df_all = df_all.drop(columns=['neglect', 'Unnamed: 0'], errors='ignore')


In [8]:
# Sanity check: list the columns of the combined table.
print(df_all.columns)

Index(['Hospital', 'File Path', 'Patient ID', 'Scan No', 'Total Frames', 'FPS',
       'Scan Label', 'Score'],
      dtype='object')


In [9]:
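# flag duplicates -- superseded draft, kept for reference only; the working
# per-hospital version is in the next cell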

# hospital = df_all['Hospital']
# df_loaded = pd.read_csv(f"{hospital}_data_duplicate_scan_labels.csv")

# # check if video paths are in the duplicates dataframe and add a new column 'is_duplicate' with True/False
# df_all['is_duplicate'] = df_all['File Path'].isin(df_loaded['File Path'])

# df_preferences = pd.read_csv(f"data_tables/{hospital}_duplicate_video_preferences.csv")
# df_all['preferred_duplicate'] = df_all['File Path'].isin(df_preferences['preferred'])



In [10]:
# Prefix used in each hospital's duplicate-table file names. Hospitals not
# listed here use their 'Hospital' value unchanged.
HOSPITAL_FILE_PREFIX = {'UHW': 'uhw', 'JCUH': 'LUS'}


def _parse_video_list(all_videos):
    """Split the text form of a list of paths into individual path strings.

    Strips the surrounding square brackets, removes every single quote and
    splits on ", ". Because all single quotes are removed, a path that itself
    contains an apostrophe would come back without it.
    """
    return all_videos.strip("[]").replace("'", "").split(", ")


def _pick_preferred_video(video_paths):
    """Return the path whose file name ends in the largest number.

    The number is the run of digits immediately before the file extension.
    Paths without such a number are ignored. On a tie the earliest path wins.
    Returns None when no path carries a trailing number.
    """
    preferred_video = None
    max_num = -1
    for vp in video_paths:
        match = re.search(r'(\d+)\.[^.]+$', vp)
        if match is None:
            continue
        num = int(match.group(1))
        if num > max_num:
            max_num = num
            preferred_video = vp
    return preferred_video


# Both flags start as False and are switched on hospital by hospital.
df_all['is_duplicate'] = False
df_all['preferred_duplicate'] = False

for hospital in df_all['Hospital'].unique():
    hospital_mask = df_all['Hospital'] == hospital
    # Look up the file prefix instead of overwriting the loop variable.
    file_prefix = HOSPITAL_FILE_PREFIX.get(hospital, hospital)

    # A row is a duplicate when its path is listed in the hospital's
    # duplicate-scan table.
    df_loaded = pd.read_csv(f"data_tables/{file_prefix}_data_duplicate_scan_labels.csv")
    df_all.loc[hospital_mask, 'is_duplicate'] = (
        df_all.loc[hospital_mask, 'File Path'].isin(df_loaded['File Path'])
    )

    # Within each duplicate group, the preferred video is the one with the
    # largest trailing number. Collect all winners first, then flag them with
    # a single membership test rather than one full-column comparison per
    # preference row.
    df_preferences = pd.read_csv(f"data_tables/{file_prefix}_duplicate_video_preferences.csv")
    preferred_paths = {
        _pick_preferred_video(_parse_video_list(all_videos))
        for all_videos in df_preferences['all_videos']
    }
    preferred_paths.discard(None)  # groups with no numbered file name
    df_all.loc[df_all['File Path'].isin(preferred_paths), 'preferred_duplicate'] = True




In [11]:
# Flag rows whose Score is missing (NaN). The vectorised isna() yields the
# same boolean column as looping over rows with iterrows(), without the
# per-row Python overhead.
df_all['no_score'] = df_all['Score'].isna()


In [12]:
crop_coords_df = pd.read_csv('mft_video_crop_coordinates.csv')
# NOTE(review): df_loaded is re-read here but not used in this cell -- confirm
# whether anything else relies on it before removing.
df_loaded = pd.read_csv('data_tables/MFT_data.csv')

# An MFT video counts as an M-mode scan when it has no entry in the crop
# coordinates table; every other row (including all non-MFT rows) stays False.
# NOTE(review): assumes a missing crop entry always means M-mode -- confirm.
mft_mask = df_all['Hospital'] == 'MFT'
has_crop_coords = df_all['File Path'].isin(crop_coords_df['video_path'])
df_all['M-mode scan'] = mft_mask & ~has_crop_coords



In [13]:
# Print the assembled table for a visual check (pandas abbreviates long frames).
print(df_all)

     Hospital                                          File Path  Patient ID  \
0         UHW  /cosma7/data/dp004/rrtx34/ultrasound/UHW/Case-...           6   
1         UHW  /cosma7/data/dp004/rrtx34/ultrasound/UHW/Case-...           6   
2         UHW  /cosma7/data/dp004/rrtx34/ultrasound/UHW/Case-...           6   
3         UHW  /cosma7/data/dp004/rrtx34/ultrasound/UHW/Case-...           6   
4         UHW  /cosma7/data/dp004/rrtx34/ultrasound/UHW/Case-...           6   
...       ...                                                ...         ...   
3705     JCUH  /cosma7/data/dp004/rrtx34/ultrasound/JCUH/010/...          10   
3706     JCUH  /cosma7/data/dp004/rrtx34/ultrasound/JCUH/010/...          10   
3707     JCUH  /cosma7/data/dp004/rrtx34/ultrasound/JCUH/010/...          10   
3708     JCUH  /cosma7/data/dp004/rrtx34/ultrasound/JCUH/010/...          10   
3709     JCUH  /cosma7/data/dp004/rrtx34/ultrasound/JCUH/010/...          10   

     Scan No  Total Frames     FPS Scan

In [14]:
# Write the combined table to disk, without the DataFrame index.
df_all.to_csv('data_tables/all_data.csv', index=False)

In [2]:
# Reload the saved combined table. This cell works from the CSV on disk, not
# from the in-memory df_all.
df = pd.read_csv('data_tables/all_data.csv')

# Scores recorded as the strings 'ni' or 'na' also count as "no score".
# A vectorised membership test replaces the per-row iterrows() loop. Rows
# outside the mask keep their existing no_score value.
df.loc[df['Score'].isin(['ni', 'na']), 'no_score'] = True

# Overwrite the same file with the updated flags.
df.to_csv('data_tables/all_data.csv', index=False)

