In [1]:
import numpy as np
import os
import pandas as pd

In [17]:
# Load the brain-MRI image-generation metadata (file name + text prompt).
# header=None combined with names=[...] means the file's own header line is
# read as an ordinary data row, so it ends up as row 0 of the frame.
# NOTE: on_bad_lines='skip' drops malformed lines without any warning.
og_metadata = pd.read_csv(
    'metadata_image_generation.csv',
    delimiter=',',
    quotechar='"',
    on_bad_lines='skip',
    engine='python',
    header=None,
    names=["file_name", "Prompt"]
)

# Some rows are parsed as a single field of the form "<file>,<prompt>"
# (presumably because the whole line was quoted in the CSV - verify against
# the raw file). Split those at the first comma only, so commas inside the
# prompt text are preserved.
problematic_rows = og_metadata['file_name'].str.contains(r'.+,.+', na=False)

# Guard the repair: with no matching rows the expanded split has no columns,
# and selecting column 0 / 1 from it would fail.
if problematic_rows.any():
    split_rows = og_metadata.loc[problematic_rows, 'file_name'].str.split(',', n=1, expand=True)
    og_metadata.loc[problematic_rows, 'file_name'] = split_rows[0]
    og_metadata.loc[problematic_rows, 'Prompt'] = split_rows[1]

In [29]:
# Lift the column-width cap so the prompt strings are rendered in full
# instead of being truncated with an ellipsis.
pd.options.display.max_colwidth = None
og_metadata

Unnamed: 0,file_name,Prompt
0,file_name,Prompt
1,0.png,"""Brain MRI t1ce, with medium tumor on the top right"""
2,1.png,"""Brain MRI t1ce, with medium tumor on the top right"""
3,2.png,"""Brain MRI t1ce, with medium tumor on the top right"""
4,3.png,Brain MRI t1ce healthy
...,...,...
2096,2095.png,"""Brain MRI t1ce, with medium tumor on the bottom left"""
2097,2096.png,Brain MRI t1ce healthy
2098,2097.png,Brain MRI t1ce healthy
2099,2098.png,"""Brain MRI t1ce, with medium tumor on the top right"""


In [42]:
def _to_picai_prompt(prompt):
    """Rewrite one brain-MRI prompt as its prostate (PI-CAI) counterpart.

    Parameters
    ----------
    prompt : str or missing value
        Original prompt text, optionally wrapped in double quotes.

    Returns
    -------
    str or missing value
        The prompt with the tumor location removed, 'Brain' replaced by
        'Prostate', 't1ce' replaced by 't2w' and all double quotes stripped.
        Non-string inputs (e.g. NaN for a missing prompt) are returned
        unchanged.
    """
    # A missing prompt is a float NaN; the substring checks below would
    # raise TypeError on it, so pass it through untouched.
    if not isinstance(prompt, str):
        return prompt

    if 'tumor' in prompt:
        # Keep the text up to and including the first 'tumor': this removes
        # the location information (in prostate it's basically always the
        # same, in the middle).
        new_prompt = prompt.split('tumor')[0] + 'tumor'
    elif 'healthy' in prompt:
        # Minor fix: separate 'healthy' with a comma like the tumor prompts.
        new_prompt = prompt.replace(' healthy', ', healthy')
    else:
        new_prompt = prompt

    # Change brain with prostate, and t1ce with t2w
    new_prompt = new_prompt.replace('Brain', 'Prostate').replace('t1ce', 't2w')

    # Remove quotation marks
    return new_prompt.replace('"', '')


# Row 0 of og_metadata is skipped: it holds the CSV header text, not data.
# Convert every prompt in one pass instead of growing the frame row by row
# with pd.concat, which copies the whole frame on each iteration.
data_rows = og_metadata.iloc[1:].reset_index(drop=True)
new_metadata = data_rows.assign(Prompt=data_rows['Prompt'].map(_to_picai_prompt))

In [43]:
# Preview the converted prostate prompts to check them before export.
new_metadata

Unnamed: 0,file_name,Prompt
0,0.png,"Prostate MRI t2w, with medium tumor"
1,1.png,"Prostate MRI t2w, with medium tumor"
2,2.png,"Prostate MRI t2w, with medium tumor"
3,3.png,"Prostate MRI t2w, healthy"
4,4.png,"Prostate MRI t2w, with large tumor"
...,...,...
2095,2095.png,"Prostate MRI t2w, with medium tumor"
2096,2096.png,"Prostate MRI t2w, healthy"
2097,2097.png,"Prostate MRI t2w, healthy"
2098,2098.png,"Prostate MRI t2w, with medium tumor"


In [44]:
# Write the converted metadata to disk; index=False keeps the row index out
# of the file so it contains only the file_name and Prompt columns.
new_metadata.to_csv(
    path_or_buf='metadata_image_generation_picai.csv',
    index=False,
)