In [42]:
import numpy as np
import os
import pandas as pd

In [44]:
# Load the prompt metadata as two columns. With header=None the file's first
# line is read as an ordinary data row (it ends up at row 0), and
# on_bad_lines='skip' drops lines the parser cannot handle instead of aborting.
og_metadata = pd.read_csv(
    'metadata_image_generation_picai.csv',
    delimiter=',',
    quotechar='"',
    on_bad_lines='skip',
    engine='python',
    header=None,
    names=["file_name", "text"]
)

# Identify rows where 'file_name' still contains a comma (name and prompt were
# parsed into a single field) and split them at the first comma only, so any
# further commas stay inside the prompt text.
problematic_rows = og_metadata['file_name'].str.contains(r'.+,.+', na=False)
split_rows = og_metadata.loc[problematic_rows, 'file_name'].str.split(',', n=1, expand=True)

# Only write back when something matched: splitting an empty selection with
# expand=True yields a frame without columns 0/1, and indexing it would raise
# KeyError on a file that needs no repair.
if problematic_rows.any():
    og_metadata.loc[problematic_rows, 'file_name'] = split_rows[0]
    og_metadata.loc[problematic_rows, 'text'] = split_rows[1]

In [46]:
# Discard the row labelled 0 and renumber the remaining rows from 0.
# NOTE(review): not idempotent - every re-run of this cell removes another row.
og_metadata = (
    og_metadata
    .drop(index=0)
    .reset_index(drop=True)
)

In [49]:
# Lift the column-width limit so long prompt strings are shown in full
# instead of being truncated in frame displays.
pd.set_option('display.max_colwidth', None)
# Bare expression: renders the frame as this cell's output.
og_metadata

Unnamed: 0,file_name,text
0,0.png,"Prostate MRI t2w, with medium tumor"
1,1.png,"Prostate MRI t2w, with medium tumor"
2,2.png,"Prostate MRI t2w, with medium tumor"
3,3.png,"Prostate MRI t2w, healthy"
4,4.png,"Prostate MRI t2w, with large tumor"
...,...,...
2095,2095.png,"Prostate MRI t2w, with medium tumor"
2096,2096.png,"Prostate MRI t2w, healthy"
2097,2097.png,"Prostate MRI t2w, healthy"
2098,2098.png,"Prostate MRI t2w, with medium tumor"


In [36]:
# Literal substitution of the modality tag in every prompt. regex=False makes
# the literal match explicit rather than relying on the pandas default.
# NOTE(review): the frame displays in this notebook show 't2w' prompts, not
# 't1ce' - confirm this cell is still needed for the current data.
og_metadata['text'] = og_metadata['text'].str.replace(
    pat='t1ce',
    repl='flair',
    regex=False,
)

In [37]:
# Insert the comma between the modality tag and the "healthy" label.
# The match is literal (regex=False), so only the exact phrase is rewritten.
og_metadata['text'] = og_metadata['text'].str.replace(
    pat='flair healthy',
    repl='flair, healthy',
    regex=False,
)

In [48]:
# Remove double-quote characters from both ends of every prompt
# (quotes in the middle of a prompt are left untouched).
og_metadata['text'] = (
    og_metadata['text']
    .str.strip(to_strip='"')
)

In [50]:
# Build a shuffled subset with at most ROWS_PER_CLASS prompts per class.
ROWS_PER_CLASS = 500  # cap on the rows kept for each of the two classes
SHUFFLE_SEED = 42     # fixed seed so the shuffled order is the same on every run

# Select rows by keyword. na=False treats a missing prompt as "no match";
# without it a NaN in 'text' puts NaN into the mask and the indexing fails.
healthy_rows = og_metadata[og_metadata['text'].str.contains('healthy', na=False)]
tumor_rows = og_metadata[og_metadata['text'].str.contains('tumor', na=False)]

# Keep only the first ROWS_PER_CLASS rows of each
healthy_rows = healthy_rows.head(ROWS_PER_CLASS)
tumor_rows = tumor_rows.head(ROWS_PER_CLASS)

# Combine the rows and shuffle them. The shuffle is seeded because the file
# names assigned below depend on the resulting order; unseeded, every run
# would write a different name-to-prompt mapping.
filtered_metadata = (
    pd.concat([healthy_rows, tumor_rows])
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Rename the file_name field to consecutive names 0.png, 1.png, ...
# NOTE(review): this discards the original file names, so rows no longer point
# at the files they were loaded with - confirm that is intended.
filtered_metadata['file_name'] = [f"{i}.png" for i in range(len(filtered_metadata))]

filtered_metadata

Unnamed: 0,file_name,text
0,0.png,"Prostate MRI t2w, with medium tumor"
1,1.png,"Prostate MRI t2w, with small tumor"
2,2.png,"Prostate MRI t2w, healthy"
3,3.png,"Prostate MRI t2w, with medium tumor"
4,4.png,"Prostate MRI t2w, healthy"
...,...,...
995,995.png,"Prostate MRI t2w, with medium tumor"
996,996.png,"Prostate MRI t2w, healthy"
997,997.png,"Prostate MRI t2w, with medium tumor"
998,998.png,"Prostate MRI t2w, with medium tumor"


In [51]:
# Write the filtered prompts to disk; index=False leaves the row index out of
# the file so it contains only the frame's own columns.
filtered_metadata.to_csv(
    path_or_buf='metadata_generation_picai.csv',
    index=False,
)

In [42]:
# Rewrite every prompt for the prostate dataset and collect the results.
# Rows are gathered in a list and the frame is built once at the end: growing
# a DataFrame with pd.concat inside the loop copies it on every iteration and
# concatenating onto an empty frame is deprecated in recent pandas.
records = []

# The first row is skipped, exactly as the original loop did.
# NOTE(review): presumably row 0 is the CSV header read as data - confirm.
source_rows = og_metadata[['file_name', 'Prompt']].iloc[1:]

for file_name, prompt in source_rows.itertuples(index=False, name=None):
    if 'tumor' in prompt:
        # Keep the text up to and including the first "tumor"; this removes the
        # location information (in prostate it's basically always the same,
        # in the middle).
        new_prompt = prompt.split('tumor')[0] + 'tumor'
    elif 'healthy' in prompt:
        if ', healthy' in prompt:
            # Already punctuated: inserting another comma would give ",, healthy".
            new_prompt = prompt
        else:
            new_prompt = prompt.replace(' healthy', ', healthy')  # minor fix
    else:
        new_prompt = prompt

    # Change brain with prostate, and t1ce with t2w
    new_prompt = new_prompt.replace('Brain', 'Prostate')
    new_prompt = new_prompt.replace('t1ce', 't2w')

    # Remove quotation marks
    new_prompt = new_prompt.replace('"', '')

    records.append({'file_name': file_name, 'Prompt': new_prompt})

# columns= keeps the column set and order of og_metadata, as the original
# empty-frame template did.
new_metadata = pd.DataFrame(records, columns=og_metadata.columns)

In [43]:
# Bare expression: renders the rewritten prompt table as this cell's output.
new_metadata

Unnamed: 0,file_name,Prompt
0,0.png,"Prostate MRI t2w, with medium tumor"
1,1.png,"Prostate MRI t2w, with medium tumor"
2,2.png,"Prostate MRI t2w, with medium tumor"
3,3.png,"Prostate MRI t2w, healthy"
4,4.png,"Prostate MRI t2w, with large tumor"
...,...,...
2095,2095.png,"Prostate MRI t2w, with medium tumor"
2096,2096.png,"Prostate MRI t2w, healthy"
2097,2097.png,"Prostate MRI t2w, healthy"
2098,2098.png,"Prostate MRI t2w, with medium tumor"


In [44]:
# Write the rewritten prompts to disk without the row index.
# NOTE(review): this is the same path the loading cell reads, so running this
# cell replaces that input file - confirm overwriting is intended.
new_metadata.to_csv(
    path_or_buf='metadata_image_generation_picai.csv',
    index=False,
)