In [1]:
import pandas as pd
import re

In [2]:
# Load the ground-truth table from CSV into a dataframe.
df = pd.read_csv(filepath_or_buffer='synthetic_gt.csv')

In [3]:
# Peek at the raw filename column that gets parsed in the next step.
df['filename']

0       id13_v1_r5_b6.jpg
1       id13_v2_r5_b6.jpg
2       id13_v1_r6_b5.jpg
3       id13_v2_r6_b5.jpg
4       id13_v1_r5_b6.jpg
              ...        
3995    id16_v2_r5_b6.jpg
3996    id16_v1_r6_b5.jpg
3997    id16_v2_r6_b5.jpg
3998    id16_v1_r6_b5.jpg
3999    id16_v2_r6_b5.jpg
Name: filename, Length: 4000, dtype: object

In [4]:
# Break the values in the filename column down into their components and add
# scene_id, red_count and blue_count to the dataframe as integer columns.
# Filenames look like id<scene>_v<version>_r<red>_b<blue>.<ext>; for instance
# id13_v1_r5_b6.jpg yields scene_id=13, red_count=5, blue_count=6.
# The version group is matched by the pattern but not stored as a column.
named_pattern = re.compile(r'^id(?P<scene_id>\d+)_v(?P<version>\d+)_r(?P<red_count>\d+)_b(?P<blue_count>\d+)\..+$')

# Vectorised extraction: one column per named group, aligned on df's own index
# (so no reset_index / positional lookup is needed). Rows whose filename does
# not match the pattern come back as NaN.
filename_parts = df['filename'].str.extract(named_pattern.pattern)

# Fail loudly and name the offending files, instead of an opaque
# AttributeError from calling .groupdict() on a failed match.
unparsed = filename_parts.isna().any(axis=1)
if unparsed.any():
    bad_names = df.loc[unparsed, 'filename'].unique().tolist()
    raise ValueError(f"filenames not matching the expected pattern: {bad_names}")

for column in ('scene_id', 'red_count', 'blue_count'):
    df[column] = filename_parts[column].astype('int64')

# Add a column called expected_side, which is 'red', 'blue' or 'neither'
# depending on which colour has the larger count ('neither' on a tie).
df['expected_side'] = 'neither'
df.loc[df['red_count'] > df['blue_count'], 'expected_side'] = 'red'
df.loc[df['red_count'] < df['blue_count'], 'expected_side'] = 'blue'

In [5]:
# Preview the first five rows, including the newly added columns.
df.head(n=5)

Unnamed: 0,subject_id,trial_no,filename,gender,age,experience,favorite_team,favorite_color,sim_psucc,impaired_vision,response_time,response_side,scene_id,red_count,blue_count,expected_side
0,0,0,id13_v1_r5_b6.jpg,f,33,more,blue,neither,0.534579,False,1222.045347,red,13,5,6,blue
1,0,41,id13_v2_r5_b6.jpg,f,33,more,blue,neither,0.668224,False,993.388525,blue,13,5,6,blue
2,1,28,id13_v1_r6_b5.jpg,f,27,more,neither,neither,0.627681,False,777.752739,red,13,6,5,red
3,1,61,id13_v2_r6_b5.jpg,f,27,more,neither,neither,0.627681,False,914.927435,blue,13,6,5,red
4,2,33,id13_v1_r5_b6.jpg,f,35,more,blue,blue,0.743885,False,1321.952857,blue,13,5,6,blue


In [6]:
# Write the enriched table to disk without the row index.
# `index` is documented as a bool; False states the intent explicitly rather
# than relying on None being falsy.
df.to_csv('synthetic_enriched_gt.csv', index=False)