In [24]:
from pathlib import Path
import pandas as pd

# General setup

In [25]:
storage_path = Path('/storage')
data_path = storage_path/'Columbia Gaze Data Set'

def parse_filepath(filepath):
    filename = filepath.stem
    
    _,_,pos_str,v_str,h_str = filename.split('_')
    pos = int(pos_str[:-1])
    v = int(v_str[:-1])
    h = int(h_str[:-1])

    return pos, v, h

- v: -10, 0, 10
- h: -15, -10, -5, 0, 5, 10, 15

In [26]:
folders = [element for element in data_path.glob('*') if element.is_dir()]
len(folders)

56

In [27]:
# use n_valid folders for validation set
n_valid = 10

In [28]:
train_folders = folders[:-n_valid]
validation_folders = folders[-n_valid:]
print(f'Training on {len(train_folders)} folders.')
print(f'Validation on {len(validation_folders)} folders.')

Training on 46 folders.
Validation on 10 folders.


## Labels from parsed filenames

In [29]:
def label_from_position(vertical, horizontal, join_char=' '):
    label = []
    
    if vertical==-10: label.append('Bottom')
    if vertical==10: label.append('Top')
    if vertical==0: label.append('VCenter')
    
    if horizontal in [-15, -10]: label.append('Left')
    if horizontal in [15, 10]: label.append('Right')
    if horizontal in [-5, 0, 5]: label.append('HCenter')
        
    return join_char.join(label)

In [30]:
parsed = parse_filepath(list(train_folders[0].glob('*.jpg'))[0])
parsed, label_from_position(*parsed[1:])

((0, 10, 10), 'Top Right')

In [40]:
def create_label_df(train_folders, validation_folders, lbl_func):
    content = []
    for train_folder in train_folders:
        for file in train_folder.glob('*.jpg'):
            _, v, h = parse_filepath(file)
            label = lbl_func(v, h)
            if label:
                content.append([file, label, False])
            
    for validation_folder in validation_folders:
        for file in validation_folder.glob('*.jpg'):
            _, v, h = parse_filepath(file)
            label = lbl_func(v, h)
            if label:
                content.append([file, label, True])
            
    df = pd.DataFrame(content, columns=['path', 'target', 'is_valid'])
    return df

In [9]:
df = create_label_df(train_folders, validation_folders, label_from_position)
df.head()

Unnamed: 0,path,target,is_valid
0,/storage/Columbia Gaze Data Set/0016/0016_2m_0...,Top Right,False
1,/storage/Columbia Gaze Data Set/0016/0016_2m_1...,Top Left,False
2,/storage/Columbia Gaze Data Set/0016/0016_2m_-...,Bottom HCenter,False
3,/storage/Columbia Gaze Data Set/0016/0016_2m_-...,Top Left,False
4,/storage/Columbia Gaze Data Set/0016/0016_2m_1...,VCenter Right,False


In [10]:
df.is_valid.value_counts()

False    4830
True     1050
Name: is_valid, dtype: int64

In [11]:
df.to_csv(storage_path/'directions_with_center.csv', index=False)

# Easier dataset: left/right/center

In [32]:
def lr_from_position(vertical, horizontal, join_char=' '):
    label = []
    
    if horizontal in [-15, -10]: label.append('Left')
    if horizontal in [15, 10]: label.append('Right')
    if horizontal in [-5, 0, 5]: label.append('HCenter')
        
    return join_char.join(label)

In [13]:
df = create_label_df(train_folders, validation_folders, lr_from_position)
df.head()

Unnamed: 0,path,target,is_valid
0,/storage/Columbia Gaze Data Set/0016/0016_2m_0...,Right,False
1,/storage/Columbia Gaze Data Set/0016/0016_2m_1...,Left,False
2,/storage/Columbia Gaze Data Set/0016/0016_2m_-...,HCenter,False
3,/storage/Columbia Gaze Data Set/0016/0016_2m_-...,Left,False
4,/storage/Columbia Gaze Data Set/0016/0016_2m_1...,Right,False


In [14]:
df.to_csv(storage_path/'lr_with_center.csv', index=False)

# Even more easy

In [41]:
def lr_from_position_hard_center(vertical, horizontal, join_char=' '):
    label = []
    
    if horizontal in [-15, -10]: label.append('Left')
    if horizontal in [15, 10]: label.append('Right')
    if horizontal == 0: label.append('HCenter')
        
    return join_char.join(label)

In [42]:
df = create_label_df(train_folders, validation_folders, lr_from_position_hard_center)
df.head()

Unnamed: 0,path,target,is_valid
0,/storage/Columbia Gaze Data Set/0016/0016_2m_0...,Right,False
1,/storage/Columbia Gaze Data Set/0016/0016_2m_1...,Left,False
2,/storage/Columbia Gaze Data Set/0016/0016_2m_-...,Left,False
3,/storage/Columbia Gaze Data Set/0016/0016_2m_1...,Right,False
4,/storage/Columbia Gaze Data Set/0016/0016_2m_-...,Right,False


In [43]:
df.target.value_counts()

Right      1680
Left       1680
HCenter     840
Name: target, dtype: int64

In [44]:
df.to_csv(storage_path/'lr_hard_center.csv', index=False)

# Smaller dataset

In [17]:
rows,_ = df.shape
rows

5880

In [20]:
p = .2
n = int(rows*p)

In [21]:
shortened = df.sample(n)
shortened.shape

(1176, 3)

In [22]:
shortened.is_valid.value_counts()

False    965
True     211
Name: is_valid, dtype: int64

In [23]:
shortened.to_csv(storage_path/'lr_short_20.csv', index=False)