# Create label csv file

The original label file is a space-separated text file, which is awkward to work with, so we convert it to a pandas-friendly CSV.

In [1]:
import pandas as pd

## Utility

In [16]:
def convert_line_to_dict(line: str) -> dict:
    """Parse one line of the label file into a record.

    A line is expected to hold at least three whitespace-separated fields,
    in this order: image file name, integer class label, and dataset split
    (e.g. ``train``). Any fields after the third are ignored.

    Args:
        line: A single line read from the label file.

    Returns:
        A dict with the keys ``'image'`` (str), ``'label'`` (int) and
        ``'type'`` (str).

    Raises:
        IndexError: If the line contains fewer than three fields.
        ValueError: If the second field cannot be converted to ``int``.
    """
    # split() without an argument splits on any run of whitespace and
    # ignores leading/trailing whitespace, so tabs, doubled spaces or a
    # trailing newline neither create empty fields nor leak into 'type'.
    parts = line.split()
    return {
        'image': parts[0],
        'label': int(parts[1]),
        'type': parts[2]
    }

def convert_to_pd(ldicts: list) -> pd.DataFrame:
    """Assemble parsed label records into a DataFrame.

    Args:
        ldicts: List of dicts, each providing the keys ``'image'``,
            ``'label'`` and ``'type'``.

    Returns:
        A DataFrame with the columns ``image``, ``label`` and ``type`` (in
        that order), holding one row per input record in input order.

    Raises:
        KeyError: If a record lacks one of the three keys.
    """
    column_names = ('image', 'label', 'type')

    # Build one value list per column; dict insertion order fixes the
    # column order of the resulting frame.
    columns = {
        name: [record[name] for record in ldicts]
        for name in column_names
    }
    return pd.DataFrame(columns)

## Setup

In [8]:
# Base directory that the dataset paths below are resolved against.
root = '.'

# Space-separated label file that is parsed in the following cells.
label_file = f'{root}/labels-map-proj_v3_2_train_val_test.txt'
# Image directory path; not referenced by the cells shown in this notebook.
image_dir = f'{root}/map-proj-v3_2'

## Load labels

In [10]:
# Read the label file with an explicit encoding so decoding does not depend
# on the platform's locale default.
with open(label_file, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f]

# Drop blank lines (e.g. a trailing empty line): they carry no record and
# would make convert_line_to_dict fail with an IndexError.
lines = [line for line in lines if line]

## To Pandas

In [11]:
# Parse every stripped line into a dict with 'image', 'label' and 'type' keys.
labels = [convert_line_to_dict(line) for line in lines]

In [17]:
# Collect the parsed records into a three-column DataFrame.
df = convert_to_pd(labels)

In [18]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,image,label,type
0,ESP_013049_0950_RED-0067.jpg,7,train
1,ESP_013049_0950_RED-0067-fv.jpg,7,train
2,ESP_013049_0950_RED-0067-brt.jpg,7,train
3,ESP_013049_0950_RED-0067-r90.jpg,7,train
4,ESP_013049_0950_RED-0067-r180.jpg,7,train


In [20]:
# Show row count, column dtypes and non-null counts for the full label set.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67810 entries, 0 to 67809
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   67810 non-null  object
 1   label   67810 non-null  int64 
 2   type    67810 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


In [19]:
# Write the full label table; index=False leaves the row index out of the
# file so it contains only the image, label and type columns.
df.to_csv('labels.csv', index=False)

## Filter Other Label

In [23]:
# Keep only rows whose label is non-zero (label 0 is presumably the "other"
# class named in the section heading -- confirm against the dataset docs).
proj_df = df[df['label'] != 0]

In [24]:
# Preview the first rows of the filtered table.
proj_df.head()

Unnamed: 0,image,label,type
0,ESP_013049_0950_RED-0067.jpg,7,train
1,ESP_013049_0950_RED-0067-fv.jpg,7,train
2,ESP_013049_0950_RED-0067-brt.jpg,7,train
3,ESP_013049_0950_RED-0067-r90.jpg,7,train
4,ESP_013049_0950_RED-0067-r180.jpg,7,train


In [25]:
# Show row count and dtypes after filtering; the index keeps the original
# row positions because the filter does not reset it.
proj_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15088 entries, 0 to 67774
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   15088 non-null  object
 1   label   15088 non-null  int64 
 2   type    15088 non-null  object
dtypes: int64(1), object(2)
memory usage: 471.5+ KB


In [26]:
# Write the filtered label table; index=False leaves the (non-contiguous)
# row index out of the file.
proj_df.to_csv('proj_labels.csv', index=False)