In [2]:
import glob
import json
import os

import numpy as np
import pandas as pd

In [38]:
# File names of the images present in each local image folder.
# Paths are written with '/' and split with os.path so the cell gives the same
# result on Windows and POSIX (a hard-coded '\\' pattern matches nothing on POSIX).

def list_image_ids(images_dir):
    '''
    List the files of an image folder as a one-column dataframe.

    Input: path of the folder that contains the images
    Output: a dataframe with a single column 'ID2' holding the file names (folder part removed)
    '''
    image_paths = glob.glob(os.path.join(images_dir, '*'))
    return pd.DataFrame({'ID2': [os.path.basename(p) for p in image_paths]})


# Dataframe containing the ID of the images in train01
# NOTE(review): the images are listed from 'CrowdHuman_train01 (1)', while train_path
# (used for the exported 'ID' column) points at 'CrowdHuman_train01' - confirm which
# folder name is the intended one.
images_in_train01 = list_image_ids('Datasets/CrowdHuman/CrowdHuman_train01 (1)/Images')

# Dataframe containing the ID of the images in the subset of train01 (1000 images)
images_in_train_subset = list_image_ids('Datasets/CrowdHuman/CrowdHuman_train_subset_1000/Images')

# Dataframe containing the ID of the images in the subset of val (600 images)
images_in_val_subset = list_image_ids('Datasets/CrowdHuman/CrowdHuman_val_subset_600/Images')

In [4]:
# Read the odgt annotation files into lists of raw text lines
# (each line is parsed later as one JSON record by odgt_to_dataframe)
with open('Datasets/CrowdHuman/annotation_train.odgt') as f:
    lines_train = list(f)
with open('Datasets/CrowdHuman/annotation_val.odgt') as f:
    lines_val = list(f)

In [20]:
def odgt_to_dataframe(line, images_path):
    '''
    This function extracts from one line of the odgt file the ID of the image, the tag of
    each box (e.g. 'person' or 'mask'), and the full bounding box related to each of the tags.

    Input: a line from the odgt file (a JSON record with the keys 'ID' and 'gtboxes'),
           and the folder prefix to put in front of the image file name
    Output: a dataframe with one row per box and the columns
            x1, y1, x2, y2, tag, ID (images_path + file name), ID2 (file name only)
    '''
    columns = ['x1', 'y1', 'x2', 'y2', 'tag', 'ID', 'ID2']

    # Parse the JSON record once and reuse it
    record = json.loads(line)
    image_file = record['ID'] + '.jpg'
    gtboxes = record['gtboxes']

    # An image without boxes yields an empty frame instead of a KeyError on 'fbox'
    if not gtboxes:
        return pd.DataFrame(columns=columns)

    # Extract tag and full box of every annotation
    boxes = pd.json_normalize(gtboxes)

    # convert from x,y,w,h to x1,y1,x2,y2
    out = pd.DataFrame(boxes['fbox'].to_list(), columns = ['x1', 'y1', 'w', 'h'])
    out['x2'] = out['x1'] + out['w']
    out['y2'] = out['y1'] + out['h']
    out = out.drop(columns=['w', 'h'])

    # Putting coordinates, tags and ID together
    out['tag'] = boxes['tag']
    out['ID'] = images_path + image_file
    out['ID2'] = image_file

    return out


In [39]:
# Apply odgt_to_dataframe to each line and save all results in a dataset

def build_annotation_table(lines, images_path, keep_ids=None):
    '''
    Build the table of boxes of one dataset split.

    Every line is converted with odgt_to_dataframe, the tag 'mask' is renamed to 'ignore',
    and only the boxes of images that have at least one 'ignore' box are kept.

    Input: the lines of an odgt file, the folder prefix written in the 'ID' column, and
           optionally the file names ('ID2' values) the table must be restricted to
           (None keeps every annotated image)
    Output: a dataframe with the columns ID, x1, y1, x2, y2, tag
    '''
    boxes = pd.concat(
        [odgt_to_dataframe(line = l, images_path = images_path) for l in lines],
        ignore_index=True,
    )
    # keep only the information from the requested images
    if keep_ids is not None:
        boxes = boxes[boxes['ID2'].isin(keep_ids)]
    boxes = boxes.replace({'tag': 'mask'}, 'ignore')
    # keep only information from images that have the ignore category
    ids_with_ignore = boxes.loc[boxes['tag'] == 'ignore', 'ID2'].unique()
    boxes = boxes[boxes['ID2'].isin(ids_with_ignore)]
    # ID goes first; the helper column ID2 is not exported
    return boxes[['ID', 'x1', 'y1', 'x2', 'y2', 'tag']].reset_index(drop=True)


# Training set (only images in train01)

train_path = 'Datasets/CrowdHuman/CrowdHuman_train01/Images/'
train = build_annotation_table(lines_train, train_path, keep_ids=images_in_train01['ID2'])
print(train.shape)

# Validation set (every annotated image)

val_path = 'Datasets/CrowdHuman/CrowdHuman_val/Images/'
val = build_annotation_table(lines_val, val_path)
print(val.shape)

# Training set subset of 1000 images

train_subset_path = 'Datasets/CrowdHuman/CrowdHuman_train_subset_1000/Images/'
train_subset = build_annotation_table(lines_train, train_subset_path, keep_ids=images_in_train_subset['ID2'])
print(train_subset.shape)

# Validation set subset of 600 images (only images in the val subset)

val_subset_path = 'Datasets/CrowdHuman/CrowdHuman_val_subset_600/Images/'
val_subset = build_annotation_table(lines_val, val_subset_path, keep_ids=images_in_val_subset['ID2'])
print(val_subset.shape)

# Sample of 5 images from validation
# (the previous slice [0:4] kept only 4 images; np.unique returns the IDs sorted,
# so the sample is the first N_SAMPLE_IMAGES images in sorted ID order)

N_SAMPLE_IMAGES = 5
sample_ids = np.unique(val_subset['ID'])[:N_SAMPLE_IMAGES]
val_small_sample = val_subset[val_subset['ID'].isin(sample_ids)].reset_index(drop=True)

(109188, 6)
(116606, 6)
(22166, 6)
(12957, 6)


In [34]:
# Dataframe with the classes: the distinct values of the 'tag' column of train
# (column 'a', in the sorted order returned by np.unique) and their position (column 'b')
class_names = np.unique(train['tag'])
class_index = range(len(class_names))
classes = pd.DataFrame({'a': class_names, 'b': class_index})

In [40]:
# save in txt: one space-separated row per record, without header line or index column
tables_by_path = {
    'Datasets/CrowdHuman/train.txt': train,
    'Datasets/CrowdHuman/val.txt': val,
    'Datasets/CrowdHuman/train_subset.txt': train_subset,
    'Datasets/CrowdHuman/val_subset.txt': val_subset,
    'Datasets/CrowdHuman/val_small_sample.txt': val_small_sample,
    'Datasets/CrowdHuman/classes.txt': classes,
}
for txt_path, table in tables_by_path.items():
    table.to_csv(txt_path, header=False, index=False, sep=' ')