In [2]:
import os

import numpy as np
import pandas as pd
import sklearn


### Load the concepts file to get respective labels

In [5]:
# The concepts table is tab-separated; its 'name' column holds the concept labels
concepts = pd.read_csv(
    'concepts_2011.txt',
    sep='\t',
)


In [34]:
## Get the row positions of the four season concepts
names = concepts['name'].to_numpy()
# s_inds (short for "seasonal indexes") maps a season name to its position in
# the concepts table; if a name occurs more than once the last position wins
s_inds = {
    season: position
    for position, season in enumerate(names)
    if season in ('Summer', 'Autumn', 'Spring', 'Winter')
}

### Function for splitting

In [186]:
def train_val_test(arr, train_per, val_per):
    """Split a sequence into consecutive train / validation / test slices.

    The order of ``arr`` is preserved: the first ``train_per`` percent goes to
    the training slice, the next ``val_per`` percent to the validation slice
    and whatever remains to the test slice. No shuffling is performed.

    Parameters
    ----------
    arr : sequence
        Anything supporting ``len()`` and slicing (list, tuple, ndarray, ...).
    train_per : int or float
        Training percentage, on a 0-100 scale.
    val_per : int or float
        Validation percentage, on a 0-100 scale.

    Returns
    -------
    tuple
        ``(train_s, val_s, test_s)`` slices of ``arr``.

    Raises
    ------
    ValueError
        If a percentage is negative or the two percentages exceed 100.
    """
    if train_per < 0 or val_per < 0 or train_per + val_per > 100:
        raise ValueError(
            "train_per and val_per must be non-negative and sum to at most 100, "
            f"got train_per={train_per!r}, val_per={val_per!r}"
        )

    arr_len = len(arr)

    # Multiply before dividing: `train_per/100 * arr_len` can land just below
    # the exact value (29/100 * 100 == 28.999999999999996) and floor() then
    # drops an element. With integer percentages this form is exact.
    # first index to cut at
    first = int(train_per * arr_len // 100)
    # second index to cut at
    second = int((train_per + val_per) * arr_len // 100)

    ## Split up the arr
    train_s = arr[:first]
    val_s = arr[first:second]
    test_s = arr[second:]

    return train_s, val_s, test_s

### Create table with headers ranging from index 0 to 98

In [51]:
# Column labels: the image file name first, then one integer label per row of
# the concepts table
img_name = 'image'
indexes = range(len(concepts))
column_names = [img_name, *indexes]

In [52]:
# Load the trainset annotations file: space-separated, no header row, so the
# column labels are supplied explicitly
trainset = pd.read_csv(
    'trainset_gt_annotations.txt',
    sep=' ',
    header=None,
    names=column_names,
)

In [53]:
# Preview the loaded annotations (one row per image, one 0/1 column per concept)
trainset

Unnamed: 0,image,0,1,2,3,4,5,6,7,8,...,89,90,91,92,93,94,95,96,97,98
0,0039b5a7-c1ad-423a-92a0-3f38558043a2.jpg,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,003eaf28-898d-404f-abe5-e8e86d422fa2.jpg,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
2,005f4848-780a-4d31-8c09-4abdfd46804c.jpg,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,1,1
3,00a53424-5a0d-4c9d-ac90-242a6099fc35.jpg,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,00b5c2a5-2b8f-492b-ad5d-ed3310e6da56.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,{FFA81171-399C-432E-8373-753B7FC5F85B}.jpg,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7996,{FFB02EA2-96FB-4C60-ADEF-C57C0CC2266E}.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7997,{FFB61D93-7A87-487B-8478-58B78E878823}.jpg,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7998,{FFC6F015-ABF9-4AD9-ACC9-0941FD14A251}.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Keep only the Spring, Summer, Autumn and Winter images

In [54]:
# Keep only the images flagged with at least one of the four seasons, then
# renumber the rows from 0
season_imgs = trainset[
    trainset[[s_inds['Spring'], s_inds['Summer'], s_inds['Autumn'], s_inds['Winter']]]
    .eq(1)
    .any(axis=1)
].reset_index(drop=True)

In [56]:
## Narrow the table to the image name plus the four season flags,
## in the order Spring, Summer, Autumn, Winter
season_imgs = season_imgs[
    [img_name] + [s_inds[season] for season in ('Spring', 'Summer', 'Autumn', 'Winter')]
]


In [60]:
# Convert to a NumPy array so each row can be indexed by position:
# [image, Spring, Summer, Autumn, Winter], following the column selection above
s_arr = season_imgs.to_numpy()

In [185]:
spring, summer, autumn, winter = [], [], [], []

# (label, target list) pairs in the same order as columns 1..4 of s_arr
season_buckets = (
    ('spring', spring),
    ('summer', summer),
    ('autumn', autumn),
    ('winter', winter),
)

for row in s_arr:
    # The first season flag that is set wins, so an image carrying several
    # season flags is placed in exactly one bucket
    for col, (label, bucket) in enumerate(season_buckets, start=1):
        if row[col] == 1:
            bucket.append([row[0], label])
            break

# Now to build our training, validation and test sets class-wise with 60, 15 and 25 percent respectively

In [187]:
# Cut every season with the same percentages: 60 train, 15 validation,
# and the remainder as test
spring_split, summer_split, autumn_split, winter_split = (
    train_val_test(season_samples, 60, 15)
    for season_samples in (spring, summer, autumn, winter)
)

In [188]:
# Merge the per-season pieces into temporary lists. zip() groups the four
# train parts, then the four val parts, then the four test parts, keeping the
# season order spring, summer, autumn, winter inside each merged list.
train_temp, val_temp, test_temp = (
    [sample for part in parts for sample in part]
    for parts in zip(spring_split, summer_split, autumn_split, winter_split)
)

In [189]:
### Load the image npy files into each respective set
FEATS_DIR = 'imageclef2011_feats'
SETS_DIR = 'train_val_test_sets'
N_FEATS = 1024  # length of one per-image feature vector


def _build_and_save(split_name, samples):
    """Load the features of one split, save them, and return ``(x, y)``.

    Parameters
    ----------
    split_name : str
        Suffix used in the output file names (``x_<split_name>.npy`` and
        ``y_<split_name>.npy`` inside ``SETS_DIR``).
    samples : list
        ``[image_name, label]`` pairs.

    Returns
    -------
    tuple
        ``x`` is a float64 array of shape ``(len(samples), N_FEATS)`` and
        ``y`` is the array of labels in the same order.

    Raises
    ------
    ValueError
        If the loaded feature vectors do not have length ``N_FEATS``.
    """
    feats = [np.load(f'{FEATS_DIR}/{name}_ft.npy') for name, _ in samples]
    if feats:
        # Stack once instead of calling np.append per image, which recopies
        # the whole array on every iteration. float64 is kept because the
        # previous np.append onto a float64 np.empty seed produced float64.
        x = np.stack(feats).astype(np.float64, copy=False)
        if x.shape[1:] != (N_FEATS,):
            raise ValueError(
                f'expected feature vectors of length {N_FEATS}, got shape {x.shape[1:]}'
            )
    else:
        x = np.empty((0, N_FEATS))
    y = np.array([label for _, label in samples])

    np.save(f'{SETS_DIR}/x_{split_name}.npy', x)
    np.save(f'{SETS_DIR}/y_{split_name}.npy', y)
    return x, y


# np.save does not create missing directories
os.makedirs(SETS_DIR, exist_ok=True)

# Train Set
x_train, y_train = _build_and_save('train', train_temp)

# Validation Set
x_val, y_val = _build_and_save('val', val_temp)

# Test Set
x_test, y_test = _build_and_save('test', test_temp)
