In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

Here we create a validation set from the training set: an 85% train / 15% validation split, done per verb class so that the label proportions are preserved.

In [2]:
# Annotation files, given relative to the notebook's working directory.
TRAIN_CSV = "EPIC_100_train.csv"       # source of the train/val split built below
VAL_CSV = "EPIC_100_validation.csv"    # defined here; not read by any cell visible in this notebook

### Original df

In [3]:
# Load the training annotations into a DataFrame and preview the first five rows.
df = pd.read_csv(TRAIN_CSV)
df.head(5)

Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
0,P01_01_0,P01,P01_01,00:00:01.089,00:00:00.14,00:00:03.37,8,202,open door,open,3,door,3,['door'],[3]
1,P01_01_1,P01,P01_01,00:00:02.629,00:00:04.37,00:00:06.17,262,370,turn on light,turn-on,6,light,114,['light'],[114]
2,P01_01_10,P01,P01_01,00:00:23.340,00:00:24.97,00:00:26.20,1498,1572,open drawer,open,3,drawer,8,['drawer'],[8]
3,P01_01_100,P01,P01_01,00:07:57.919,00:07:59.75,00:08:00.88,28785,28852,take cup,take,0,cup,13,['cup'],[13]
4,P01_01_101,P01,P01_01,00:08:00.020,00:08:01.47,00:08:02.21,28888,28932,open cupboard,open,3,cupboard,3,['cupboard'],[3]


In [4]:
# (rows, columns) of the raw annotation table.
df.shape

(67217, 15)

In [5]:
# Preview the verb labels cast to Python ints. The result is only displayed;
# it is not written back to `df`.
df['verb_class'].map(int)

0         3
1         6
2         3
3         0
4         3
         ..
67212     8
67213     0
67214     9
67215    30
67216    30
Name: verb_class, Length: 67217, dtype: int64

In [6]:
# Exclude every annotation row that belongs to video P23_04.
# NOTE(review): the reason for excluding this video is not stated in the notebook — confirm.
rows_to_keep = df['video_id'] != 'P23_04'
df = df[rows_to_keep]

## Create splits and new train/val dfs

In [7]:
# Distinct verb labels, in order of first appearance in `df`.
verbs = pd.unique(df['verb_class'])

In [8]:
# Split every verb class on its own so that the train and validation sets
# keep the label proportions of the full data.
TRAIN_FRACTION = 0.85    # share of each class kept for training
VAL_FRACTION = 0.15      # share of each class held out for validation
MIN_CLASS_SAMPLES = 2    # a class needs at least one row for each side of the split
SPLIT_SEED = 1           # fixed seed so the split is reproducible

df_verb_train_list = []
df_verb_val_list = []
for verb in verbs:
    # Select the rows of this class once and reuse them below.
    df_verb = df[df['verb_class'] == verb]
    if len(df_verb) < MIN_CLASS_SAMPLES:
        # Too few rows to split: the class is left out of both sets.
        print(f'verb {verb} skipped. Only {len(df_verb)} values')
        continue
    df_verb_train, df_verb_val = train_test_split(
        df_verb,
        test_size=VAL_FRACTION,
        train_size=TRAIN_FRACTION,
        random_state=SPLIT_SEED,
        shuffle=True,
    )
    df_verb_train_list.append(df_verb_train)
    df_verb_val_list.append(df_verb_val)

verb 93 skipped. Only 1 values


In [9]:
# Stack the per-class pieces back into one training frame and one validation frame.
df_train, df_val = (
    pd.concat(class_frames)
    for class_frames in (df_verb_train_list, df_verb_val_list)
)

In [10]:
# Rows per verb class in the training split, before the labels are made contiguous.
df_train['verb_class'].value_counts()

verb_class
0     12580
1     10381
2      5865
3      4134
4      2954
      ...  
92        3
94        2
95        1
96        1
90        1
Name: count, Length: 96, dtype: int64

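The class labels have to be contiguous, i.e. [0, 1, 2, ..., 95] for our 96 classes. Because verb 93 was skipped, the last labels are currently [..., 92, 94, 95, 96], which raises an error, so the labels 94, 95 and 96 are moved down to 93, 94 and 95.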

In [11]:
# Remap the training verb labels to the contiguous range 0..K-1, where K is
# the number of classes present. Verb 93 was skipped during the split, so
# here this moves 94, 95, 96 down to 93, 94, 95.
# The mapping is derived from the data instead of hard-coding [94, 95, 96]:
# it is applied in one vectorized step and is safe to re-run, because once
# the labels are contiguous the mapping is the identity (the row-by-row
# "subtract one" version merged classes when executed twice).
train_label_map = {
    old_label: new_label
    for new_label, old_label in enumerate(np.sort(df_train['verb_class'].unique()))
}
df_train['verb_class'] = df_train['verb_class'].map(train_label_map)

In [12]:
# Remap the validation verb labels to the contiguous range 0..K-1, where K is
# the number of classes present. Verb 93 was skipped during the split, so
# here this moves 94, 95, 96 down to 93, 94, 95.
# The mapping is derived from the data instead of hard-coding [94, 95, 96]:
# it is applied in one vectorized step and is safe to re-run, because once
# the labels are contiguous the mapping is the identity.
val_label_map = {
    old_label: new_label
    for new_label, old_label in enumerate(np.sort(df_val['verb_class'].unique()))
}
df_val['verb_class'] = df_val['verb_class'].map(val_label_map)

# The mapping is built from the validation labels alone, so it only matches
# the training labels if both splits contain the same classes; fail loudly
# if they do not.
assert set(df_val['verb_class']) == set(df_train['verb_class']), \
    'train and validation splits do not contain the same verb classes'

In [13]:
# Distinct verb labels in the training split after relabeling.
df_train['verb_class'].unique()

array([ 3,  6,  0,  5,  1,  4, 10,  7, 59, 77, 13, 23,  9, 38, 17, 28, 12,
        2, 72, 16, 35, 60,  8, 37, 18, 14, 11, 20, 39, 31, 15, 41, 22, 45,
       24, 42, 19, 34, 51, 27, 53, 52, 48, 47, 87, 64, 71, 49, 32, 21, 33,
       56, 44, 73, 25, 63, 30, 36, 82, 67, 26, 50, 89, 46, 61, 70, 76, 68,
       74, 40, 55, 86, 43, 91, 57, 66, 83, 93, 81, 29, 65, 58, 78, 62, 94,
       54, 85, 88, 69, 75, 79, 84, 80, 92, 95, 90], dtype=int64)

In [14]:
# Distinct verb labels in the validation split after relabeling.
df_val['verb_class'].unique()

array([ 3,  6,  0,  5,  1,  4, 10,  7, 59, 77, 13, 23,  9, 38, 17, 28, 12,
        2, 72, 16, 35, 60,  8, 37, 18, 14, 11, 20, 39, 31, 15, 41, 22, 45,
       24, 42, 19, 34, 51, 27, 53, 52, 48, 47, 87, 64, 71, 49, 32, 21, 33,
       56, 44, 73, 25, 63, 30, 36, 82, 67, 26, 50, 89, 46, 61, 70, 76, 68,
       74, 40, 55, 86, 43, 91, 57, 66, 83, 93, 81, 29, 65, 58, 78, 62, 94,
       54, 85, 88, 69, 75, 79, 84, 80, 92, 95, 90], dtype=int64)

In [15]:
# Shuffle the rows of both splits (they were concatenated class by class).
# A fixed random_state keeps the row order written to the CSV files identical
# across "Restart & Run All"; the seed matches the one used for the split.
df_train = df_train.sample(frac=1, random_state=1)
df_val = df_val.sample(frac=1, random_state=1)

In [16]:
# Report the (rows, columns) of the final training and validation frames.
print(f'df train shape: {df_train.shape}')
print(f'df val shape: {df_val.shape}')

df train shape: (56930, 15)
df val shape: (10098, 15)


In [17]:
# Rows per verb class in the training split after relabeling.
df_train['verb_class'].value_counts()

verb_class
0     12580
1     10381
2      5865
3      4134
4      2954
      ...  
92        3
93        2
94        1
90        1
95        1
Name: count, Length: 96, dtype: int64

In [18]:
# Rows per verb class in the validation split after relabeling.
df_val['verb_class'].value_counts()

verb_class
0     2221
1     1833
2     1035
3      730
4      522
      ... 
90       1
93       1
94       1
91       1
95       1
Name: count, Length: 96, dtype: int64

In [19]:
# Preview the first rows of the shuffled validation split.
df_val.head()

Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
48316,P24_07_33,P24,P24_07,00:01:47.180,00:01:47.43,00:01:48.14,6445,6488,put tomato,put,1,tomato,43,['tomato'],[43]
36989,P22_06_52,P22,P22_06,00:02:10.289,00:02:07.88,00:02:08.87,7672,7732,open coffee maker,open,3,maker:coffee,50,['maker:coffee'],[50]
63593,P31_08_30,P31,P31_08,00:04:43.059,00:04:41.97,00:04:47.40,16918,17244,take tuna can,take,0,tuna,160,['tuna'],[160]
29056,P08_23_133,P08,P08_23,00:13:13.500,00:13:07.70,00:13:16.55,47262,47793,cleaning kitchen surface,clean,2,surface:kitchen,42,['surface:kitchen'],[42]
20945,P04_113_499,P04,P04_113,00:19:01.853,00:19:00.31,00:19:02.16,57015,57108,close oven,close,4,oven,46,['oven'],[46]


In [20]:
# Write both splits to CSV in the working directory.
# The DataFrame index is written too (pandas default), as an extra leading column.
for split_frame, csv_path in ((df_train, 'train.csv'), (df_val, 'val.csv')):
    split_frame.to_csv(csv_path)

## Class weights exploration

In [None]:
# All training labels as a NumPy array, plus the sorted distinct labels.
all_labels = np.array(df_train['verb_class'])
unique_labels = np.sort(df_train['verb_class'].unique())

In [None]:
# Show the sorted distinct training labels.
print(unique_labels)

In [None]:
# Show the array holding the label of every training row.
all_labels

In [None]:
# One weight per class in `unique_labels`, computed by scikit-learn in its
# 'balanced' mode (weights inversely proportional to class frequency).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=unique_labels,
    y=all_labels,
)

In [None]:
# Show the computed per-class weights.
class_weights

In [None]:
# Convert the per-class weights to a float32 torch tensor.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

In [None]:
# Check the dtype of the weight tensor.
class_weights_tensor.dtype

In [None]:
# Class-weighted cross-entropy loss. `weight` must be a torch.Tensor, so pass
# the float32 tensor built from the class weights rather than the raw NumPy
# array `class_weights`, which nn.CrossEntropyLoss rejects.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)