In [6]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split

Here we want to create a validation set from the train set. We want an 85% train / 15% validation split that preserves the proportion of each label.

In [2]:
# Annotation file that is loaded below and divided into the new train/validation subsets.
TRAIN_CSV = "EPIC_100_train.csv"
# Validation annotation file name; it is not read by any cell visible in this notebook.
VAL_CSV = "EPIC_100_validation.csv"

### Original df

In [3]:
# Read the full training annotation table and preview its first five rows
# (five is the default for head()).
df = pd.read_csv(TRAIN_CSV)
df.head()

Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
0,P01_01_0,P01,P01_01,00:00:01.089,00:00:00.14,00:00:03.37,8,202,open door,open,3,door,3,['door'],[3]
1,P01_01_1,P01,P01_01,00:00:02.629,00:00:04.37,00:00:06.17,262,370,turn on light,turn-on,6,light,114,['light'],[114]
2,P01_01_10,P01,P01_01,00:00:23.340,00:00:24.97,00:00:26.20,1498,1572,open drawer,open,3,drawer,8,['drawer'],[8]
3,P01_01_100,P01,P01_01,00:07:57.919,00:07:59.75,00:08:00.88,28785,28852,take cup,take,0,cup,13,['cup'],[13]
4,P01_01_101,P01,P01_01,00:08:00.020,00:08:01.47,00:08:02.21,28888,28932,open cupboard,open,3,cupboard,3,['cupboard'],[3]


In [4]:
# (number of rows, number of columns) of the loaded annotation table.
df.shape

(67217, 15)

In [18]:
# Show verb_class converted to 64-bit integers. The cast is vectorised instead of
# calling int() on every element; the result is only displayed, df is not modified.
df['verb_class'].astype('int64')

0         3
1         6
2         3
3         0
4         3
         ..
67212     8
67213     0
67214     9
67215    30
67216    30
Name: verb_class, Length: 67217, dtype: int64

In [22]:
# Number of annotated segments per verb class, most frequent class first.
verb_class_counts = df['verb_class'].value_counts()
verb_class_counts

verb_class
0     14848
1     12225
2      6927
3      4870
4      3483
      ...  
94        3
95        2
96        2
90        2
93        1
Name: count, Length: 97, dtype: int64

In [26]:
# Distinct verb_class values present in the training annotations; the split loop
# further down iterates over these.
verbs = df['verb_class'].unique()

In [31]:
# Sanity check: how many rows carry verb class 1.
is_verb_1 = df['verb_class'] == 1
df[is_verb_1].shape[0]

12225

## Create splits and new train/val dfs

In [86]:
# NOTE(review): standalone arithmetic; `a` is not used by any other cell visible
# here, so this looks like leftover scratch work — confirm before removing.
a = 1920 * 1080
a

2073600

In [33]:
# Split every verb class separately into 85% train / 15% validation so that each
# class keeps roughly its original share in both subsets.
df_verb_train_list = []
df_verb_val_list = []
for verb in verbs:
    # Select this class's rows once and reuse the result for the size check,
    # the split and the log message (instead of re-filtering the whole frame).
    df_verb = df[df['verb_class'] == verb]
    if len(df_verb) < 2:
        # A class with fewer than two rows cannot be divided between the two
        # subsets; it is left out of both and reported.
        print(f'verb {verb} skipped. Only {len(df_verb)} values')
        continue
    # Fixed random_state keeps the per-class split reproducible across runs.
    df_verb_train, df_verb_val = train_test_split(
        df_verb, test_size=0.15, train_size=0.85, random_state=1, shuffle=True
    )
    df_verb_train_list.append(df_verb_train)
    df_verb_val_list.append(df_verb_val)

verb 93 skipped. Only 1 values


In [37]:
# Stack the per-class pieces row-wise into one training and one validation table.
df_train = pd.concat(df_verb_train_list, axis=0)
df_val = pd.concat(df_verb_val_list, axis=0)

In [41]:
# Shuffle the rows so the classes are no longer grouped together after the concat.
# The seed matches the random_state used for the per-class split, which makes the
# row order (and therefore the saved CSVs) reproducible across runs.
df_train = df_train.sample(frac=1, random_state=1)
df_val = df_val.sample(frac=1, random_state=1)

In [45]:
# Report the (rows, columns) shape of both splits.
print(f'df train shape: {df_train.shape}')
print(f'df val shape: {df_val.shape}')

df train shape: (57090, 15)
df val shape: (10126, 15)


In [42]:
# Per-class sample counts in the training split.
train_verb_counts = df_train['verb_class'].value_counts()
train_verb_counts

verb_class
0     12620
1     10391
2      5887
3      4139
4      2960
      ...  
92        3
94        2
96        1
95        1
90        1
Name: count, Length: 96, dtype: int64

In [43]:
# Per-class sample counts in the validation split.
val_verb_counts = df_val['verb_class'].value_counts()
val_verb_counts

verb_class
0     2228
1     1834
2     1040
3      731
4      523
      ... 
96       1
92       1
94       1
95       1
91       1
Name: count, Length: 96, dtype: int64

In [44]:
# Preview the first five rows of the shuffled validation split.
df_val.head(n=5)

Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
50400,P25_107_382,P25,P25_107,00:27:22.859,00:27:22.49,00:27:23.41,82124,82170,take fork,take,0,fork,14,['fork'],[14]
1266,P01_09_324,P01,P01_09,00:24:08.309,00:24:07.70,00:24:09.22,86862,86953,open oil,open,3,oil,31,['oil'],[31]
59210,P30_04_68,P30,P30_04,00:03:25.019,00:03:24.04,00:03:25.39,12242,12323,take saucepan lid,take,0,lid:saucepan,6,['lid:saucepan'],[6]
22640,P04_121_342,P04,P04_121,00:15:29.100,00:15:28.52,00:15:29.21,46426,46460,pick up bowl,pick-up,0,bowl,7,['bowl'],[7]
27740,P08_02_20,P08,P08_02,00:00:56.330,00:00:56.11,00:00:57.91,3366,3474,throw away wrap,throw,13,wrap,107,['wrap'],[107]


In [46]:
# Persist both splits. index=False keeps the written columns the same as the
# loaded annotation table instead of adding the shuffled row index as an extra
# unnamed leading column.
df_train.to_csv('train.csv', index=False)
df_val.to_csv('val.csv', index=False)