In [None]:
## Global flags

# Set to True to remove clips with unusually long or short durations. This is expected to remove 20 clips.
REMOVE_OUTLIERS = True

# How to select the frames that represent each video. Set to True to select random frames. Set to False to select frames at evenly spaced intervals.
# Recommended: False
SELECT_FRAMES_RANDOMLY = False

# Set to True to delete audio proxies after preprocessing to free up disk space.
# Recommended: True
DELETE_PROXIES = True

In [None]:
import cv2
import random
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split
from tqdm import tqdm
import subprocess
import os
from scipy.io.wavfile import read as wavread
from IPython.display import display

In [None]:
# Valence and arousal class annotations, one file per part of the dataset
df_accede_scores = pd.read_csv("annotations/ACCEDEaffect.txt", sep="\t")
df_mediaeval_scores = pd.read_csv("annotations/MEDIAEVALaffect.txt", sep="\t")

# Data split assignments, one file per part of the dataset
df_accede_sets = pd.read_csv("annotations/ACCEDEsets.txt", sep="\t")
df_mediaeval_sets = pd.read_csv("annotations/MEDIAEVALsets.txt", sep="\t")

# Stack both parts (ACCEDE rows first); 'source' records which file each score row came from
score_parts = [
    df_accede_scores.assign(source='ACCEDE'),
    df_mediaeval_scores.assign(source='MEDIAEVAL'),
]
df_scores = pd.concat(score_parts, ignore_index=True)
df_sets = pd.concat([df_accede_sets, df_mediaeval_sets], ignore_index=True)

# Join scores and sets row by row on the positional index; columns present in both
# frames get the '_scores' / '_sets' suffix.
# NOTE(review): this assumes both annotation files list the clips in the same order - confirm.
df_joined = df_scores.merge(df_sets, left_index=True, right_index=True, suffixes=('_scores', '_sets'))

In [None]:
# Inspect the joined annotation table and the dtype pandas inferred for each column
display(df_joined)
print(df_joined.dtypes)

# Preprocess data

## Outlier removal

### Removes clips with unusually long or short durations. This is expected to remove 20 outliers from the extended LIRIS-ACCEDE dataset.

Cell execution may take several minutes.

In [None]:
if REMOVE_OUTLIERS:
    def get_clip_duration(file_id):
        """Return the duration of a clip in seconds (frame count divided by frame rate).

        Args:
            file_id: Clip name without extension; the video is read from 'data/<file_id>.mp4'.

        Returns:
            The clip duration in seconds.

        Raises:
            IOError: If the video cannot be opened or reports a non-positive frame rate.
        """
        video_load_path = 'data/'+file_id+'.mp4'
        video = cv2.VideoCapture(video_load_path)
        try:
            if not video.isOpened():
                raise IOError(f'Could not open video: {video_load_path}')
            frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
            fps = video.get(cv2.CAP_PROP_FPS)
        finally:
            # Release the capture explicitly; otherwise one handle stays open per clip
            video.release()

        if fps <= 0:
            # Fail with a readable message instead of a bare ZeroDivisionError
            raise IOError(f'Video reports an invalid frame rate ({fps}): {video_load_path}')
        return frame_count / fps

    # Add clip durations to dataframe
    df_joined['duration'] = df_joined['name_scores'].apply(get_clip_duration)

    # Calculate clip duration IQR.
    # Series.quantile is used because the `interpolation` keyword of np.percentile
    # is deprecated since NumPy 1.22; 'midpoint' yields the same quartiles.
    clip_durations_q1 = df_joined['duration'].quantile(0.25, interpolation='midpoint')
    clip_durations_q3 = df_joined['duration'].quantile(0.75, interpolation='midpoint')
    clip_durations_iqr = clip_durations_q3 - clip_durations_q1

    # Calculate inner bounds (1.5 * IQR below Q1 / above Q3)
    clip_durations_lower_bound = clip_durations_q1 - 1.5 * clip_durations_iqr
    clip_durations_upper_bound = clip_durations_q3 + 1.5 * clip_durations_iqr

    print(f'Lower bound: {clip_durations_lower_bound:.3f} seconds')
    print(f'Upper bound: {clip_durations_upper_bound:.3f} seconds')

    # Remove outliers; both bounds are inclusive
    num_rows_before = len(df_joined)

    duration_in_bounds = df_joined['duration'].between(clip_durations_lower_bound, clip_durations_upper_bound)
    df_joined = df_joined[duration_in_bounds].reset_index(drop=True)

    num_rows_after = len(df_joined)

    print(f'Removed {num_rows_before - num_rows_after} outliers')

## Video preprocessing

In [None]:
# Target frame size (width, height) in pixels
down_width = 384
down_height = 224
down_points = (down_width, down_height)

num_frames = 16 # Number of selected frames

num_rows = len(df_joined.index)

# Make sure the output directory exists so torch.save does not fail on a fresh checkout
os.makedirs('preprocessed/videos', exist_ok=True)

# Dedicated seeded generator so random frame selection is reproducible between runs
# and does not touch the global `random` state
frame_rng = random.Random(42)

for index, row in tqdm(df_joined.iterrows(), total=num_rows):
    file_id = row['name_scores']
    video_load_path = 'data/'+file_id+'.mp4'
    cam = cv2.VideoCapture(video_load_path)

    try:
        total_frames = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))

        if SELECT_FRAMES_RANDOMLY:
            frame_idxs = sorted(frame_rng.sample(range(total_frames), num_frames))
        else:
            # Evenly spaced frame indices from the first to the last frame (both included)
            frame_idxs = np.linspace(0, (total_frames-1), num=num_frames).round().astype('int').tolist()

        frames = []

        for frame_idx in frame_idxs:
            cam.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cam.read()

            if not ret:
                # Unreadable frames are skipped, so the saved clip may hold fewer than num_frames frames
                print('Warning: frame not properly read')
                continue

            resized_frame = cv2.resize(frame, down_points, interpolation=cv2.INTER_LINEAR)
            frames.append(resized_frame)
    finally:
        # Release the capture for every clip; otherwise one handle leaks per video
        cam.release()

    video = np.array(frames)
    tensor_video = torch.from_numpy(video)

    torch.save(tensor_video, 'preprocessed/videos/' + file_id + '.pt')

print('Videos preprocessed and saved to disk')

## Label preprocessing

In [None]:
# Legal per-dimension class values, ordered so that a value's position is its rank:
# valence: -1 negative, 0 neutral, 1 positive; arousal: -1 calm, 0 neutral, 1 active
legal_class_values = (-1, 0, 1)

num_rows = len(df_joined.index)

for index, row in tqdm(df_joined.iterrows(), total=num_rows):
    file_id = row['name_scores']

    valenceClass = row['valenceClass']
    arousalClass = row['arousalClass']

    # Valence is validated first, then arousal
    if valenceClass not in legal_class_values:
        raise ValueError('Illegal valence label value')
    if arousalClass not in legal_class_values:
        raise ValueError('Illegal arousal label value')

    # Create 9 classes from 3 valence and 3 arousal classes:
    # 0 negative calm, 1 negative neutral, 2 negative active,
    # 3 neutral calm,  4 both neutral,     5 neutral active,
    # 6 positive calm, 7 positive neutral, 8 positive active
    valence_rank = legal_class_values.index(valenceClass)
    arousal_rank = legal_class_values.index(arousalClass)
    vaClass = 3 * valence_rank + arousal_rank

    torch.save(vaClass, 'preprocessed/labels/' + file_id + '.pt')

print('Labels preprocessed and saved to disk')

## Audio preprocessing 

### Create proxy audio files (to speed up processing) and get duration of longest audio (for padding)

In [None]:
num_rows = len(df_joined.index)

# Make sure the proxy directory exists so ffmpeg can write into it
os.makedirs('audio_proxies', exist_ok=True)

# Length (in samples) of the longest audio clip seen so far. Only the running maximum
# is kept, so the waveforms do not all have to be held in memory at once.
max_duration = 0

# Create audio proxies and get longest audio
for index, row in tqdm(df_joined.iterrows(), total=num_rows):
    file_id = row['name_scores']
    video_load_path = 'data/'+file_id+'.mp4'
    audio_proxy_path = 'audio_proxies/'+file_id+'.wav'

    # Create proxy .wav files to speed up audio loading
    if not os.path.exists(audio_proxy_path):
        # Argument list instead of a shell string: paths are passed verbatim, no quoting issues.
        # -y overwrites without prompting; mono (-ac 1), 48 kHz (-ar 48000), no video (-vn).
        command = ['ffmpeg', '-y', '-i', video_load_path, '-ab', '160k', '-ac', '1', '-ar', '48000', '-vn', audio_proxy_path]
        result = subprocess.run(command, stdin=subprocess.DEVNULL, capture_output=True, text=True)

        if result.returncode != 0:
            # Drop a possibly half-written proxy so a re-run does not pick it up
            if os.path.exists(audio_proxy_path):
                os.remove(audio_proxy_path)
            raise RuntimeError(f'ffmpeg failed for {video_load_path}:\n{result.stderr}')

    sr, audio = wavread(audio_proxy_path)

    # Track the duration of the longest audio clip
    max_duration = max(max_duration, len(audio))

print(f'Maximum duration: {max_duration} data points, or {(max_duration/48_000):.3f} seconds')

### Pad audio and save to disk

In [None]:
# Recomputed here so this cell does not depend on a counter left over from another cell
num_rows = len(df_joined.index)

# Make sure the output directory exists so torch.save does not fail on a fresh checkout
os.makedirs('preprocessed/audios', exist_ok=True)

for index, row in tqdm(df_joined.iterrows(), total=num_rows):
    file_id = row['name_scores']
    audio_proxy_path = 'audio_proxies/'+file_id+'.wav'
    sr, audio = wavread(audio_proxy_path)

    if sr != 48_000:
        # f-string: `sr` is an integer, so concatenating it to a str would raise TypeError
        print(f'Warning: Audio {file_id} has sample rate of {sr} but requires 48000')

    # Pad audio so that all audios have same duration (necessary for transformation to tensors).
    # The slice copies every sample including the last one; the tail stays zero.
    audio_padded = np.zeros((max_duration))
    audio_padded[:len(audio)] = audio

    torch.save(audio_padded, 'preprocessed/audios/' + file_id + '.pt')

print('Audio data saved to disk')

### Delete proxy audio files

Remove proxy files to free up disk space

In [None]:
# Delete the proxy .wav file of every clip (only when DELETE_PROXIES is set)
if DELETE_PROXIES:
    for file_id in tqdm(df_joined['name_scores'], total=num_rows):
        os.remove('audio_proxies/'+file_id+'.wav')
    print('Deleted audio proxies')

## Data split

### Extended LIRIS-ACCEDE data split

Data split as described in ACCEDEsets.txt and MEDIAEVALsets.txt, resulting in a ratio of 22.5% train, 22.5% val, and 55% test. Does **not** correspond to any dataset described in the thesis.

**Note: This data split DOES include the additional instances added in the MediaEval 2015 Affective Impact of Movies Task.**

Use the following parameter to use this data split when calling main.py: ```--dataset holdout```

In [None]:
# The 'set' column encodes the split of each clip: 1 = train, 0 = test, 2 = val.
# It is read into `split_codes` rather than a variable called `set`, which would
# rebind the builtin `set` for the rest of the notebook session.
split_codes = df_joined['set']

train_ids = df_joined.loc[split_codes == 1, 'name_scores'].tolist()
test_ids = df_joined.loc[split_codes == 0, 'name_scores'].tolist()
val_ids = df_joined.loc[split_codes == 2, 'name_scores'].tolist()

# Clips with any other code end up in no split; report them instead of dropping them silently
num_unassigned = len(df_joined) - len(train_ids) - len(test_ids) - len(val_ids)
if num_unassigned:
    print(f'Warning: {num_unassigned} clips have an unrecognized set value and were left out of the split')

# Make sure the output directory exists so torch.save does not fail on a fresh checkout
os.makedirs('preprocessed/data_splits', exist_ok=True)

torch.save(train_ids, 'preprocessed/data_splits/train_ids_holdout.pt')
torch.save(test_ids, 'preprocessed/data_splits/test_ids_holdout.pt')
torch.save(val_ids, 'preprocessed/data_splits/val_ids_holdout.pt')

### Original LIRIS-ACCEDE data split

Data split as described in ACCEDEsets.txt only, resulting in a ratio of 25% train, 25% val, and 50% test. Corresponds to the **LIRIS-ACCEDE with AIMT15 labels** dataset described in the thesis.

**Note: This data split does NOT include the additional instances added in the MediaEval 2015 Affective Impact of Movies Task.**

Use the following parameter to use this data split when calling main.py: ```--dataset holdout_accede```

In [None]:
# ACCEDE only
df_accede_joined = df_joined.loc[df_joined['source'] == 'ACCEDE']

# The 'set' column encodes the split of each clip: 1 = train, 0 = test, 2 = val.
# It is read into `split_codes` rather than a variable called `set`, which would
# rebind the builtin `set` for the rest of the notebook session.
split_codes = df_accede_joined['set']

train_ids = df_accede_joined.loc[split_codes == 1, 'name_scores'].tolist()
test_ids = df_accede_joined.loc[split_codes == 0, 'name_scores'].tolist()
val_ids = df_accede_joined.loc[split_codes == 2, 'name_scores'].tolist()

# Clips with any other code end up in no split; report them instead of dropping them silently
num_unassigned = len(df_accede_joined) - len(train_ids) - len(test_ids) - len(val_ids)
if num_unassigned:
    print(f'Warning: {num_unassigned} clips have an unrecognized set value and were left out of the split')

# Make sure the output directory exists so torch.save does not fail on a fresh checkout
os.makedirs('preprocessed/data_splits', exist_ok=True)

torch.save(train_ids, 'preprocessed/data_splits/train_ids_holdout_accede.pt')
torch.save(test_ids, 'preprocessed/data_splits/test_ids_holdout_accede.pt')
torch.save(val_ids, 'preprocessed/data_splits/val_ids_holdout_accede.pt')

### AIMT15 data split

Data split as described in shots-devset-nl.txt and shots-testset-nl.txt, resulting in a devset (56%) and a testset (44%). Corresponds to the **AIMT15** dataset described in the thesis.

The training and validation split for the devset was not specified in MediaEval 2015. We chose a stratified 80/20 split. In total, the resulting ratio is 45% train, 11% val, and 44% test.

Use the following parameter to use this data split when calling main.py: ```--dataset mediaeval```

In [None]:
# Shot lists of the devset and testset: tab-separated, no header row
df_devset_nl = pd.read_csv("annotations/shots-devset-nl.txt", delimiter='\t', header=None, names=['mediaeval_id', 'name_sets'])
df_testset_nl = pd.read_csv("annotations/shots-testset-nl.txt", delimiter='\t', header=None, names=['mediaeval_id', 'name_sets'])

# NOTE: this rebinds df_mediaeval_sets, a name the annotation-loading cell also assigns
df_mediaeval_sets = pd.concat([df_devset_nl.assign(mediaeval_set='dev'), df_testset_nl.assign(mediaeval_set='test')], ignore_index=True)

# Left join: clips that appear in neither shot list keep a missing 'mediaeval_set' and end up in no split
df_joined_mediaeval_sets = pd.merge(df_joined, df_mediaeval_sets, how='left', on='name_sets')

# Select dev and test rows directly by their set marker. The ids and the stratification
# labels below are then taken from the very same rows, so they cannot get out of step.
# (A variable called `set` is avoided on purpose: it would rebind the builtin.)
df_devset_only = df_joined_mediaeval_sets[df_joined_mediaeval_sets['mediaeval_set'] == 'dev']
df_testset_only = df_joined_mediaeval_sets[df_joined_mediaeval_sets['mediaeval_set'] == 'test']

dev_ids = df_devset_only['name_scores'].tolist()
test_ids = df_testset_only['name_scores'].tolist()

print(len(dev_ids))
print(len(test_ids))

# Split devset into train and val sets (80/20, stratified on the valence and arousal classes)
train_ids, val_ids = train_test_split(dev_ids, train_size=0.8, test_size=0.2, random_state=42, stratify=df_devset_only[['valenceClass', 'arousalClass']])

# Make sure the output directory exists so torch.save does not fail on a fresh checkout
os.makedirs('preprocessed/data_splits', exist_ok=True)

torch.save(train_ids, 'preprocessed/data_splits/train_ids_mediaeval.pt')
torch.save(test_ids, 'preprocessed/data_splits/test_ids_mediaeval.pt')
torch.save(val_ids, 'preprocessed/data_splits/val_ids_mediaeval.pt')

### Random data split

Completely random data split with a ratio of 64% train, 16% val, and 20% test.

Use the following parameter to use this data split when calling main.py: ```--dataset random```

In [None]:
file_ids = df_joined['name_scores'].tolist()

# Fixed seed so the random split is reproducible
generator = torch.Generator().manual_seed(42)

# Fractions are given in the order train, test, val
train_subset, test_subset, val_subset = random_split(file_ids, [0.64, 0.2, 0.16], generator=generator)

# random_split returns Subset objects that wrap the full id list. Convert them to plain
# lists of ids so the saved files have the same format as the other data splits.
train_ids = [file_ids[i] for i in train_subset.indices]
test_ids = [file_ids[i] for i in test_subset.indices]
val_ids = [file_ids[i] for i in val_subset.indices]

# Make sure the output directory exists so torch.save does not fail on a fresh checkout
os.makedirs('preprocessed/data_splits', exist_ok=True)

torch.save(train_ids, 'preprocessed/data_splits/train_ids_random.pt')
torch.save(test_ids, 'preprocessed/data_splits/test_ids_random.pt')
torch.save(val_ids, 'preprocessed/data_splits/val_ids_random.pt')

### Stratified random data split

Random data split stratified across valence and arousal class labels, with a ratio of 64% train, 16% val, and 20% test.

Use the following parameter to use this data split when calling main.py: ```--dataset stratified```

In [None]:
# Columns whose joint value defines the strata
label_columns = ['valenceClass', 'arousalClass']

# Split the dataframe itself rather than a separate list of ids. train_test_split shuffles
# its output, so labels looked up afterwards in original row order would no longer line up
# with the shuffled ids; splitting rows keeps every id together with its own labels.
df_remaining, df_test = train_test_split(df_joined, train_size=0.8, test_size=0.2, random_state=42, stratify=df_joined[label_columns])

# Second split: 80/20 of the remaining rows, stratified on the labels of exactly those rows
df_train, df_val = train_test_split(df_remaining, train_size=0.8, test_size=0.2, random_state=42, stratify=df_remaining[label_columns])

train_ids = df_train['name_scores'].tolist()
test_ids = df_test['name_scores'].tolist()
val_ids = df_val['name_scores'].tolist()

# Make sure the output directory exists so torch.save does not fail on a fresh checkout
os.makedirs('preprocessed/data_splits', exist_ok=True)

torch.save(train_ids, 'preprocessed/data_splits/train_ids_stratified.pt')
torch.save(test_ids, 'preprocessed/data_splits/test_ids_stratified.pt')
torch.save(val_ids, 'preprocessed/data_splits/val_ids_stratified.pt')