In [1]:
# Import library
import numpy as np
import os
import sys
from tqdm import tqdm
import time
import wave
import copy
import math
#import ffmpeg
import cv2

from sklearn.preprocessing import label_binarize

In [2]:
# import function from utils.py
from utils import *

In [3]:
# Show how this OpenCV build was configured (video I/O backends, codecs, ...).
# The call returns one long newline-delimited string; printing it gives a
# readable report, whereas echoing the bare expression would display the
# string's repr with every line break escaped as "\n".
print(cv2.getBuildInformation())



In [4]:
# Probe which of a few common codecs OpenCV can actually open a writer for.
import tempfile

# List of common codecs to check
codecs = ['DIVX', 'XVID', 'MJPG', 'X264', 'MP4V', 'H264']

print("Available codecs for OpenCV on this system:")
# cv2.VideoWriter_fourcc only packs four characters into an integer tag, so it
# succeeds for any four-character code and says nothing about codec support.
# Opening a real writer on a throwaway file and checking isOpened() is the
# actual availability test.
with tempfile.TemporaryDirectory() as probe_dir:
    for codec in codecs:
        probe_path = os.path.join(probe_dir, f"probe_{codec}.avi")
        writer = None
        try:
            fourcc = cv2.VideoWriter_fourcc(*codec)
            writer = cv2.VideoWriter(probe_path, fourcc, 30.0, (64, 64))
            available = writer.isOpened()
        except Exception:
            # Best-effort probe: any failure while creating the writer is
            # reported as "not available" instead of aborting the loop.
            available = False
        finally:
            # Release before the temporary directory is removed so the probe
            # file is no longer held open.
            if writer is not None:
                writer.release()

        if available:
            print(f"Codec {codec} is available.")
        else:
            print(f"Codec {codec} is not available.")


Available codecs for OpenCV on this system:
Codec DIVX is available.
Codec XVID is available.
Codec MJPG is available.
Codec X264 is available.
Codec MP4V is available.
Codec H264 is available.


In [5]:
# Root folder of the dataset, relative to the notebook's working directory.
data_path = "../IEMOCAP/"
# Session sub-folder names: Session1 ... Session5.
sessions = [f"Session{number}" for number in range(1, 6)]
# NOTE(review): presumably the audio sample rate in Hz — not used by any cell
# visible here (split_avi uses its own local `framerate`); confirm.
framerate = 16000

In [10]:
def _prepare_batch(batch_frames, crop_x, crop_y, crop_w, crop_h, red_gain=None):
    """Crop a stack of RGB frames and return it as a contiguous BGR array.

    Args:
        batch_frames: array indexed as (frame, row, column, channel).
            Assumes uint8 RGB data as produced by the video reader — TODO confirm.
        crop_x, crop_y: top-left corner of the crop window, in pixels.
        crop_w, crop_h: width and height of the crop window, in pixels.
        red_gain: optional multiplier applied to the red channel; ``None``
            leaves the colours untouched.

    Returns:
        A new C-contiguous array holding the cropped frames in BGR order.
    """
    cropped = batch_frames[:, crop_y:crop_y + crop_h, crop_x:crop_x + crop_w, :]
    # Picking channels 2, 1, 0 turns R,G,B into B,G,R for the whole batch at
    # once (what cv2.COLOR_RGB2BGR does frame by frame). The copy makes the
    # result contiguous so cv2 consumers can use it directly.
    bgr = np.ascontiguousarray(cropped[..., 2::-1])
    if red_gain is not None:
        # After the swap red lives in channel 2. Scale in floating point,
        # clamp to the 8-bit range and truncate back to integers.
        bgr[..., 2] = np.clip(bgr[..., 2] * red_gain, 0, 255).astype(np.uint8)
    return bgr


def split_avi(avi, emotions, params=Constants(), batch_size=32):
    """Cut a dialog video into one cropped BGR frame stack per utterance.

    Args:
        avi: ``((framerate, frame_count, width, height), vr)`` where ``vr`` is
            a reader exposing ``get_batch(indices).asnumpy()``.
        emotions: sequence of dicts with at least ``'start'`` and ``'end'``
            (multiplied by ``framerate`` to obtain frame indices) and ``'id'``.
        params: unused here; kept so existing callers keep working.
        batch_size: number of frames requested from the reader per call.

    Returns:
        A list of dicts, one per utterance that maps to at least one frame:
        ``'frames'`` is the stacked cropped frames, ``'id'`` the utterance id
        and ``'index'`` the position of the utterance in ``emotions``.
        Utterances whose clamped frame range is empty are skipped, so list
        positions do NOT necessarily line up with ``emotions``; use
        ``'index'`` or ``'id'`` to pair a segment with its annotation.
    """
    (framerate, frame_count, _width, _height), vr = avi

    # Fixed crop window; only its horizontal offset depends on the speaker.
    crop_y, crop_w, crop_h = 120, 360, 240

    frames_segments = []
    for index, emotion in enumerate(emotions):
        utterance_id = emotion['id']
        # Compares the character at position 5 with the fourth-from-last one
        # (both are 'F' in an id such as 'Ses02F_impro01_F010'): a mismatch
        # selects the right half of the picture, a match the left half.
        direction = "right" if utterance_id[5] != utterance_id[-4] else "left"
        crop_x = 360 if direction == "right" else 0
        # Left-hand crops get their red channel attenuated.
        red_gain = 0.77 if direction == "left" else None

        # Convert the time span to frame indices and clamp to the video.
        first_frame = max(0, min(int(emotion['start'] * framerate), frame_count - 1))
        last_frame = max(0, min(int(emotion['end'] * framerate), frame_count))
        if first_frame >= last_frame:
            continue

        # Read in batches so only `batch_size` full-size frames are decoded
        # at a time.
        batches = []
        for batch_start in range(first_frame, last_frame, batch_size):
            batch_end = min(batch_start + batch_size, last_frame)
            raw = vr.get_batch(list(range(batch_start, batch_end))).asnumpy()
            batches.append(
                _prepare_batch(raw, crop_x, crop_y, crop_w, crop_h, red_gain)
            )

        if batches:
            frames_segments.append({
                'frames': np.concatenate(batches, axis=0),
                'id': utterance_id,
                'index': index,
            })

    return frames_segments

In [11]:
# Process first session: remember its data folders and collect the base names
# (without extension) of every dialog .wav file.
data = []
ids = {}
avi_sample = None
files = []
selected_sessions = sessions[:1]
# The context manager closes the bar even if a folder is missing, and the
# total follows the slice above instead of being hard-coded.
with tqdm(total=len(selected_sessions), unit='file', ncols=100) as pbar:
    for session in selected_sessions:
        pbar.set_description(f'Processing {session}')
        # These names stay bound after the loop (to the last processed
        # session) and are read by later cells.
        path_to_avi = data_path + session + '/dialog/avi/DivX/'
        path_to_wav = data_path + session + '/dialog/wav/'
        path_to_emotions = data_path + session + '/dialog/EmoEvaluation/'
        path_to_transcriptions = data_path + session + '/dialog/transcriptions/'

        # os.listdir returns entries in arbitrary order; sorting makes
        # `files` (and therefore `files[:1]` in later cells) reproducible.
        files_ref = sorted(os.listdir(path_to_wav))
        files.extend(name[:-4] for name in files_ref if name.endswith('.wav'))
        # if (f== 'Ses05M_script01_1b'):
        #     mocap_f = 'Ses05M_script01_1'

        pbar.update(1)

Processing Session1: 100%|██████████████████████████████████████████| 1/1 [00:29<00:00, 29.91s/file]


In [None]:
# Try to split the first file (does nothing when no file was collected)
if files:
    f = files[0]
    # wav = get_audio(path_to_wav, f + '.wav')
    # transcriptions = get_transcriptions(path_to_transcriptions, f + '.txt')
    # sample = split_wav(wav, emotions)

    # Load the video first, then its emotion annotations, and cut the video
    # into per-utterance frame stacks.
    avi = get_avi(path_to_avi, f + '.avi')
    emotions = get_emotions(path_to_emotions, f + '.txt')
    avi_sample = split_avi(avi, emotions)

    # Show one annotation entry as a sanity check.
    print(emotions[17])

In [29]:
# Number of utterance segments split_avi returned for this video
len(avi_sample)

103

In [None]:
# Check the shape of one segment's frame array next to the emotion entry at
# the same list position.
# NOTE(review): split_avi skips utterances whose clamped frame range is empty,
# so avi_sample[i] and emotions[i] describe the same utterance only if nothing
# was skipped — verify before trusting this pairing.
i = 102  # hard-coded position; the video-writing cell reads this `i` as well
print(avi_sample[i]["frames"].shape, emotions[i])

In [None]:
# Write the frames of segment `i` to an XVID-encoded AVI file.
output_path = 'cropped_video.avi'
fourcc = cv2.VideoWriter_fourcc(*'XVID')
# NOTE(review): playback rate is hard-coded; presumably it matches the source
# video's frame rate — confirm against the value returned by get_avi.
fps = 29.97

segment_frames = avi_sample[i]["frames"]
# Take the frame size from the data instead of hard-coding it: VideoWriter
# wants (width, height), and frames that do not match the declared size are
# typically not written at all.
frame_height, frame_width = segment_frames.shape[1:3]
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

if not out.isOpened():
    # Without this check a writer that failed to open would silently produce
    # an empty or missing file.
    out.release()
    raise RuntimeError(f"Could not open video writer for {output_path}")

try:
    for frame in segment_frames:
        out.write(frame)
finally:
    # Always finalise the file, even if writing a frame fails.
    out.release()

In [10]:
# Test get_emotions function: show entries 10, 11 and 12 of the parsed list.
# NOTE(review): `filename` is not assigned in any cell visible here, and the
# saved output lists Ses02F ids although the cells above only scan the first
# session — this looks like output from an earlier kernel state. Define
# `filename` explicitly and re-run to confirm.
get_emotions(path_to_emotions, filename)[10:13]

[{'start': 78.1061,
  'end': 87.7428,
  'id': 'Ses02F_impro01_F010',
  'v': 2.0,
  'a': 3.5,
  'd': 2.0,
  'emotion': 'ang',
  'emo_evo': [0.16666666666666666,
   0.0,
   0.0,
   0.16666666666666666,
   0.0,
   0.6666666666666666,
   0.0,
   0.0,
   0.0,
   0.0]},
 {'start': 88.3022,
  'end': 89.9978,
  'id': 'Ses02F_impro01_F011',
  'v': 2.0,
  'a': 4.0,
  'd': 3.0,
  'emotion': 'ang',
  'emo_evo': [0.2, 0.0, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0]},
 {'start': 96.4709,
  'end': 99.7098,
  'id': 'Ses02F_impro01_F012',
  'v': 2.0,
  'a': 3.5,
  'd': 2.5,
  'emotion': 'dis',
  'emo_evo': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.6000000000000001,
   0.0,
   0.0,
   0.2,
   0.2]}]