# Purpose of this Notebook

To make it easy for the team to train, I'll create a tar.gz file that contains this directory:
```
- data
  - train
    - text
    - video
    - audio
  - valid
    - text
    - video
    - audio
  - test
    - text
    - video
    - audio
```


In [None]:
!pip install tqdm



In [None]:
# Mount Google Drive at /content/drive so the notebook can read the project folder.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!tar -xzvf /content/drive/MyDrive/DS565_Project/IEMOCAP_full_release.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.phseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.phseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.stseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.stseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.syseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.syseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.wdseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.wdseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_sc

# Reading train-valid-test

Load the metadata and the train/validation/test split.

In [None]:
import os
import json
import pandas as pd
# Shared project folder on the mounted Drive.
data_folder = '/content/drive/MyDrive/DS565_Project/'
iemocap = os.path.join(data_folder, 'IEMOCAP_full_release')

# metadata.json: one entry per utterance name.
metadata_path = os.path.join(data_folder, 'metadata.json')
with open(metadata_path, 'r') as f_in:
  metadata = json.load(f_in)

# One CSV per split, read in train / val / test order.
train, val, test = (
    pd.read_csv(os.path.join(data_folder, f'{split_name}.csv'))
    for split_name in ('train', 'val', 'test')
)

split_sizes = [len(train), len(val), len(test)]
print(len(metadata), *split_sizes)

# The three splits together must account for every metadata entry.
assert len(metadata) == sum(split_sizes)
train.head()

10039 6023 2008 2008


Unnamed: 0,name,emotion
0,Ses02M_impro08_M013,6
1,Ses01M_impro02_M012,8
2,Ses04F_script03_2_M020,10
3,Ses04F_script01_1_F042,0
4,Ses05M_script03_2_M016,4


In [None]:
# label id (as a string key) -> emotion abbreviation
label_mapping_path = os.path.join(data_folder, 'label_mapping.json')
with open(label_mapping_path, 'r') as f_in:
  label_mapping = json.load(f_in)

label_mapping

{'0': 'ang',
 '1': 'dis',
 '2': 'exc',
 '3': 'fea',
 '4': 'fru',
 '5': 'hap',
 '6': 'neu',
 '7': 'oth',
 '8': 'sad',
 '9': 'sur',
 '10': 'xxx'}

Make the output folder

In [None]:
import os

# Build data/<split>/<modality> for every combination. os.makedirs creates the
# intermediate 'data' and 'data/<split>' levels itself, and exist_ok=True lets
# this cell be re-run without raising FileExistsError.
for split in ('train', 'valid', 'test'):
  for modality in ('text', 'video', 'audio'):
    os.makedirs(os.path.join('data', split, modality), exist_ok=True)

In [None]:
# each metadata value is a list: [timestamp, emotion, session, dialog name]
metadata['Ses01F_impro01_F000']

['[6.2901 - 8.2357]', 'neu', 'Session1', 'Ses01F_impro01']

In [None]:
import os
def get_text_video_audio(name:str, meta_lookup:dict=None,
                         iemocap_root:str='/content/IEMOCAP_full_release') -> tuple:
  """
  Build the source paths for one utterance's text, video and audio.

  Input:
    name: utterance name, e.g. 'Ses01F_impro01_F000'; must be a key of the lookup
    meta_lookup: optional dict mapping utterance name ->
      [timestamp, emotion, session, dialog name]; when omitted, the
      notebook-level `metadata` dict is used
    iemocap_root: directory that holds the extracted IEMOCAP release
  Output: tuple ((text_path, name), (video_path, timestamp), audio_path)
    - text_path: <root>/<session>/dialog/transcriptions/<dialog>.txt
    - video_path: <root>/<session>/dialog/avi/DivX/<dialog>.avi
    - audio_path: <root>/<session>/sentences/wav/<dialog>/<name>.wav
  Raises: KeyError if name is not in the lookup
  """
  if meta_lookup is None:
    # fall back to the dict loaded from metadata.json earlier in the notebook
    meta_lookup = metadata

  meta = meta_lookup[name]
  timestamp = meta[0]
  # meta[1] is the emotion label; it is not needed to locate the files
  session = meta[2]
  improv_script = meta[3]

  dialog_dir = os.path.join(iemocap_root, session, 'dialog')

  # text: one transcript file per dialog
  text_path = os.path.join(dialog_dir,
                           'transcriptions',
                           f'{improv_script}.txt')

  # video: one recording per dialog
  video_path = os.path.join(dialog_dir,
                            'avi',
                            'DivX',
                            f"{improv_script}.avi")

  # audio: one wav per utterance, grouped in a folder per dialog
  audio_path = os.path.join(iemocap_root,
                            session,
                            'sentences',
                            'wav',
                            improv_script,
                            name+'.wav')

  return (
      (text_path, name),
      (video_path, timestamp),
      audio_path
  )

# Resolve the source paths of every utterance, in the row order of each split's CSV.
train_paths = list(map(get_text_video_audio, train['name']))
valid_paths = list(map(get_text_video_audio, val['name']))
test_paths = list(map(get_text_video_audio, test['name']))

In [None]:
# peek at one entry: ((text_path, name), (video_path, timestamp), audio_path)
train_paths[0]

(('/content/IEMOCAP_full_release/Session2/dialog/transcriptions/Ses02M_impro08.txt',
  'Ses02M_impro08_M013'),
 ('/content/IEMOCAP_full_release/Session2/dialog/avi/DivX/Ses02M_impro08.avi',
  '[118.8972 - 121.6360]'),
 '/content/IEMOCAP_full_release/Session2/sentences/wav/Ses02M_impro08/Ses02M_impro08_M013.wav')

# Text

In [None]:
# extract text
def extract_text(text_path, name):
  """
  Return the transcript of one utterance from a dialog transcription file.

  The first line whose leading space-separated field equals `name` is used;
  its first two fields are dropped and the remainder is returned with
  surrounding whitespace stripped.

  Input:
    text_path: path to the dialog's transcription .txt file
    name: utterance name to look up, e.g. 'Ses02M_impro08_M013'
  Output: the utterance text ('' if the line has nothing after its second field)
  Raises: IndexError if no line starts with `name`
  """
  with open(text_path, 'r') as f:
    # stream the file and stop at the first match
    for line in f:
      fields = line.split(' ', 2)
      if fields[0] == name:
        # fields[2:] is empty when the line has fewer than three fields
        return " ".join(fields[2:]).strip()

  raise IndexError(f"utterance {name!r} not found in {text_path}")

# The first element of every paths entry is the (text_path, name) pair extract_text needs.
train_text = [extract_text(text_path, utt_name) for (text_path, utt_name), _, _ in train_paths]
valid_text = [extract_text(text_path, utt_name) for (text_path, utt_name), _, _ in valid_paths]
test_text = [extract_text(text_path, utt_name) for (text_path, utt_name), _, _ in test_paths]

In [None]:
def save_text(text, name, split):
  """Write `text` to data/<split>/text/<name>.txt, overwriting any existing file."""
  target_dir = os.path.join('data', split, 'text')
  target_file = os.path.join(target_dir, f"{name}.txt")
  with open(target_file, 'w') as handle:
    handle.write(text)

# Plain loops rather than list comprehensions: save_text is called only for its
# side effect, so there is no result list worth building.
for split, names, texts in (
    ('train', train['name'], train_text),
    ('valid', val['name'], valid_text),
    ('test', test['name'], test_text),
):
  for name, text in zip(names, texts):
    save_text(text, name, split)

print('Done')

Done


# Video

Note: ChatGPT helped with this part.

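1. Extract each dialog video and keep only the frames inside the utterance's timestamp.
2. If the speaker is male, keep the left half of the frame; if female, keep the right half.
  - The speaker's gender can be read from the utterance name.
  - Ses01F_impro01_M007 -> the `M` in `M007` means male, then right half
  - **TODO:** the rule above says "left half" for male but this example says "right half" — confirm which is correct. The `extract_video` code below does not crop; it writes full-width frames.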

In [None]:
import cv2
from tqdm import tqdm
def extract_video(video_path, timestamp, name, split):
  """
  Cut one utterance out of a dialog video and save it as data/<split>/video/<name>.mp4.

  Input:
    video_path: path to the dialog recording
    timestamp: '[start - end]' in seconds, e.g. '[118.8972 - 121.6360]'
    name: utterance name, used as the output file name
    split: split folder under data/ ('train', 'valid' or 'test')
  Output: None. Frames int(start*fps) through int(end*fps), inclusive, are
    written with the source frame size and frame rate; no cropping is applied.
  Raises: IOError if OpenCV cannot open video_path
  """
  bounds = timestamp.strip('[').strip(']').split(' - ')
  start_time = float(bounds[0])
  end_time = float(bounds[1])

  cap = cv2.VideoCapture(video_path)
  out = None
  try:
    # without this check an unreadable source would silently produce no clip
    if not cap.isOpened():
      raise IOError(f"could not open video: {video_path}")

    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')

    start_frame = int(start_time * fps)
    end_frame = int(end_time * fps)

    out_path = os.path.join(
        'data',
        split,
        'video',
        f'{name}.mp4'
    )

    out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    # set start frame
    cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

    # end_frame is inclusive; stop early if the video runs out of frames
    for _ in range(start_frame, end_frame + 1):
      ret, frame = cap.read()
      if not ret:
        break
      out.write(frame)
  finally:
    # release both handles even if reading or writing raised
    cap.release()
    if out is not None:
      out.release()

# Same extraction for every split; only the name column, the paths list and the
# output folder differ.
for split, names, split_paths in (
    ('train', train['name'], train_paths),
    ('valid', val['name'], valid_paths),
    ('test', test['name'], test_paths),
):
  # zip pairs rows by position (the order the paths lists were built in), so it
  # does not rely on the DataFrame index being 0..n-1
  for name, paths in tqdm(zip(names, split_paths), total=len(names), desc=split):
    video_tuple = paths[1]  # (video_path, timestamp)
    extract_video(*video_tuple, name, split)


100%|██████████| 6023/6023 [1:06:50<00:00,  1.50it/s]
100%|██████████| 2008/2008 [22:19<00:00,  1.50it/s]
100%|██████████| 2008/2008 [22:03<00:00,  1.52it/s]


# Audio

In [None]:
import shutil
# Copy (rather than move) each utterance's wav into data/<split>/audio/. Moving
# removes the file from the extracted IEMOCAP tree, so a second run of this cell
# would fail with FileNotFoundError on the first source path.
for split, names, split_paths in (
    ('train', train['name'], train_paths),
    ('valid', val['name'], valid_paths),
    ('test', test['name'], test_paths),
):
  # zip pairs rows by position, matching the order the paths lists were built in
  for name, paths in tqdm(zip(names, split_paths), total=len(split_paths), desc=split):
    src_path = paths[2]  # audio_path
    dest_path = f"data/{split}/audio/{name}.wav"
    shutil.copyfile(src_path, dest_path)


100%|██████████| 6023/6023 [00:00<00:00, 8853.86it/s]
100%|██████████| 2008/2008 [00:00<00:00, 17315.73it/s]
100%|██████████| 2008/2008 [00:00<00:00, 13529.08it/s]


# Final things

In [None]:
# Copy the metadata, label mapping and split CSVs from Drive into data/
# (the sanity check below reads data/metadata.json).
!cp /content/drive/MyDrive/DS565_Project/metadata.json data/metadata.json
!cp /content/drive/MyDrive/DS565_Project/label_mapping.json data/label_mapping.json
!cp /content/drive/MyDrive/DS565_Project/train.csv data/train.csv
!cp /content/drive/MyDrive/DS565_Project/val.csv data/val.csv
!cp /content/drive/MyDrive/DS565_Project/test.csv data/test.csv

# Sanity Check

Check if everything is okay before tar!

In [None]:
import json
from collections import deque

# check if there are any instances missing
modalities = ['text','audio','video']
splits = ['train','valid','test']

# metadata.json is the same for every modality: read it once and close the file
with open('data/metadata.json', 'r') as f_in:
  metadata = json.load(f_in)

missing = {modal: [] for modal in modalities}
# check for each modality
for modal in modalities:
  modal_files = []
  for split in splits:
    modal_files.extend([os.path.splitext(f)[0] for f in os.listdir(os.path.join('data', split, modal))])

  # check if any duplicate files, which means that same file added to train and test
  present = set(modal_files)
  assert len(present) == len(modal_files)

  # Every metadata key without a file in any split is missing for this modality.
  # Names are filtered directly: calling popleft() whenever a name is found drops
  # whichever name is first in the queue, which is not the found one as soon as
  # an earlier name is missing, so the reported names would be wrong.
  missing[modal] = deque(name for name in metadata if name not in present)

  # uncomment to fail loudly instead of only printing
  #assert len(missing[modal]) == 0
print(missing)

{'text': deque([]), 'audio': deque([]), 'video': deque([])}


# Zipping!

In [None]:
!tar -czvf data.tar.gz data

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
data/valid/text/Ses01F_impro06_F005.txt
data/valid/text/Ses05F_script01_3_M006.txt
data/valid/text/Ses04M_script01_1_F028.txt
data/valid/text/Ses05F_script02_2_M009.txt
data/valid/text/Ses01M_script01_3_M030.txt
data/valid/text/Ses01F_script02_2_M045.txt
data/valid/text/Ses03F_impro06_M013.txt
data/valid/text/Ses05F_impro07_F020.txt
data/valid/text/Ses05M_impro04_M001.txt
data/valid/text/Ses04M_script02_2_F017.txt
data/valid/text/Ses01M_script02_1_M019.txt
data/valid/text/Ses05M_script01_1b_M022.txt
data/valid/text/Ses04M_impro02_F011.txt
data/valid/text/Ses01M_script02_2_M025.txt
data/valid/text/Ses05M_script03_2_F012.txt
data/valid/text/Ses03M_script02_2_F032.txt
data/valid/text/Ses02F_impro06_M010.txt
data/valid/text/Ses04F_script01_3_F014.txt
data/valid/text/Ses01M_impro04_M021.txt
data/valid/text/Ses01M_impro07_M002.txt
data/valid/text/Ses03F_script01_1_M014.txt
data/valid/text/Ses04M_impro03_F011.txt
data/valid/text

In [None]:
import os
# archive size in megabytes: 1e6 bytes per MB (10e6 equals 1e7, which would
# under-report the size by a factor of ten)
os.path.getsize('data.tar.gz') / 1e6

731.0699932

In [None]:
!mkdir /content/drive/MyDrive/DS565_Project/data
!cp data.tar.gz /content/drive/MyDrive/DS565_Project/data/dataset.tar.gz

# Reorganize data.tar.gz and split to audio, text, video
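After compressing the combined dataset, unpack it again and regroup it by modality so that audio, text, and video can each be shipped as a separate archive.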

In [None]:
# Unpack the combined archive from Drive; it recreates the data/ folder in the working directory.
!tar -xzvf /content/drive/MyDrive/DS565_Project/data/dataset.tar.gz

In [None]:
import os
import shutil
data_folder = 'data'
splits = ['train', 'valid', 'test']
modalities = ['audio', 'text', 'video']

# Turn data/<split>/<modal>/ into <modal>/<split>/ so each modality can be
# archived on its own.
for modal in modalities:
  # create <modal>/ up front so that <modal>/<split> has a parent directory
  os.makedirs(modal, exist_ok=True)

  for split in splits:
    # the extracted archive stores files directly under data/<split>/<modal>/
    src_path = os.path.join(data_folder, split, modal)
    dest_path = os.path.join(modal, split)
    if os.path.isdir(dest_path):
      # already reorganized; moving into an existing directory would nest the
      # source inside it (<modal>/<split>/<modal>)
      continue
    shutil.move(src_path, dest_path)

In [None]:
# Give every modality folder its own copy of the lookups and the split CSVs.
shared_files = ['label_mapping.json', 'metadata.json', 'train.csv', 'val.csv', 'test.csv']
for modal in modalities:
  for shared_name in shared_files:
    shutil.copy(f'data/{shared_name}', os.path.join(modal, shared_name))

In [None]:
!tar -czvf audio.tar.gz audio
!tar -czvf text.tar.gz text
!tar -czvf video.tar.gz video

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
audio/train/Ses04M_script01_3_F007.wav
audio/train/Ses02F_script01_2_M005.wav
audio/train/Ses04M_script01_1_M029.wav
audio/train/Ses01F_script03_2_F022.wav
audio/train/Ses04F_script03_2_F025.wav
audio/train/Ses03M_impro06_F001.wav
audio/train/Ses04M_impro06_M010.wav
audio/train/Ses01F_script02_2_M002.wav
audio/train/Ses04M_script02_2_F012.wav
audio/train/Ses05F_impro03_M031.wav
audio/train/Ses02M_script01_1_F026.wav
audio/train/Ses04F_impro03_M022.wav
audio/train/Ses04F_script02_2_F047.wav
audio/train/Ses02F_impro02_F010.wav
audio/train/Ses03M_impro03_F036.wav
audio/train/Ses02M_script01_1_F029.wav
audio/train/Ses05F_impro02_F030.wav
audio/train/Ses02M_impro03_F022.wav
audio/train/Ses02F_impro01_M008.wav
audio/train/Ses01M_impro04_F012.wav
audio/train/Ses05F_impro01_F001.wav
audio/train/Ses05F_impro07_F029.wav
audio/train/Ses04F_impro05_F006.wav
audio/train/Ses01F_script01_1_M012.wav
audio/train/Ses02F_impro07_F022.wav
au

In [None]:
# Copy the three per-modality archives to the project's data folder on Drive.
!cp audio.tar.gz /content/drive/MyDrive/DS565_Project/data/audio.tar.gz
!cp text.tar.gz /content/drive/MyDrive/DS565_Project/data/text.tar.gz
!cp video.tar.gz /content/drive/MyDrive/DS565_Project/data/video.tar.gz