# MUSE

## Imports

In [1]:
import tarfile
import os
from dataset import MELDDataset

## Data

In [2]:
def extract_data(file_name, data_folder):
    """Extract a gzip-compressed tar archive into ``data_folder``.

    Args:
        file_name (str): Path to the ``.tar.gz`` archive to unpack.
        data_folder (str): Destination directory. It is created (including
            parents) when it does not exist yet.

    Raises:
        tarfile.ReadError: If ``file_name`` is not a readable gzip tar archive.
        OSError: If the archive or the destination cannot be accessed.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_folder, exist_ok=True)

    # The context manager closes the archive even when extraction fails midway.
    with tarfile.open(file_name, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: reject members that would be
            # written outside data_folder (absolute paths, "..", bad links).
            tar.extractall(data_folder, filter="data")
        else:
            tar.extractall(data_folder)

In [3]:
def rename_files(data_folder):
    """Strip a leading "._" from every file name under ``data_folder``.

    The folder is walked recursively. A file is left untouched (and a message
    is printed) when removing the prefix would leave an empty name or when a
    file with the stripped name already exists, so an existing file is never
    replaced by the rename.

    Args:
        data_folder (str): Root directory to walk.
    """
    for root, _dirs, files in os.walk(data_folder):
        for file in files:
            if not file.startswith('._'):
                continue

            stripped_name = file[2:]
            source = os.path.join(root, file)
            target = os.path.join(root, stripped_name)

            # os.rename silently replaces an existing target on POSIX, which
            # would destroy the file that already carries the stripped name.
            if not stripped_name or os.path.lexists(target):
                print(f"Skipping {source}: cannot rename without overwriting")
                continue

            os.rename(source, target)
            
# Strip the leading "._" from file names under the test video folder.
rename_files('./data/test/video')

In [4]:
# extract_data("data/raw/test.tar.gz", "data/test")

## Dataset

In [27]:
import subprocess
import os
import subprocess
from pathlib import Path
import pandas as pd


def count_valid_corrupted_videos(folder_path, extensions=("mp4", "mov", "mkv", "avi")):
    """
    Split the video files under a folder into valid and corrupted ones.

    Every matching file is probed with ``ffprobe``. A file counts as valid
    when ffprobe exits with status 0 and prints a non-empty duration; anything
    else is reported as corrupted. No file is modified or deleted here.

    Args:
        folder_path (str): Path to the folder containing video files
            (searched recursively).
        extensions (tuple): Allowed video file extensions to check (default: common formats).

    Returns:
        valid_videos (list): List of valid video file paths.
        corrupted_videos (list): List of corrupted or invalid video file paths.

    Raises:
        RuntimeError: If the ``ffprobe`` executable cannot be found. Without
            this, every video would be reported as corrupted.
    """
    valid_videos = []
    corrupted_videos = []

    # Collect the files one extension at a time, as strings for subprocess.
    video_files = [
        str(path)
        for ext in extensions
        for path in Path(folder_path).rglob(f"*.{ext}")
    ]

    for video_path in video_files:
        # Ask ffprobe for the container duration only, printed as a bare value.
        command = [
            "ffprobe", "-v", "error", "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1", video_path
        ]
        try:
            result = subprocess.run(
                command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        except FileNotFoundError as e:
            # A missing ffprobe binary says nothing about the video itself;
            # abort instead of mislabelling the whole folder as corrupted.
            raise RuntimeError(
                "ffprobe executable not found; install FFmpeg and make sure it is on PATH"
            ) from e
        except Exception as e:
            # Best effort for any other per-file failure: log and mark corrupted.
            print(f"Error processing file {video_path}: {e}")
            corrupted_videos.append(video_path)
            continue

        if result.returncode == 0 and result.stdout.strip():
            valid_videos.append(video_path)
        else:
            # Either ffprobe failed or it could not report a duration.
            print(f"Corrupted video detected: {video_path}")
            corrupted_videos.append(video_path)

    return valid_videos, corrupted_videos

def delete_corrupted_videos(videos_paths, csv_path):
    """
    Delete corrupted video files and drop their rows from the annotation CSV.

    For each path the file is removed (a failure is logged, not raised) and
    the CSV row whose ``Dialogue_ID`` / ``Utterance_ID`` match the IDs encoded
    in the file name (``dia{dialogue_id}_utt{utterance_id}``) is dropped.
    The CSV at ``csv_path`` is overwritten with the remaining rows.

    Args:
        videos_paths (list): List of corrupted video file paths.
        csv_path (str): Path to the CSV file with ``Dialogue_ID`` and
            ``Utterance_ID`` columns; rewritten in place.
    """
    import re  # local import so the function does not rely on another cell

    id_pattern = re.compile(r"dia(\d+)_utt(\d+)")

    df = pd.read_csv(csv_path)
    for video_path in videos_paths:
        try:
            os.remove(video_path)
            print(f"Deleted corrupted video: {video_path}")
        except OSError as e:
            print(f"Error deleting file {video_path}: {e}")

        # Parse the IDs from the file name only: a directory component that
        # happens to contain "dia" or "utt" must not confuse the extraction.
        found = id_pattern.search(os.path.basename(video_path))
        if found is None:
            print(f"Could not parse dialogue/utterance IDs from {video_path}; CSV row kept")
            continue
        dialogue_id, utterance_id = int(found.group(1)), int(found.group(2))

        # Keep every row except the one describing this video.
        is_target = (df["Dialogue_ID"] == dialogue_id) & (df["Utterance_ID"] == utterance_id)
        df = df[~is_target]

    # Save the updated dataframe
    df.to_csv(csv_path, index=False)

def rename_test_videos(data_folder, prefix='t'):
    """Strip ``prefix`` from the start of matching file names under ``data_folder``.

    With the default ``prefix='t'`` a single leading "t" is removed
    (e.g. "tdia0_utt0.mp4" -> "dia0_utt0.mp4"). Pass a longer prefix such as
    "final_videos_test" to remove that instead. A file is skipped (and a
    message is printed) when the stripped name would be empty or is already
    taken, so an existing file is never overwritten.

    Args:
        data_folder (str): Root directory to walk recursively.
        prefix (str): Leading text to remove from file names (default: 't').
    """
    if not prefix:
        # Nothing to strip; every rename would be a no-op.
        return

    for root, _dirs, files in os.walk(data_folder):
        for file in files:
            if not file.startswith(prefix):
                continue

            new_name = file[len(prefix):]
            source = os.path.join(root, file)
            target = os.path.join(root, new_name)

            # os.rename silently replaces an existing target on POSIX.
            if not new_name or os.path.lexists(target):
                print(f"Skipping {source}: cannot rename without overwriting")
                continue

            os.rename(source, target)

In [37]:
# Probe every video under data/test/video with ffprobe and report how many
# were classified as valid vs. corrupted.
valid_videos, corrupted_videos = count_valid_corrupted_videos(
    'data/test/video')
print(f"Valid videos: {len(valid_videos)}")
print(f"Corrupted videos: {len(corrupted_videos)}")

Valid videos: 554
Corrupted videos: 0


In [38]:
# Delete the files flagged as corrupted above and drop their rows from the
# annotation CSV. NOTE: destructive - the videos are removed from disk and the
# CSV is rewritten in place.
delete_corrupted_videos(corrupted_videos, 'data/test/test_sent_emo.csv')

In [39]:
# Sanity check: the CSV should contain one row per valid video.
df = pd.read_csv('data/test/test_sent_emo.csv')
print(f"Number of lines in the csv file: {len(df)}")
print(f"Number of valid videos: {len(valid_videos)}")

# Index the annotated (Dialogue_ID, Utterance_ID) pairs once, so each video is
# checked with a set lookup instead of a scan over the whole dataframe.
annotated_ids = set(zip(df["Dialogue_ID"], df["Utterance_ID"]))

# Report every valid video that has no matching row in the CSV.
for video_path in valid_videos:
    # Parse the IDs from the file name only (dia{dialogue_id}_utt{utterance_id}.mp4),
    # so a directory name containing "dia" or "utt" cannot break the extraction.
    file_name = os.path.basename(video_path)
    dialogue_id = int(file_name.split("dia")[1].split("_")[0])
    utterance_id = int(file_name.split("utt")[1].split(".")[0])

    if (dialogue_id, utterance_id) not in annotated_ids:
        print(f"Missing: {video_path}")
        

Number of lines in the csv file: 554
Number of valid videos: 554


In [42]:
# Remove the videos that have no row in the CSV file.
# NOTE: destructive, and relies on `df` and `valid_videos` from the cells above.
annotated_ids = set(zip(df["Dialogue_ID"], df["Utterance_ID"]))

for video_path in valid_videos:
    # Parse the IDs from the file name only (dia{dialogue_id}_utt{utterance_id}.mp4),
    # so a directory name containing "dia" or "utt" cannot break the extraction.
    file_name = os.path.basename(video_path)
    dialogue_id = int(file_name.split("dia")[1].split("_")[0])
    utterance_id = int(file_name.split("utt")[1].split(".")[0])

    # A video without an annotation row cannot be used by the dataset.
    if (dialogue_id, utterance_id) not in annotated_ids:
        os.remove(video_path)
        print(f"Deleted: {video_path}")

In [44]:
# Build the dataset from the cleaned annotation CSV. root_dir points at the
# test-split folder (presumably where MELDDataset looks up the media files -
# TODO confirm in dataset.py).
dataset = MELDDataset(
    csv_file="data/test/test_sent_emo.csv",
    root_dir="data/test",
)

In [None]:
# Fetch the first sample and display it to inspect the fields an item exposes.
example = dataset[0]
example

{'audio_array': array([-0.00198962, -0.02142128, -0.02587057, ..., -0.06124197,
        -0.06868309, -0.0437346 ], dtype=float32),
 'sampling_rate': 22050,
 'transcript': 'also I was the point person on my company\x92s transition from the KL-5 to GR-6 system.',
 'emotion': 'neutral',
 'sentiment': 'neutral'}

In [None]:
# NOTE(review): the wildcard import hides where MultimodalClassifier comes from;
# prefer `from model import MultimodalClassifier` once it is confirmed that no
# other name from `model` is needed by later cells.
from model import *

emotions = dataset.get_emotions_list()
# Size the classifier by the number of DISTINCT labels: the list may repeat
# labels, and set() leaves the count unchanged if it is already unique.
model = MultimodalClassifier(num_classes=len(set(emotions)))

In [20]:
# Distinct emotion labels (set() collapses any repeated entries in the list).
set(dataset.get_emotions_list())

{'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'}

In [21]:
# Distinct sentiment labels (set() collapses any repeated entries in the list).
set(dataset.get_sentiments_list())

{'negative', 'neutral', 'positive'}