# Feature extraction

## Libraries

In [None]:
import pandas as pd

from tqdm import tqdm
from pathlib import Path

from mbp.preprocessing.loader import Loader
from mbp.feature_extraction.open_face import OpenFaceFeatureExtractor
from mbp.feature_extraction.open_smile import OpenSmileFeatureExtractor

from scripts.data_objects import SIT_PARTS, SITVersion
from scripts.datasets import sit_datasets
from scripts.utils import extract_id_clip

## OpenFace

In [None]:
def extract_open_face_features(dataset):
    """Run the OpenFace extractor on every video of ``dataset``.

    The videos are obtained from a ``Loader`` built with
    ``dataset.corrected`` and ``dataset.video_extension``. The extractor is
    given the ``open_face_features`` sub-directory of ``dataset.processed``,
    which is created (including parents) when it does not exist yet.

    Args:
        dataset: Object exposing ``corrected``, ``video_extension`` and
            ``processed`` attributes.
    """
    print(f'Extracting dataset from: {dataset.corrected}...')
    videos = Loader(dataset.corrected, dataset.video_extension).load()

    features_dir = Path(dataset.processed) / 'open_face_features'
    features_dir.mkdir(parents=True, exist_ok=True)

    # Extractor flags: everything is enabled except `tracked`.
    extractor_flags = dict(
        correct_timespan=True,
        aus=True,
        au_static=True,
        pose=True,
        tracked=False,
        gaze=True,
    )
    open_face = OpenFaceFeatureExtractor(features_dir, **extractor_flags)

    for video in tqdm(videos):
        open_face.run(video)

In [None]:
# Run the OpenFace extraction once for every dataset registered in
# `sit_datasets`.
for dataset in sit_datasets.values():
    extract_open_face_features(dataset)

## OpenSmile

In [None]:
def extract_features_for_participant(data: pd.DataFrame,
                                     feature_extractor,
                                     dataset_version: type[SITVersion],
                                     participant_id: str) -> pd.DataFrame:
    """Collect the extracted features of one participant.

    For the online version every video of the participant is processed and
    the parts are looked up by the row's ``clip`` value. For any other
    version only the last video (ordered by ``path``) is processed, using
    the parts registered under clip ``'2'``. Parts spoken by the actress
    are skipped.

    Args:
        data: Frame with at least the columns ``id`` and ``path`` (and
            ``clip`` for the online version).
        feature_extractor: Object whose ``run(path, start=..., end=...)``
            returns the path of a CSV file holding the extracted features.
        dataset_version: SIT version; selects the ``SIT_PARTS`` entry and
            decides which videos are analysed.
        participant_id: Value of the ``id`` column to select rows by.

    Returns:
        The per-part feature frames concatenated, each tagged with ``part``
        and ``id`` columns. The index is reset, so the previous per-part
        row index is kept as an ``index`` column.

    Raises:
        ValueError: If no features were extracted for the participant
            (no matching videos, or every part was skipped).
    """
    is_online = dataset_version == SITVersion.ONLINE

    participant_videos = (data[data['id'] == participant_id]
                          .sort_values(by='path'))
    if not is_online:
        # analyze only last video
        participant_videos = participant_videos.tail(1)

    all_features = []
    for _, row in participant_videos.iterrows():
        # if desktop version, only second part is analysed
        video_clip = row['clip'] if is_online else '2'
        parts = SIT_PARTS[dataset_version][video_clip]

        for part in parts:
            if part.speaker == 'actress':
                # Skip processing when the actress is speaking
                continue
            filepath = feature_extractor.run(row['path'],
                                             start=part.start,
                                             end=part.end)
            try:
                features = pd.read_csv(filepath)
            finally:
                # The extractor output is only an intermediate file; remove
                # it even when parsing fails so no stray CSV is left behind.
                filepath.unlink()
            features['part'] = part.name
            features['id'] = participant_id
            all_features.append(features)

    if not all_features:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f'No features extracted for participant: {participant_id}')

    return pd.concat(all_features).reset_index()


def extract_features(data, output_path, dataset):
    """Write one OpenSmile feature CSV per participant into ``output_path``.

    ``data`` gains ``id`` and ``clip`` columns in place, filled from
    ``extract_id_clip`` applied to the stem of each ``path`` entry.
    Participants found in ``dataset.blacklist`` and participants whose
    ``<id>.csv`` already exists in ``output_path`` are skipped. Any
    exception raised while processing a participant is printed and the loop
    moves on to the next one.

    Args:
        data: Frame with a ``path`` column of path objects.
        output_path: Directory path object that receives the CSV files.
        dataset: Object exposing ``blacklist`` and ``SIT_version``.
    """
    smile = OpenSmileFeatureExtractor(output_path)

    # Applied on the transposed frame, so each original row is visited as a
    # column; the result is transposed back before assignment.
    data[['id', 'clip']] = data.T.apply(lambda row:
                                        extract_id_clip(row['path'].stem)).T

    for participant_id in tqdm(data['id'].unique()):
        target = output_path / f'{participant_id}.csv'
        # Nothing to do for blacklisted or already exported participants.
        if participant_id in dataset.blacklist or target.is_file():
            continue

        try:
            participant_features = extract_features_for_participant(
                data, smile, dataset.SIT_version, participant_id)
            participant_features.to_csv(target, index=False)
        except Exception as e:
            # Best effort: report the failure and continue with the rest.
            print(f'Error processing video: {participant_id}')
            print(f'Error message: {e}')


In [None]:
# Run the OpenSmile extraction once for every dataset registered in
# `sit_datasets`.
for dataset in sit_datasets.values():
    print(f'Extracting features for: {dataset.corrected}...')
    loader = Loader(dataset.corrected, dataset.video_extension)
    data = loader.load()
    # Wrap the loaded entries in a single-column frame; extract_features
    # reads them from the 'path' column.
    data = pd.DataFrame(data, columns=['path'])

    # Per-participant CSV files are written below the dataset's processed
    # directory.
    output_path = Path(dataset.processed) / 'open_smile_features'
    output_path.mkdir(exist_ok=True, parents=True)

    extract_features(data, output_path, dataset)