In [None]:
import pandas as pd

from tqdm import tqdm

# Audio segmentation library
from inaSpeechSegmenter import Segmenter
from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid

In [None]:
# Directory holding the source videos. segment_audio() builds each media path by
# appending '<video_id>.mp4' directly to this string, so it must end with '/'.
BASE_PATH = '../data/videos/'

In [None]:
def segment_audio(video_id):
    """Run the audio segmenter on the video file of one video.

    The media path is BASE_PATH followed by '<video_id>.mp4'. The call is
    delegated to the module-level ``seg`` object, which therefore has to be
    created before this function is called.

    Args:
        video_id: Identifier of the video; used as the file name stem.

    Returns:
        Whatever ``seg`` returns for that media file (the parsing loop
        iterates over it as (label, start, end) tuples).
    """
    media_path = f'{BASE_PATH}{video_id}.mp4'
    return seg(media_path)

# Data

In [None]:
# Load the parsed video metadata (semicolon-separated) and keep only the first
# row for every video id.
df_videos = (
    pd.read_csv('../data/processed/parsed_dataset.csv', sep=';')
    .drop_duplicates(subset=['id'])
)
df_videos.head()

# Audio segmentation

In [None]:
# Create an instance of the speech segmenter.
# This loads the neural networks and may take a few seconds.
# Warnings emitted while loading do not affect the results.
seg = Segmenter()

# Parsing videos to extract audio segmentation

In [None]:
# Accumulator for the parsing loop: one summary dict per successfully segmented video.
videos_segmentation_parsed = []

In [None]:
# Segmenter label -> per-video boolean flag that the label switches on.
LABEL_FLAGS = {
    'music': 'video_contains_music',
    'male': 'video_contains_male',
    'female': 'video_contains_female',
    'noEnergy': 'video_contains_no_energy',
    'noise': 'video_contains_noise',
}

# Ids already handled in this session. They are derived from the results list
# rather than from the DataFrame that is only built in a later cell, so this
# cell works on a fresh kernel (Restart & Run All) and can be re-run after an
# interruption without segmenting the same video twice.
parsed_ids = {parsed['video_id'] for parsed in videos_segmentation_parsed}
pending_ids = df_videos.loc[~df_videos['id'].isin(parsed_ids), 'id']

for video in tqdm(pending_ids):
    try:
        segmentation = segment_audio(video)
    except Exception as error:
        # Best effort: a video that cannot be segmented is skipped, but the
        # failure is reported instead of being silently swallowed. Catching
        # Exception (not a bare except) keeps the loop interruptible.
        tqdm.write(f'Skipping video {video}: {error!r}')
        continue

    # Key order defines the column order of the resulting DataFrame.
    video_segmentations = {
        'video_id': video,
        'video_contains_music': False,
        'video_contains_male': False,
        'video_contains_female': False,
        'video_contains_noise': False,
        'video_contains_no_energy': False,
        'segments': []}

    for label, start, end in segmentation:
        # Labels without a dedicated flag are still recorded as segments.
        flag = LABEL_FLAGS.get(label)
        if flag is not None:
            video_segmentations[flag] = True

        video_segmentations['segments'].append(
            {'label': label, 'start': start, 'end': end})

    videos_segmentation_parsed.append(video_segmentations)

In [None]:
# Column order mirrors the keys of the per-video dicts built by the parsing loop.
# Passing it explicitly keeps the schema (notably 'video_id', the key of the
# merge performed further down) even when no video could be segmented and the
# results list is empty; otherwise the frame would have no columns at all.
segmentation_columns = [
    'video_id',
    'video_contains_music',
    'video_contains_male',
    'video_contains_female',
    'video_contains_noise',
    'video_contains_no_energy',
    'segments',
]
df_videos_segmentation_parsed = pd.DataFrame(videos_segmentation_parsed, columns=segmentation_columns)
# Saved without the index column.
df_videos_segmentation_parsed.to_csv('../data/processed/audio_segmentation_dataset.csv', index=False)

# Audio segment dataframe

In [None]:
# Flatten the per-video results into one row per audio segment, each row
# carrying the id of the video it belongs to.
audio_segments_list = [
    {
        'video_id': parsed_video['video_id'],
        'label': piece['label'],
        'start': piece['start'],
        'end': piece['end'],
    }
    for parsed_video in videos_segmentation_parsed
    for piece in parsed_video['segments']
]

audio_segments_df = pd.DataFrame(audio_segments_list)
# Write the flattened segments to CSV (index omitted).
audio_segments_df.to_csv('../data/processed/video_audio_segments.csv', index=False)

# Saving dataset

In [None]:
# Attach the segmentation summary to every video row; videos without a
# segmentation keep NaN in the columns coming from the right-hand frame.
df_segmented_videos = pd.merge(
    df_videos,
    df_videos_segmentation_parsed,
    how='left',
    left_on='id',
    right_on='video_id',
)

In [None]:
# Keep only the videos that got a segmentation (non-null 'video_id'), then drop
# that column since it duplicates 'id'.
df_segmented_videos = (
    df_segmented_videos
    .dropna(subset=['video_id'])
    .drop(columns=['video_id'])
)

In [None]:
# Persist the merged dataset to CSV, leaving out the DataFrame index.
df_segmented_videos.to_csv('../data/processed/videos_with_segmented_audio.csv', index=False)