In [None]:
import json
import pandas as pd
import subprocess
import time

from pydub import AudioSegment
from tqdm import tqdm

# Directory path intended for WAV output.
# NOTE(review): no visible cell references this constant; the trimming cell
# exports MP3 files to '../data/audios/' instead — confirm whether it is still needed.
WAV_OUTPUT_DIR = '../data/processed/wav/'

# Video data

In [None]:
# Load the per-video table and keep only the first row seen for each `id`.
videos_csv_path = '../data/processed/videos_with_segmented_audio.csv'
df_videos = pd.read_csv(videos_csv_path).drop_duplicates(subset=['id'])
df_videos.head()

# Checking what percentage of the video is made of music

In [None]:
# Load the labelled audio segments and add each segment's length (end - start).
audio_segments_df = pd.read_csv('../data/processed/video_audio_segments.csv')
audio_segments_df = audio_segments_df.assign(
    total_duration=audio_segments_df['end'] - audio_segments_df['start']
)

# Summed segment length per (video_id, label) pair, flattened back to plain columns.
grouped_audio_segments = (
    audio_segments_df
    .groupby(['video_id', 'label'])['total_duration']
    .sum()
    .reset_index()
)

# Keep only the 'music' rows and rename the total so it is self-describing after a merge.
is_music = grouped_audio_segments['label'] == 'music'
videos_with_music = (
    grouped_audio_segments
    .loc[is_music, ['video_id', 'total_duration']]
    .rename(columns={'total_duration': 'total_music_duration'})
)
videos_with_music.head()

## Merging dataframe with audio segment

In [None]:
# Attach each video's total music duration and express it as a percentage of
# the video's `duration`. The left join keeps every video; videos without a
# 'music' row get a music duration of 0.
df_merged_videos_annotations = (
    df_videos
    .merge(videos_with_music, left_on='id', right_on='video_id', how='left')
    .drop(columns='video_id')
    .assign(
        total_music_duration=lambda merged: merged['total_music_duration'].fillna(0)
    )
    .assign(
        percentage_of_video_made_of_music=lambda merged: (
            merged['total_music_duration'] * 100.00 / merged['duration']
        )
    )
    .set_index('id')
)
df_merged_videos_annotations.head()

# Trimming audio to keep only the male and female voice segments

In [None]:
# Build one voice-only MP3 per video by concatenating the stretches of audio
# labelled 'male' or 'female' in `audio_segments_df`.
#
# Consecutive voice segments are merged into a single stretch when the next
# one starts less than one unit (presumably one second — the values are
# multiplied by 1000 before being used as pydub millisecond offsets) after the
# current one ends. Every emitted stretch is extended by that same one-unit
# padding past the end of its last segment.

# Select the voice rows once and bucket them per video, instead of re-scanning
# the whole segments frame for every video inside the loop.
voice_segments_by_video = {
    segment_video_id: segments
    for segment_video_id, segments in audio_segments_df.loc[
        audio_segments_df['label'].isin(['male', 'female'])
    ].groupby('video_id')
}

successful_audios_with_voice = 0  # number of videos for which an MP3 was exported
for video_id in tqdm(df_merged_videos_annotations.index):
    audio_voice_segments = voice_segments_by_video.get(video_id)
    if audio_voice_segments is None:
        # No voice segments: nothing could be exported, so skip decoding the video.
        continue

    # `shift(-1)` only means "start of the following segment" when the rows are
    # in chronological order, so sort explicitly rather than trusting CSV order.
    audio_voice_segments = audio_voice_segments.sort_values('start')
    audio_voice_segments = audio_voice_segments.assign(
        next_start=audio_voice_segments['start'].shift(-1)
    )

    video_filename = '../' + df_merged_videos_annotations.loc[[video_id], 'video_path'].values[0]
    audio = AudioSegment.from_file(video_filename)

    trimmed_audio_voices = None  # running concatenation of the kept stretches
    audio_start = audio_voice_segments['start'].min() * 1000
    for _, row in audio_voice_segments.iterrows():
        padded_end = row['end'] + 1
        # The next voice segment begins inside the padding: keep extending the
        # current stretch. On the last row `next_start` is NaN, the comparison
        # is False, and the final stretch is flushed below.
        if row['next_start'] < padded_end:
            continue

        voice_chunk = audio[audio_start:padded_end * 1000]
        if trimmed_audio_voices is None:
            trimmed_audio_voices = voice_chunk
        else:
            trimmed_audio_voices = trimmed_audio_voices + voice_chunk

        # The following stretch starts at the next voice segment.
        audio_start = row['next_start'] * 1000

    if trimmed_audio_voices is not None and len(trimmed_audio_voices) > 0:
        trimmed_audio_voices.export(f"../data/audios/{video_id}.mp3", format="mp3")
        successful_audios_with_voice += 1

# Saving transcribed video data

In [None]:
# Write the merged per-video table to CSV. `id` was set as the index in the
# merge cell, so with pandas' default `index=True` it becomes the first column.
df_merged_videos_annotations.to_csv('../data/processed/transcripted_processed_videos.csv')