In [31]:
import os
import pandas as pd

# Convert video data to audio data
from moviepy.editor import VideoFileClip

# Process audio data (segment into fragments)
from pydub import AudioSegment

# Transcription from Dutch audio to English text
import whisper

## Reading the data and separating the dataframe by episodes

In [32]:
df_ep = pd.read_csv("client_data/Expiditie_Robinson_Data/Robinson22_structure.csv")

# Keep only the specified columns
columns_to_keep = ['Episode name', 'Act', 'Chapter', 'Segment', 'Start Time (seconds)', 'End Time (seconds)']
df_filtered = df_ep[columns_to_keep]

# Drop duplicate rows in the dataframe
df_filtered = df_filtered.drop_duplicates()

df_filtered

Unnamed: 0,Episode name,Act,Chapter,Segment,Start Time (seconds),End Time (seconds)
0,1,Aparte eilanden,Introductie,Uitleg programma,0,124
67,1,Aparte eilanden,Reality,Aankomst,124,350
132,1,Aparte eilanden,Introductie,Introductie expeditieleden,350,383
194,1,Aparte eilanden,Introductie,Introductie expeditieleden,383,443
255,1,Aparte eilanden,Introductie,Introductie expeditieleden,443,504
...,...,...,...,...,...,...
17624,17,Finale,Idents,Up next,2818,2838
17626,17,Finale,Finale,Interview,2838,3065
17631,17,Finale,Finale,Uitvoering finaleproef,3065,3603
17641,17,Finale,Finale,Uitslag finaleproef,3603,3678


In [33]:
# Find unique episode names to create separate dataframes for each
unique_episodes = df_filtered['Episode name'].unique()

# Dictionary to hold each episode dataframe
episode_dfs = {}

for episode in unique_episodes:
    # Filter dataframe for each unique episode and store it in the dictionary
    episode_dfs[f'df_ep_{episode}'] = df_filtered[df_filtered['Episode name'] == episode]

In [34]:
# Access a specific episode dataframe with episode_dfs['df_ep_1'], episode_dfs['df_ep_2'], etc.
episode_dfs['df_ep_1'] = episode_dfs['df_ep_1'].reset_index(drop=True)

## Fixing the problem with End Time column (present in episodes 2-13)

In [35]:
# Loop through episode numbers 2 to 13
for episode_num in range(2, 14):  # This will include episodes 2 to 13
    df = episode_dfs[f'df_ep_{episode_num}']  # Access the dataframe for the current episode
    
    # Ensure the dataframe index is reset to avoid KeyErrors
    df.reset_index(drop=True, inplace=True)
    
    # Loop through the dataframe rows except for the last one
    for i in range(len(df) - 1):
        # Update the end time of the current row to the start time of the next row using iloc
        df.iloc[i, df.columns.get_loc('End Time (seconds)')] = df.iloc[i + 1, df.columns.get_loc('Start Time (seconds)')]
    
    # Update the episode dataframe in episode_dfs dictionary
    episode_dfs[f'df_ep_{episode_num}'] = df

    print(f"Corrected df_ep_{episode_num}")


Corrected df_ep_2
Corrected df_ep_3
Corrected df_ep_4
Corrected df_ep_5
Corrected df_ep_6
Corrected df_ep_7
Corrected df_ep_8
Corrected df_ep_9
Corrected df_ep_10
Corrected df_ep_11
Corrected df_ep_12
Corrected df_ep_13


## Converting the .mov files into .mp3 files

In [26]:
# Define the source and destination directories
source_dir = 'client_data/Expiditie_Robinson_Episodes'
dest_dir = 'client_data/Expiditie_Robinson_Episodes_audio_mp3'

# Make sure the destination folder exists, if not, create it
os.makedirs(dest_dir, exist_ok=True)

In [15]:
# Loop through each .mov file in the source directory
for episode_number in range(1, 18):  # Assuming episodes 1 through 17
    source_filename = f'ER22_AFL{episode_number:02d}_MXF.mov'
    source_path = os.path.join(source_dir, source_filename)
    
    # Define the destination filename and path
    dest_filename = f'ER22_{episode_number}.mp3'
    dest_path = os.path.join(dest_dir, dest_filename)
    
    # Load the video file
    video = VideoFileClip(source_path)
    
    # Extract and save the audio as .wav
    video.audio.write_audiofile(dest_path)  # codec='pcm_s16le' specifies 16-bit WAV format

    print(f'Converted and saved: {dest_filename}')

print('All files have been converted.')

MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_1.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_1.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_2.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_2.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_3.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_3.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_4.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_4.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_5.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_5.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_6.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_6.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_7.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_7.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_8.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_8.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_9.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_9.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_10.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_10.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_11.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_11.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_12.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_12.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_13.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_13.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_14.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_14.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_15.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_15.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_16.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_16.mp3
MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_mp3\ER22_17.mp3


                                                                        

MoviePy - Done.
Converted and saved: ER22_17.mp3
All files have been converted.




## Converting the .mov files into .wav files

In [None]:
# Define the source and destination directories
source_dir = 'client_data/Expiditie_Robinson_Episodes'
dest_dir = 'client_data/Expiditie_Robinson_Episodes_audio'

# Make sure the destination folder exists, if not, create it
os.makedirs(dest_dir, exist_ok=True)

In [30]:
# Loop through each .mov file in the source directory
for episode_number in range(17, 18):  # Assuming episodes 1 through 17
    source_filename = f'ER22_AFL{episode_number:02d}_MXF.mov'
    source_path = os.path.join(source_dir, source_filename)
    
    # Define the destination filename and path
    dest_filename = f'ER22_{episode_number}.wav'
    dest_path = os.path.join(dest_dir, dest_filename)
    
    # Load the video file
    video = VideoFileClip(source_path)
    
    # Extract and save the audio as .wav
    video.audio.write_audiofile(dest_path, codec='pcm_s16le')  # codec='pcm_s16le' specifies 16-bit WAV format

    print(f'Converted and saved: {dest_filename}')

print('All files have been converted.')

MoviePy - Writing audio in client_data/Expiditie_Robinson_Episodes_audio_fragments\ER22_17.wav


                                                                        

MoviePy - Done.
Converted and saved: ER22_17.wav
All files have been converted.




## Separating the audio into fragments (mp3)

In [36]:
def fragment_episode_audio_mp3(episode_number):
    # Assuming episode_dfs is a dictionary containing dataframes for each episode
    df = episode_dfs.get(f'df_ep_{episode_number}')

    if df is None:
        print(f"No data found for episode {episode_number}.")
        return

    # Reset the index of the dataframe
    df = df.reset_index(drop=True)

    # Define the path to the source audio file based on the episode number
    source_audio_path = f'client_data/Expiditie_Robinson_Episodes_audio_mp3/ER22_{episode_number}.mp3'

    # Load the entire episode audio file
    episode_audio = AudioSegment.from_mp3(source_audio_path)

    # Define the destination directory for audio fragments
    dest_dir = 'client_data/Expiditie_Robinson_Episodes_audio_fragments_mp3'

    # Create the destination directory if it doesn't already exist
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # Loop through each row in the dataframe to extract and save fragments
    for index, row in df.iterrows():
        # Calculate start and end times in milliseconds
        start_time = row['Start Time (seconds)'] * 1000
        end_time = row['End Time (seconds)'] * 1000

        # Extract the fragment
        fragment = episode_audio[start_time:end_time]

        # Define the destination filename
        dest_filename = f"ER22_ep{episode_number}_fra_{index + 1}.mp3"
        dest_path = os.path.join(dest_dir, dest_filename)

        # Save the fragment
        fragment.export(dest_path, format="mp3")

        print(f'Saved: {dest_filename}')

    print('\nAll fragments have been processed and saved.')


In [29]:

def fragment_episode_audio_mp3(episode_number):
    # Assuming episode_dfs is a dictionary containing dataframes for each episode
    df = episode_dfs.get(f'df_ep_{episode_number}')

    if df is None:
        print(f"No data found for episode {episode_number}.")
        return

    # Reset the index of the dataframe
    df = df.reset_index(drop=True)

    # Define the path to the source audio file based on the episode number
    source_audio_path = f'client_data/Expiditie_Robinson_Episodes_audio_mp3/ER22_{episode_number}.mp3'

    # Load the entire episode audio file
    episode_audio = AudioSegment.from_mp3(source_audio_path)

    # Define the destination directory for audio fragments
    dest_dir = 'client_data/Expiditie_Robinson_Episodes_audio_fragments_mp3'

    # Create the destination directory if it doesn't already exist
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # Loop through each row in the dataframe to extract and save fragments
    for index, row in df.iterrows():
        # Calculate start and end times in milliseconds
        start_time = row['Start Time (seconds)'] * 1000
        end_time = row['End Time (seconds)'] * 1000

        # Extract the fragment
        fragment = episode_audio[start_time:end_time]

        # Define the destination filename
        dest_filename = f"ER22_ep{episode_number}_fra_{index + 1}.mp3"
        dest_path = os.path.join(dest_dir, dest_filename)

        # Save the fragment
        fragment.export(dest_path, format="mp3")

        print(f'Saved: {dest_filename}')

    print('\nAll fragments have been processed and saved.')


In [37]:
for ep in range(1, 18):
    fragment_episode_audio(ep)

Saved: ER22_ep10_fra_1.mp3
Saved: ER22_ep10_fra_2.mp3
Saved: ER22_ep10_fra_3.mp3
Saved: ER22_ep10_fra_4.mp3
Saved: ER22_ep10_fra_5.mp3
Saved: ER22_ep10_fra_6.mp3
Saved: ER22_ep10_fra_7.mp3
Saved: ER22_ep10_fra_8.mp3
Saved: ER22_ep10_fra_9.mp3
Saved: ER22_ep10_fra_10.mp3
Saved: ER22_ep10_fra_11.mp3
Saved: ER22_ep10_fra_12.mp3
Saved: ER22_ep10_fra_13.mp3
Saved: ER22_ep10_fra_14.mp3
Saved: ER22_ep10_fra_15.mp3
Saved: ER22_ep10_fra_16.mp3
Saved: ER22_ep10_fra_17.mp3
Saved: ER22_ep10_fra_18.mp3
Saved: ER22_ep10_fra_19.mp3
Saved: ER22_ep10_fra_20.mp3
Saved: ER22_ep10_fra_21.mp3
Saved: ER22_ep10_fra_22.mp3
Saved: ER22_ep10_fra_23.mp3
Saved: ER22_ep10_fra_24.mp3
Saved: ER22_ep10_fra_25.mp3
Saved: ER22_ep10_fra_26.mp3
Saved: ER22_ep10_fra_27.mp3
Saved: ER22_ep10_fra_28.mp3
Saved: ER22_ep10_fra_29.mp3
Saved: ER22_ep10_fra_30.mp3

All fragments have been processed and saved.
Saved: ER22_ep11_fra_1.mp3
Saved: ER22_ep11_fra_2.mp3
Saved: ER22_ep11_fra_3.mp3
Saved: ER22_ep11_fra_4.mp3
Saved: ER22_ep1

## Separating the audio into fragments (wav)

In [None]:
# Function to process and fragment an episode's audio
def fragment_episode_audio(episode_number):
    # Assuming episode_dfs is a dictionary containing dataframes for each episode
    df = episode_dfs[f'df_ep_{episode_number}']

    # Reset the index of the dataframe
    df = df.reset_index(drop=True)

    # Define the path to the source audio file based on the episode number
    source_audio_path = f'client_data/Expiditie_Robinson_Episodes_audio/ER22_{episode_number}.wav'

    # Load the entire episode audio file
    episode_audio = AudioSegment.from_wav(source_audio_path)

    # Define the destination directory for audio fragments
    dest_dir = 'client_data/Expiditie_Robinson_Episodes_audio_fragments'

    # Create the destination directory if it doesn't already exist
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # Loop through each row in the dataframe to extract and save fragments
    for index, row in df.iterrows():
        # Calculate start and end times in milliseconds
        start_time = row['Start Time (seconds)'] * 1000
        end_time = row['End Time (seconds)'] * 1000
        
        # Extract the fragment
        fragment = episode_audio[start_time:end_time]
        
        # Define the destination filename
        dest_filename = f"ER22_ep{episode_number}_fra_{index + 1}.wav"
        dest_path = os.path.join(dest_dir, dest_filename)
        
        # Save the fragment
        fragment.export(dest_path, format="wav")
        
        print(f'Saved: {dest_filename}')

    print('\nAll fragments have been processed and saved.')