In [1]:
import os
import pandas as pd
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_audio, ffmpeg_extract_subclip
from moviepy.config import get_setting
from moviepy.tools import subprocess_call
import glob
import shutil
import whisper
import tqdm
import csv

# Preprocessing the Expeditie Robinson data

In [None]:
# Load the annotated episode structure; every missing cell is filled with "neutral"
structure = pd.read_csv("data_robinson/Robinson22_structure.csv").fillna("neutral")
# Keep only the columns needed downstream and drop repeated rows
wanted_columns = ['Episode name', 'Start Time (seconds)', 'End Time (seconds)', 'Emotions']
structure_subset = structure.loc[:, wanted_columns].drop_duplicates()
structure_subset

In [4]:
# Group the fragment rows per episode so that each episode can be handled on its own
grouped_episodes = structure_subset.groupby(by='Episode name')

In [None]:
# Define a function to correct the end times within each episode
def correct_end_times(df):
    """Make each row end where the next row (by start time) begins.

    The rows are ordered by 'Start Time (seconds)'. Every row's
    'End Time (seconds)' is then replaced by the start time of the row that
    follows it. The last row has no successor and keeps its own end time.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with 'Start Time (seconds)' and 'End Time (seconds)' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by start time, with the corrected end times.
        The frame passed in is left unchanged.
    """
    if df.empty:
        # Nothing to correct; .iloc[-1] below would raise IndexError on an empty frame.
        return df.copy()

    # sort_values returns a new frame, so the assignment below never touches the input
    ordered = df.sort_values(by='Start Time (seconds)')
    last_end = ordered['End Time (seconds)'].iloc[-1]
    # shift(-1) aligns each row with the start time of its successor (NaN for the last row)
    next_start = ordered['Start Time (seconds)'].shift(-1)
    ordered['End Time (seconds)'] = next_start.fillna(last_end)
    return ordered

# Correct every episode separately and stack the results into one flat frame.
# The groups are iterated directly instead of going through
# grouped_episodes.apply(...): DataFrameGroupBy.apply operating on the grouping
# column is deprecated since pandas 2.2, and once that column is excluded the
# 'Episode name' values would be lost when the index is dropped.
corrected_groups = [correct_end_times(episode_rows) for _, episode_rows in grouped_episodes]
# ignore_index=True yields a fresh 0..n-1 index, i.e. a flat DataFrame
corrected_structure_subset = pd.concat(corrected_groups, ignore_index=True)

# Rename the columns to the short names used by the later cells
corrected_structure_subset = corrected_structure_subset.rename(columns={
    'Episode name': 'Episode',
    'Start Time (seconds)': 'Start_time',
    'End Time (seconds)': 'End_time',
})
# Show the corrected DataFrame
corrected_structure_subset

In [6]:
# Persist the corrected fragment table (without the index column) for the later cells
filtered_csv_path = 'data_robinson/Robinson_filtered.csv'
corrected_structure_subset.to_csv(filtered_csv_path, index=False)
print("DataFrame has been saved as Robinson_filtered.csv")

DataFrame has been saved as Robinson_filtered.csv


# Splitting the episodes into fragments

In [12]:
# Cut every annotated fragment out of its full episode and store the audio as
# episodes/episodes_cut/ep_<episode>_<start>_<end>.mp3.
temp_dir = "temp_clips"
output_dir = "episodes/episodes_cut"
# exist_ok=True lets this cell be re-run even if an interrupted run left temp_clips behind,
# and makes sure the output folder exists before ffmpeg writes into it.
os.makedirs(temp_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

try:
    for file_path in glob.glob("episodes/full_episodes/*"):
        # NOTE(review): assumes characters 8-9 of the file name hold the
        # two-digit episode number — confirm against the actual file names.
        episode_number = int(os.path.basename(file_path)[8:10])
        temp_df = corrected_structure_subset[corrected_structure_subset['Episode'] == episode_number]
        temp_df = temp_df[['Start_time', 'End_time']]

        for start, end in zip(temp_df['Start_time'], temp_df['End_time']):
            clip_name = f"ep_{episode_number}_{start}_{end}"
            clip_path = f"{temp_dir}/{clip_name}.mov"
            final_path = f"{output_dir}/{clip_name}.mp3"

            # First cut the video fragment, then extract its audio track
            ffmpeg_extract_subclip(file_path, start, end, clip_path)
            ffmpeg_extract_audio(clip_path, final_path)
finally:
    # Always remove the intermediate video clips, even when a step above raised
    shutil.rmtree(temp_dir, ignore_errors=True)

Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join

# Speech to text

In [None]:
# Reload the fragment table written earlier and preview its first 100 rows
filtered_csv = "data_robinson/Robinson_filtered.csv"
data_filtered = pd.read_csv(filtered_csv)
data_filtered.head(n=100)

In [37]:
# Load the Whisper model once: it does not depend on the clip, so loading it
# inside the loop only repeats the same expensive work for every row.
model = whisper.load_model("base")

# One transcript string per row of data_filtered, in row order
transcript = []

for row_index in range(len(data_filtered)):
    # The first three columns are read by position: episode, start time, end time
    episode_number = data_filtered.iloc[row_index, 0]
    start = data_filtered.iloc[row_index, 1]
    end = data_filtered.iloc[row_index, 2]

    clip_path = f"episodes/episodes_cut/ep_{episode_number}_{start}_{end}.mp3"

    # Transcribe the audio fragment.
    # NOTE(review): the language is forced to "en" — confirm this matches the
    # language actually spoken in the episodes.
    result = model.transcribe(clip_path, fp16=False, language="en")
    transcript.append(result["text"])

In [38]:
# Save transcript to CSV
with open('./data_robinson/transcript.csv', 'w', newline='') as csvfile:
    fieldnames = ['Episode Number', 'Start', 'End', 'Transcript']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for i in range(len(data_filtered)):
        writer.writerow({'Episode Number': data_filtered.iloc[i, 0],
                         'Start': data_filtered.iloc[i, 1],
                         'End': data_filtered.iloc[i, 2],
                         'Transcript': transcript[i]})