# Large_Scale_Transcripts.ipynb
This notebook:
* Combines the WhisperX and Google ASR transcripts into a single CSV file, then splits that file into smaller CSV parts.

In [1]:
import utils_general

import os
import difflib
import pandas as pd
from tqdm import tqdm
import json
import string
from urlextract import URLExtract
import numpy as np

In [2]:
# Google ASR functions

def clean_urls(text):
    """Return ``text`` with every URL found by ``URLExtract`` removed.

    Each detected URL string is replaced by the empty string wherever it
    occurs; the surrounding whitespace is left untouched.
    """
    detected = URLExtract().find_urls(text)
    cleaned = text
    for link in detected:
        cleaned = cleaned.replace(link, "")
    return cleaned

# modified from https://github.com/potsawee/podcast_trec2020/blob/main/data/processor.py
# also performs some basic cleaning
def get_transcript_text_from_json_asr_file(json_asr_file):
    """Build a cleaned, truncated transcript string from a Google ASR JSON file.

    Only the last entry of the top-level ``"results"`` list is read. The
    words of its first alternative whose ``endTime`` is at most 10 minutes
    are joined with single spaces. URLs and non-ASCII characters are then
    removed, and a trailing "." is appended when the text is non-empty and
    does not already end in an ASCII punctuation character.

    Args:
        json_asr_file: Path to the JSON transcript file.

    Returns:
        The cleaned transcript text (possibly an empty string).

    Raises:
        IndexError: If the ``"results"`` list is empty.
        KeyError: If an expected key is missing from the JSON structure.
    """
    max_end_seconds = 60 * 10  # truncates the podcasts to 10 minutes

    # JSON is UTF-8 by specification, so do not rely on the platform's
    # locale encoding when decoding the file.
    with open(json_asr_file, encoding="utf-8") as f:
        transcript_dict = json.load(f)

    # NOTE(review): presumably the last result carries the word timings for
    # the whole episode -- confirm against the dataset format.
    last_result = transcript_dict["results"][-1]

    # "endTime" is a string such as "12.300s"; drop the unit before parsing.
    transcript_list = [
        word_dict["word"]
        for word_dict in last_result["alternatives"][0]["words"]
        if float(word_dict["endTime"].replace("s", "")) <= max_end_seconds
    ]
    transcript_string = " ".join(transcript_list)

    # clean the transcripts the same way as the descriptions
    transcript_string = clean_urls(transcript_string)
    transcript_string = transcript_string.encode("ascii", "ignore").decode()

    if transcript_string != "" and transcript_string[-1] not in string.punctuation:
        transcript_string += "."

    return transcript_string

# WhisperX functions
def get_ascii_text(dictionary):
    """Concatenate the WhisperX segment texts and keep only ASCII characters.

    The ``"text"`` of every entry in ``dictionary["segments"]`` is joined
    without a separator, surrounding whitespace is stripped from the joined
    string, and any non-ASCII character is then dropped.
    """
    joined = "".join(segment["text"] for segment in dictionary["segments"])

    # strip first, then drop non-ASCII characters (same order as before)
    return joined.strip().encode("ascii", "ignore").decode()

# Load the per-episode table; its existing "transcript" column is renamed to
# "WhisperX" so it can sit next to the Google ASR text.
df = pd.read_csv("./csv/df.csv").rename(columns={"transcript": "WhisperX"})

# Fill a "GoogleASR" column, one episode (row) at a time.
pbar = tqdm(total=len(df))
for index, row in df.iterrows():
    show_filename_prefix = row["show_filename_prefix"]
    episode_filename_prefix = row["episode_filename_prefix"]

    # The transcript tree is sharded by the first two characters (upper-cased)
    # of the second "_"-separated field of the show prefix.
    show_key = show_filename_prefix.split("_")[1]
    dir1 = show_key[0].upper()
    dir2 = show_key[1].upper()
    json_asr_file = os.path.join(
        "/data2/maria/Spotify-Podcasts/podcasts-no-audio-13GB/spotify-podcasts-2020/podcasts-transcripts",
        dir1,
        dir2,
        show_filename_prefix,
        episode_filename_prefix + ".json",
    )

    # read in the GoogleASR transcription for this episode
    df.loc[index, "GoogleASR"] = get_transcript_text_from_json_asr_file(json_asr_file)

    # update progress bar
    pbar.update(1)

display(df)

# NOTE(review): the DataFrame index is written as an extra unnamed column.
df.to_csv("./csv/large_scale_texts.csv", header=True)

def split_dataframe(big_df, n_parts):
    """Split a DataFrame into ``n_parts`` consecutive row chunks.

    Row positions are partitioned with ``np.array_split``, so the chunks
    need not be of equal size. Rows are selected by position rather than by
    index label, which keeps every row exactly once even when the index
    contains duplicate labels (label-based selection would return all rows
    sharing a label for each occurrence of that label).

    Args:
        big_df: The DataFrame to split.
        n_parts: Number of chunks to produce.

    Returns:
        A list of ``n_parts`` DataFrames in the original row order, each
        keeping the original index labels of its rows.
    """
    row_positions = np.arange(len(big_df))
    return [
        big_df.iloc[chunk_positions]
        for chunk_positions in np.array_split(row_positions, n_parts)
    ]


# Reload the combined table and write it back out as n_parts smaller CSV files.
big_df = pd.read_csv("./csv/large_scale_texts.csv")

n_parts = 2
split_dfs = split_dataframe(big_df, n_parts)

lens = []
# The loop variable is `part_df` rather than `df` so that it does not
# overwrite the DataFrame named `df` defined earlier in the notebook.
for index, part_df in enumerate(split_dfs):
    lens.append(len(part_df))
    csv_path = f"./csv/large_scale_texts-{index}.csv"
    # remove any stale output from a previous run before writing
    utils_general.delete_file_if_already_exists(csv_path)
    part_df.to_csv(csv_path, header=True)

# Sanity check: the parts together contain exactly as many rows as the input.
assert sum(lens) == len(big_df)