# Large_Scale_Filtering.ipynb
This notebook performs:
* Basic filtering on the ~100k podcasts.
* Adds basic features to the df. 
* Writes in the ascii WhisperX transcripts into the main df. 

In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm
import numpy as np

import utils_general

def get_ascii_text(dictionary):
    text = ""
    
    # read text from json object
    for t in dictionary["segments"]:
        text += t["text"]
    text = text.strip()
    
    # only allow ascii characters
    text = text.encode("ascii", "ignore").decode()
        
    return text

# create df based off of metadata df
df = pd.read_csv(utils_general.PATH_TO_TRAIN_DF, sep="\t")

# filter df to only include podcasts at least 10 minutes long
df = df[df["duration"] >= 10.0]

# recreate language column (setup for languages based on whisperx, not on existing language id)
df = df.drop("language", axis=1)
df["language"] = ""

# create column for whisperxtranscripts
df["transcript"] = ""

# iterate through json files to add to df
pbar = tqdm(total=105360)
for root, dirs, files in os.walk("/data1/maria/Spotify-Podcasts/train-10min-whisperx-dir"):
    if files:
        for file in files:
            if file == "transcript.json":
                
                full_filepath = os.path.join(root,"transcript.json")
                episode_id = os.path.split(root)[-1]
                with open(full_filepath) as f:
                    dictionary = json.loads(f.read())

                df.loc[df["episode_filename_prefix"] == episode_id, "language"] = dictionary["language"]
                df.loc[df["episode_filename_prefix"] == episode_id, "transcript"] = get_ascii_text(dictionary)
                
                pbar.update(1)

# only allow english transcripts
df = df[df["language"] == "en"]

# calculate number of words in transcripts
df["transcript_length"] = -1
df["transcript"] = df["transcript"].fillna("")
for index, row in df.iterrows():
    # handle special case of empty string
    if row["transcript"] != "":
        num_words = len(row["transcript"].split(" "))
    else:
        num_words = 0
    # write in the number of words
    df.loc[index, "transcript_length"] = num_words

# save dfs with zero words for examination
zero_words_df = df[df["transcript_length"] == 0]
zero_words_df.to_csv("./csv/zero_words_df.csv", header=True)

# filter out transcripts with zero words
df = df[df["transcript_length"] != 0]

# save dfs with less than 10 words for examination
less_than_10_words_df = df[df["transcript_length"] < 10]
less_than_10_words_df.to_csv("./csv/less_than_10_words_df.csv", header=True)

# filter out transcripts with less than 10 words
df = df[df["transcript_length"] > 10]

# save df
df.to_csv("./csv/df.csv", header=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105360/105360 [13:30<00:00, 128.00it/s]