# get_whisperx_googleasr_table.ipynb

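This notebook:
* Builds a table comparing WhisperX and Google ASR transcripts for randomly sampled episodes (transcript word counts and counts of the filler words "uh", "um", and "well").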

In [1]:
import pandas as pd
import os
import json
import string
import re

from urlextract import URLExtract

import utils_general

def clean_urls(text):
    """Return ``text`` with every URL found by ``URLExtract`` removed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each detected URL replaced by the empty string.
        Whitespace around a removed URL is left as it was.
    """
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    # Remove longer URLs first: if one detected URL is contained in another
    # (e.g. "example.com" and "example.com/foo"), deleting the shorter one
    # first would leave a stray fragment ("/foo") behind. Ties are broken
    # alphabetically so the removal order is deterministic.
    for url in sorted(set(urls), key=lambda u: (-len(u), u)):
        text = text.replace(url, "")
    return text

# modified from https://github.com/potsawee/podcast_trec2020/blob/main/data/processor.py
# also performs some basic cleaning
def get_transcript_text_from_json_asr_file(json_asr_file, max_seconds=60 * 10):
    """Build a cleaned transcript string from a Google ASR JSON file.

    The words are read from the first alternative of the *last* entry in the
    file's ``"results"`` list. Only words whose ``"endTime"`` is at most
    ``max_seconds`` are kept. The joined text then has URLs removed (via
    ``clean_urls``) and all non-ASCII characters dropped, and a trailing "."
    is appended if the text does not already end in punctuation.

    Args:
        json_asr_file: Path to the JSON transcript file.
        max_seconds: Keep only words ending at or before this many seconds.
            Defaults to 600 (the first 10 minutes).

    Returns:
        The cleaned transcript, or ``""`` if the file has no results or no
        text is left after filtering and cleaning.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the expected keys are missing from the JSON structure.
    """
    # Read and close the file before doing any processing.
    with open(json_asr_file, encoding="utf-8") as f:
        transcript_dict = json.load(f)

    results_list = transcript_dict["results"]
    if not results_list:
        return ""
    last_result = results_list[-1]

    # "endTime" is a string such as "12.300s"; strip the unit before parsing.
    transcript_list = [
        word_dict["word"]
        for word_dict in last_result["alternatives"][0]["words"]
        if float(word_dict["endTime"].replace("s", "")) <= max_seconds
    ]
    transcript_string = " ".join(transcript_list)

    # clean the transcripts the same way as the descriptions
    transcript_string = clean_urls(transcript_string)
    transcript_string = transcript_string.encode("ascii", "ignore").decode()

    # Nothing left (no words within the cutoff, or everything was stripped):
    # return an empty transcript instead of failing on transcript_string[-1].
    if not transcript_string:
        return ""

    if transcript_string[-1] not in string.punctuation:
        transcript_string += "."

    return transcript_string


# Load the episode table and set up the two transcript columns compared below:
# the existing "transcript" column (missing values replaced by "") is renamed
# to "transcript_whisperx", and "transcript_googleasr" starts out empty.
df = (
    pd.read_csv("./csv/df.csv")
    .assign(transcript=lambda frame: frame["transcript"].fillna(""))
    .rename(columns={"transcript": "transcript_whisperx"})
    .assign(transcript_googleasr="")
)

# accumulator for the sampled rows
result = pd.DataFrame()

display(df)

# Settings for the WhisperX vs. Google ASR comparison.
GOOGLE_ASR_TRANSCRIPTS_DIR = "/data2/maria/Spotify-Podcasts/podcasts-no-audio-13GB/spotify-podcasts-2020/podcasts-transcripts"
N_BATCHES = 5      # number of samples drawn; batch i uses random_state=i
BATCH_SIZE = 100   # episodes per sample
FILLER_WORDS = ["uh", "um", "well"]
ASR_SYSTEMS = ["whisperx", "googleasr"]


def get_googleasr_json_path(show_filename_prefix, episode_filename_prefix):
    """Return the path of an episode's Google ASR JSON transcript.

    The file is located under ``GOOGLE_ASR_TRANSCRIPTS_DIR`` in two nested
    directories named after the upper-cased first and second characters of
    the part of ``show_filename_prefix`` that follows the first underscore,
    then the show directory, then ``<episode_filename_prefix>.json``.
    """
    show_id = show_filename_prefix.split("_")[1]
    return os.path.join(
        GOOGLE_ASR_TRANSCRIPTS_DIR,
        show_id[0].upper(),
        show_id[1].upper(),
        show_filename_prefix,
        episode_filename_prefix + ".json",
    )


# Collect the batches in a list and concatenate once at the end, instead of
# growing `result` with pd.concat inside the loop.
sampled_batches = []

for i in range(N_BATCHES):
    # Each batch is sampled from the full table with its own seed, so the
    # batches are drawn independently of each other.
    sampled_df_i = df.sample(n=BATCH_SIZE, random_state=i)

    # retrieve the Google ASR transcription for every sampled episode
    sampled_df_i["transcript_googleasr"] = [
        get_transcript_text_from_json_asr_file(
            get_googleasr_json_path(show_filename_prefix, episode_filename_prefix)
        )
        for show_filename_prefix, episode_filename_prefix in zip(
            sampled_df_i["show_filename_prefix"],
            sampled_df_i["episode_filename_prefix"],
        )
    ]

    # number of words in the Google ASR transcript; "".split(" ") yields [""]
    # (length 1), so the empty string is special-cased to 0
    sampled_df_i["transcript_googleasr_length"] = sampled_df_i["transcript_googleasr"].map(
        lambda text: len(text.split(" ")) if text != "" else 0
    )

    # case-insensitive whole-word counts of each filler word, per ASR system
    for target_str in FILLER_WORDS:
        for asr_str in ASR_SYSTEMS:
            sampled_df_i[f"{target_str}_count_{asr_str}"] = sampled_df_i[f"transcript_{asr_str}"].str.count(r"\b"+re.escape(target_str)+r"\b", flags=re.IGNORECASE)

    sampled_batches.append(sampled_df_i)

result = pd.concat(sampled_batches, ignore_index=True)

display(result)
display(result.describe().round(2))

Unnamed: 0.1,Unnamed: 0,show_uri,show_name,show_description,publisher,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,language,transcript_whisperx,transcript_length,transcript_googleasr
0,0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj,en,"Hello, hello, hello everyone. This is Katie an...",1716,
1,2,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa,en,Welcome to Inside the 18. Today's episode is a...,2017,
2,3,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.189200,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm,en,"Hey Cheese fans! Before we get started, I want...",1518,
3,4,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.782050,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG,en,"Sorry to interrupt the show, but I do have to ...",1707,
4,5,spotify:show:5ljREb8VLogQLT7AKGwav1,UPSC Podcasts,Podcasts useful for UPSC aspirants! Mainly dis...,UPSC Podcast,https://anchor.fm/s/8afceec/podcast/rss,spotify:episode:0025w0gdgkl11Nzkmg1wnm,Tourism in India : Opportunities and Challenges,.,13.788000,show_5ljREb8VLogQLT7AKGwav1,0025w0gdgkl11Nzkmg1wnm,en,This is All India Radio. In the program Spotli...,1755,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82596,105355,spotify:show:416U8ZhubKrFHq8ynOaxfH,The Top 10,"Each week, John Rocha and Matt Knost breakdown...",The Top 10,http://thetop10.podomatic.com/rss2.xml,spotify:episode:7zzQnjBXqDApvnm1hLPzVY,The Top 10 - Re-List - Steve Martin Moves,Thanks to our patreon members for their suppor...,51.025850,show_416U8ZhubKrFHq8ynOaxfH,7zzQnjBXqDApvnm1hLPzVY,en,"Hey guys, this is John Rocha again. And Matt N...",1881,
82597,105356,spotify:show:5rgmBAzsJ5znpV2b4WNDsb,Let's Grab Coffee Podcast,"After connecting with someone, what's the next...",George Khalife,https://anchor.fm/s/9043d60/podcast/rss,spotify:episode:7zzRRsjuymax0YSczpi0SU,Let's Grab Coffee E45 with Ross Paquette | Gro...,Ross founded Maropost in 2011 as a customer-ce...,33.364750,show_5rgmBAzsJ5znpV2b4WNDsb,7zzRRsjuymax0YSczpi0SU,en,What's going on everyone? This is George Khali...,2005,
82598,105357,spotify:show:56CjYLQWyMx1MkOEQmlubi,Coach Corey Wayne,Life & Peak Performance Coach. I Teach Self-Re...,Coach Corey Wayne,https://anchor.fm/s/4dd625c/podcast/rss,spotify:episode:7zzZJGsL8fwDOrduUkX91D,Maybe She Is Just Testing Me?,How to know if your woman is maybe just testin...,11.799950,show_56CjYLQWyMx1MkOEQmlubi,7zzZJGsL8fwDOrduUkX91D,en,"Hi, I'm Coach Cory Wayne and this is my video ...",1850,
82599,105358,spotify:show:7uddSH8MhaK3Q6YFlllbVZ,The Cricket Podcast,The best & funniest independent cricket podcas...,The Cricket Podcast,https://anchor.fm/s/9d3dcf0/podcast/rss,spotify:episode:7zzoT4r0Rhffyegk2HJ9N8,Ep 16: England In Danger,"In Episode 16, the boys evaluate England's per...",69.215350,show_7uddSH8MhaK3Q6YFlllbVZ,7zzoT4r0Rhffyegk2HJ9N8,en,I think it should never be permitted to happen...,1699,


Unnamed: 0.1,Unnamed: 0,show_uri,show_name,show_description,publisher,rss_link,episode_uri,episode_name,episode_description,duration,...,transcript_whisperx,transcript_length,transcript_googleasr,transcript_googleasr_length,uh_count_whisperx,uh_count_googleasr,um_count_whisperx,um_count_googleasr,well_count_whisperx,well_count_googleasr
0,39292,spotify:show:4F8rZfDar6BAPF7Gb5Y8p4,The Dark Swamp: Horror Stories | Swamp Dweller...,Horror Stories to keep you up at night! Suppor...,Swamp Dweller,https://anchor.fm/s/a507954/podcast/rss,spotify:episode:2sezWffKSCO1gJkJbRVok9,The Dark Swamp: Horror Stories (Episode 117) I...,"In this episode, I share 5 creepy as hell stor...",40.963100,...,"Are you looking to make a podcast? Well, if yo...",1614,"Are you looking to make a podcast? Well, if yo...",1617,0,0,0,0,2,2
1,68685,spotify:show:63vGA37mcOYr7a2FISJouG,Kings Pulse: A Sacramento Kings Podcast,An in-depth Sacramento Kings podcast hosted by...,Brenden Nunes & Rich Ivanowski,https://anchor.fm/s/9b77e0c/podcast/rss,spotify:episode:53b7oxSccQxn3944gBUBjq,Buddy Hield extension updates w/ Jason Anderson,Buddy Hield extension talks are heati...,50.362533,...,Welcome back to another episode of the Kings P...,1506,Sacramento there it is. Welcome back to anothe...,1548,5,0,15,0,4,4
2,27441,spotify:show:0rwfoaIaZmEerjNBznckAz,Naylor's Natter in association with TDT,"""Naylor's natter...just talking to teachers"" ...",Phil Naylor,https://anchor.fm/s/85b24b4/podcast/rss,spotify:episode:21OdOBc3J1GBN4E0BefW49,Pupil Premium with Marc Rowland,This week I am in conversation with Pupil Prem...,63.292950,...,"Nailers Natter, just talking to teachers. Talk...",1614,That turd just talking to teachers talking to ...,1617,0,0,0,0,8,9
3,93363,spotify:show:1vvxqI1vuu0caee2onlzea,SUPERWHITE,Most polarizing in all of Southern Utah! 2 dud...,SUPERWHITE,https://anchor.fm/s/7fa81a4/podcast/rss,spotify:episode:6tlnZWUFIL0NdaBlPJBFA9,The VIP List Ft. Seanne Smith,Dallin's better than all of you Follow the po...,88.400683,...,"What's up, guys? We'd like to quickly shout ou...",1726,"What's up, guys? We'd like to quickly shout ou...",1371,0,1,7,0,5,3
4,89073,spotify:show:4053MooFmLA18LOnZvJtsJ,The Guava Girl Podcast by Isabella Silverio,The Guava Girl Podcast was created with the pu...,Isabella Silverio,https://anchor.fm/s/30d34ac/podcast/rss,spotify:episode:6ZfSHvekirPxpDJTwlSSFk,40: 100k CASH MONTH,Welcome to Episode 40 of the Guava Girl Podcas...,37.174417,...,What is up Guava Girls? Welcome back to the Gu...,1814,What is up? Guava Girls? Welcome back to the g...,1849,0,0,9,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,16826,spotify:show:4kDrnsi5UnX8uj2k0OML10,PILF,"Welcome to PILF, with Jess, your friendly neig...",PILF Podcast,https://anchor.fm/s/701b574/podcast/rss,spotify:episode:1EzthBju3gf0aAHJZVfihu,Ep. 13: Dating multiple cubs and managing feel...,Today I answer a question I get a lot - how I ...,22.439767,...,"Hey everyone, do you want to make a podcast of...",1726,Hey everyone. Do you want to make a podcast of...,1731,8,1,14,0,3,3
496,19666,spotify:show:1edI0qnZJwEDAY49gc9uBV,The Wanger Show,"In 2016, Christian Ruvalcaba, Cody Hall, and B...",The Wanger Show,https://thewangershow.podomatic.com/rss2.xml,spotify:episode:1S0HgykewGcjjHY2uwABdY,The Wanger Show #126 - Frank & Dorina Join Us ...,On this episode of The Wanger Show WE CELEBRAT...,82.664583,...,"Hey everybody, Christian here to tell you abou...",1639,Hey everybody Christian here to tell you about...,1601,0,0,2,0,4,3
497,33778,spotify:show:7KcSeWjWi8WTi2m6MEgbiv,Juice WRLD is Dead,Issa podcast Support this podcast: https://anc...,Ben Meyerson,https://anchor.fm/s/12c09344/podcast/rss,spotify:episode:2Th95WRFLcKWqBmnKaAIT4,Wagon wheels,"We hate Brody, he sucks. --- This episode i...",20.582950,...,"Hey guys, I'm Jake. And I'm Ben. And... Anchor...",1485,"Hey guys, I'm Jake and I'm Ben and anchor anch...",1438,4,0,6,0,6,3
498,25641,spotify:show:6bRby2ak8EziSLldK75IB5,JAZZIZ Backstage Pass,JAZZIZ Backstage Pass offers you unparalleled ...,Brian Zimmerman,https://anchor.fm/s/931b7f4/podcast/rss,spotify:episode:1t8YylHIXjxlWE2qOiLs3U,JAZZIZ Backstage Pass: Dave Koz Salutes the Gr...,Joining host Brian Zimmerman on the podcast fo...,19.418117,...,Jazz's backstage pass from the horn section. L...,1642,Jazz is backstage pass from the horn section. ...,1711,0,0,0,0,3,3


Unnamed: 0.1,Unnamed: 0,duration,transcript_length,transcript_googleasr_length,uh_count_whisperx,uh_count_googleasr,um_count_whisperx,um_count_googleasr,well_count_whisperx,well_count_googleasr
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,54319.5,40.2,1669.85,1676.56,1.25,0.1,1.65,0.2,3.48,3.51
std,30601.22,19.82,278.02,301.92,2.62,0.31,3.03,0.56,2.76,2.76
min,86.0,10.09,148.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27370.75,23.1,1524.0,1536.25,0.0,0.0,0.0,0.0,1.0,1.0
50%,54241.5,38.21,1682.0,1706.0,0.0,0.0,0.0,0.0,3.0,3.0
75%,82059.0,54.92,1848.0,1867.5,1.0,0.0,2.0,0.0,5.0,5.0
max,105307.0,89.72,2251.0,2308.0,18.0,2.0,24.0,4.0,17.0,17.0
