# Small_Scale_Transcripts_And_Analysis.ipynb
This notebook:
* Maps the podcast ids to their annotation labels.
* Obtains the diff metric across the annotators for the whole set, the non-scripted set, and the scripted set of podcasts.
* Reads the WhisperX, Google ASR, GroundTruth-min, GroundTruth-max, and GroundTruth-neutral transcripts into a df.
* Saves as csv.

In [1]:
import utils_general

import os
import difflib
import pandas as pd
import json
import string
import re

from urlextract import URLExtract

# useful global variables

# Podcast number (as a string, "1".."10") -> episode id.
# The keys are strings because the lookup is done on the string-typed `nums`
# column; the ids are later matched against `episode_filename_prefix` in the
# metadata table.
nums_to_ids = {
    "1" : "1ll5WGWjWANfGKHZfrpL5A",
    "2" : "1wHRMsWVurmo56xd1AcSfe",
    "3" : "3i4qZ6FqzXdj8UiMeuaX3q",
    "4" : "4YOq90opfiBsc3MwflzFbJ",
    "5" : "5a0dxl6w7wh5SGAbNpxhu4",
    "6" : "5TpkTe5mlb5xx1UT8OvZ6l",
    "7" : "6jyaOoQ6aF3TB8z93Honak",
    "8" : "6MaijKHa5MU0MsdS0yVZHE",
    "9" : "6URm69QBcvJpqoGiTE1nyy",
    "10" : "7n6zBPeHcH1t3MUvBNIvDa"
}

# Podcast number (as a string) -> annotation label, either "scripted" or
# "not-scripted". Used as the grouping key for the percent-diff summary.
nums_to_labels = {
    "1" : "not-scripted",
    "2" : "scripted",
    "3" : "not-scripted",
    "4" : "not-scripted",
    "5" : "scripted",
    "6" : "scripted",
    "7" : "not-scripted",
    "8" : "not-scripted",
    "9" : "not-scripted",
    "10" : "scripted"
}

# Sub-directory names under <PATH_TO_PROJECT>/annotations, one per transcript source.
# NOTE(review): the diff cell below only reads indices 0-2, so "Soohwan" is
# never compared there - confirm that this is intended.
annotators = ["Anwesha", "Eric", "GroundTruth", "Soohwan"]

# build the base frame: one row per podcast, indexed 1..10 so that the index
# doubles as the podcast number used in the annotation file names
podcast_numbers = range(1, 10 + 1)
df = pd.DataFrame({"nums": podcast_numbers}, index=podcast_numbers).astype({"nums": str})

# look up the episode id and the scripted / not-scripted label from the string number
for column, lookup in (("ids", nums_to_ids), ("labels", nums_to_labels)):
    df[column] = df["nums"].map(lookup)

display(df)

Unnamed: 0,nums,ids,labels
1,1,1ll5WGWjWANfGKHZfrpL5A,not-scripted
2,2,1wHRMsWVurmo56xd1AcSfe,scripted
3,3,3i4qZ6FqzXdj8UiMeuaX3q,not-scripted
4,4,4YOq90opfiBsc3MwflzFbJ,not-scripted
5,5,5a0dxl6w7wh5SGAbNpxhu4,scripted
6,6,5TpkTe5mlb5xx1UT8OvZ6l,scripted
7,7,6jyaOoQ6aF3TB8z93Honak,not-scripted
8,8,6MaijKHa5MU0MsdS0yVZHE,not-scripted
9,9,6URm69QBcvJpqoGiTE1nyy,not-scripted
10,10,7n6zBPeHcH1t3MUvBNIvDa,scripted


In [2]:
def get_diff_counts(compare_diff):
    """Count removed and added entries in the output of ``difflib.Differ.compare``.

    A diff line is only considered when it splits into exactly two
    whitespace-separated tokens (a marker and one word). A "-" marker counts
    as a removal and a "+" marker as an addition; every other line is skipped.
    "?" hint lines are ignored on purpose, see
    https://stackoverflow.com/questions/28452835/python-difflib-reporting-unwanted-difference

    Args:
        compare_diff: iterable of diff lines (strings).

    Returns:
        Tuple ``(minus_count, plus_count, total_count)`` where
        ``total_count == minus_count + plus_count``.
    """
    # collect the marker of every two-token line in a single pass, so that
    # one-shot iterables (e.g. generators) are handled as well
    markers = [
        pieces[0]
        for pieces in (line.split() for line in compare_diff)
        if len(pieces) == 2
    ]

    minus_count = markers.count("-")
    plus_count = markers.count("+")

    return minus_count, plus_count, minus_count + plus_count

def find_string_differences(str1, str2, str3):
    """Diff three sequences pairwise with ``difflib.Differ``.

    Args:
        str1, str2, str3: sequences of strings to compare (the notebook passes
            lists of words).

    Returns:
        Tuple of three lists of diff lines, in the order
        ``(str1 vs str2, str2 vs str3, str1 vs str3)``.
    """
    differ = difflib.Differ()
    pairs = ((str1, str2), (str2, str3), (str1, str3))
    return tuple(list(differ.compare(left, right)) for left, right in pairs)

# for the 10 files: diff the first three transcript sources pairwise and record the counts
annotation_dir = os.path.join(utils_general.PATH_TO_PROJECT, "annotations")

# (removed, added, total) column names, in the order returned by find_string_differences
pair_columns = (
    ("left1_2", "right1_2", "total1_2"),
    ("left2_3", "right2_3", "total2_3"),
    ("left1_3", "right1_3", "total1_3"),
)
length_columns = ("len1", "len2", "len3")

for i in range(1, 11):
    file_name = str(i) + ".txt"

    raw_texts = [
        utils_general.read_file(os.path.join(annotation_dir, annotator, file_name))
        for annotator in annotators[:3]
    ]
    # only the first annotator's text has its curly apostrophes normalised
    # NOTE(review): confirm the other two sources never contain "’"
    raw_texts[0] = raw_texts[0].replace("’", "'")
    word_lists = [text.split() for text in raw_texts]

    pair_counts = [get_diff_counts(diff) for diff in find_string_differences(*word_lists)]

    # total_diff_avg is written first so the column order stays the same
    df.loc[i, "total_diff_avg"] = round(float(sum(total for _, _, total in pair_counts)) / 3.0, 2)

    for (left_col, right_col, total_col), (minus, plus, total) in zip(pair_columns, pair_counts):
        df.loc[i, left_col] = minus
        df.loc[i, right_col] = plus
        df.loc[i, total_col] = total

    word_counts = [len(words) for words in word_lists]
    for len_col, word_count in zip(length_columns, word_counts):
        df.loc[i, len_col] = word_count
    df.loc[i, "len_avg"] = round(float(sum(word_counts)) / 3.0, 2)

# average number of differing words as a percentage of the average transcript length
df["percent_diff"] = (df["total_diff_avg"] / df["len_avg"] * 100).round(2)

# display percent diff results
summary_stats = ["mean", "min", "max", "count", "std", "median"]
display(df)
display(df.groupby("labels").agg({"percent_diff": summary_stats}).reset_index())

Unnamed: 0,nums,ids,labels,total_diff_avg,left1_2,right1_2,total1_2,left2_3,right2_3,total2_3,left1_3,right1_3,total1_3,len1,len2,len3,len_avg,percent_diff
1,1,1ll5WGWjWANfGKHZfrpL5A,not-scripted,38.0,27.0,29.0,56.0,4.0,2.0,6.0,26.0,26.0,52.0,267.0,269.0,267.0,267.67,14.2
2,2,1wHRMsWVurmo56xd1AcSfe,scripted,3.33,2.0,3.0,5.0,2.0,1.0,3.0,1.0,1.0,2.0,269.0,270.0,269.0,269.33,1.24
3,3,3i4qZ6FqzXdj8UiMeuaX3q,not-scripted,51.33,27.0,40.0,67.0,17.0,18.0,35.0,19.0,33.0,52.0,363.0,376.0,377.0,372.0,13.8
4,4,4YOq90opfiBsc3MwflzFbJ,not-scripted,26.67,19.0,17.0,36.0,7.0,10.0,17.0,13.0,14.0,27.0,295.0,293.0,296.0,294.67,9.05
5,5,5a0dxl6w7wh5SGAbNpxhu4,scripted,24.0,16.0,19.0,35.0,15.0,11.0,26.0,6.0,5.0,11.0,270.0,273.0,269.0,270.67,8.87
6,6,5TpkTe5mlb5xx1UT8OvZ6l,scripted,16.67,11.0,13.0,24.0,8.0,6.0,14.0,6.0,6.0,12.0,306.0,308.0,306.0,306.67,5.44
7,7,6jyaOoQ6aF3TB8z93Honak,not-scripted,39.33,33.0,25.0,58.0,9.0,14.0,23.0,20.0,17.0,37.0,321.0,313.0,318.0,317.33,12.39
8,8,6MaijKHa5MU0MsdS0yVZHE,not-scripted,26.67,19.0,21.0,40.0,8.0,8.0,16.0,11.0,13.0,24.0,256.0,258.0,258.0,257.33,10.36
9,9,6URm69QBcvJpqoGiTE1nyy,not-scripted,76.67,35.0,71.0,106.0,35.0,13.0,48.0,31.0,45.0,76.0,282.0,318.0,296.0,298.67,25.67
10,10,7n6zBPeHcH1t3MUvBNIvDa,scripted,15.33,11.0,12.0,23.0,6.0,6.0,12.0,5.0,6.0,11.0,288.0,289.0,289.0,288.67,5.31


Unnamed: 0_level_0,labels,percent_diff,percent_diff,percent_diff,percent_diff,percent_diff,percent_diff
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,count,std,median
0,not-scripted,14.245,9.05,25.67,6,5.937221,13.095
1,scripted,5.215,1.24,8.87,4,3.12086,5.375


In [3]:
# Google ASR functions
def clean_urls(text):
    """Return ``text`` with every URL reported by ``URLExtract`` removed.

    URLs are located once on the incoming text; each one is then deleted
    (replaced with the empty string) wherever it occurs.
    """
    cleaned = text
    for found_url in URLExtract().find_urls(text):
        cleaned = cleaned.replace(found_url, "")
    return cleaned

# modified from https://github.com/potsawee/podcast_trec2020/blob/main/data/processor.py
# also performs some basic cleaning
def get_transcript_text_from_json_asr_file(json_asr_file, max_seconds=60 * 2):
    """Build a cleaned, truncated transcript string from an ASR JSON file.

    Only the last entry of the top-level ``"results"`` list is read; the words
    of its first alternative whose ``endTime`` is at most ``max_seconds`` are
    joined with single spaces. The text is then cleaned the same way as the
    descriptions: URLs are removed, non-ASCII characters are dropped, and a
    trailing "." is appended when the text does not already end in punctuation.

    Args:
        json_asr_file: path to the JSON transcript file.
        max_seconds: keep only words that end at or before this many seconds
            (defaults to 2 minutes).

    Returns:
        The cleaned transcript string; "" when no word survives the cutoff.

    Raises:
        IndexError: if the ``"results"`` list (or its ``"alternatives"``) is empty.
        KeyError: if an expected key is missing from the JSON structure.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding
    with open(json_asr_file, encoding="utf-8") as f:
        transcript_dict = json.load(f)

    last_result = transcript_dict["results"][-1]

    # endTime values look like "12.300s": strip the unit suffix before comparing
    transcript_list = [
        word_dict["word"]
        for word_dict in last_result["alternatives"][0]["words"]
        if float(word_dict["endTime"].rstrip("s")) <= max_seconds
    ]
    transcript_string = " ".join(transcript_list)

    # clean the transcripts the same way as the descriptions
    transcript_string = clean_urls(transcript_string)
    transcript_string = transcript_string.encode("ascii", "ignore").decode()

    if transcript_string and transcript_string[-1] not in string.punctuation:
        transcript_string += "."

    return transcript_string

In [4]:
# WhisperX functions
def get_ascii_text(dictionary):
    """Concatenate the WhisperX segment texts into one ASCII-only string.

    The ``"text"`` of every entry in ``dictionary["segments"]`` is joined with
    no separator, surrounding whitespace is stripped, and any non-ASCII
    character is then dropped.
    """
    joined = "".join(segment["text"] for segment in dictionary["segments"])

    # strip first, then drop non-ASCII characters (same order as before)
    ascii_bytes = joined.strip().encode("ascii", "ignore")
    return ascii_bytes.decode()

In [5]:
# keep only the identifying columns and the agreement score
df = df[["nums","ids","labels","percent_diff"]]

# attach the episode metadata; the merge yields a fresh 0-based index,
# so shift it back to 1-based to line up with the annotation file numbers
metadata_df = pd.read_csv(utils_general.PATH_TO_2020_TEST_DF, sep="\t")
df = df.merge(metadata_df, left_on="ids", right_on="episode_filename_prefix", how="left")
df.index = df.index + 1

annotation_root = os.path.join(utils_general.PATH_TO_PROJECT, "annotations")
ground_truth_variants = ("GroundTruth", "GroundTruth-max", "GroundTruth-neutral", "GroundTruth-min")
google_asr_root = "/data2/maria/Spotify-Podcasts/podcasts-no-audio-13GB/TREC/spotify-podcasts-2020/podcasts-transcripts-summarization-testset"

# for the 10 files, read in the GroundTruth (plain and punctuated), GoogleASR, and WhisperX transcripts
for index, row in df.iterrows():
    annotation_file = str(index) + ".txt"

    # GroundTruth plus its three punctuated variants: the column name equals the directory name
    for variant in ground_truth_variants:
        df.loc[index, variant] = utils_general.read_file(os.path.join(annotation_root, variant, annotation_file))

    # GoogleASR: the files are sharded by the first two characters (upper-cased)
    # of the part of the show prefix that follows the first underscore
    show_filename_prefix = row["show_filename_prefix"]
    episode_filename_prefix = row["episode_filename_prefix"]
    show_code = show_filename_prefix.split("_")[1]
    json_asr_file = os.path.join(
        google_asr_root,
        show_code[0].upper(),
        show_code[1].upper(),
        show_filename_prefix,
        episode_filename_prefix+".json",
    )
    df.loc[index, "GoogleASR"] = get_transcript_text_from_json_asr_file(json_asr_file)

    # WhisperX: one directory per episode id
    episode_id = row["ids"]
    with open(f"/data1/maria/Spotify-Podcasts/test-2min-whisperx-dir/{episode_id}/transcript.json") as f:
        dictionary = json.load(f)
    df.loc[index, "WhisperX"] = get_ascii_text(dictionary)


In [6]:
def remove_punctuation_and_lower(text):
    """Lower-case ``text`` and delete all ASCII punctuation except the apostrophe.

    The apostrophe is kept so that contractions such as "you're" stay intact.

    Args:
        text: the string to normalise.

    Returns:
        The lower-cased string with every character of ``string.punctuation``
        other than ``'`` removed.
    """
    # Derive the character set from string.punctuation instead of a hand-typed
    # literal: the previous literal contained the invalid escape "\]", which
    # only worked by accident and triggers a SyntaxWarning on recent Pythons.
    strip_table = str.maketrans("", "", string.punctuation.replace("'", ""))

    return text.lower().translate(strip_table)

# add a lower-cased, punctuation-free ("rpl") copy of each transcript column
rpl_sources = {
    "rpl_GroundTruth": "GroundTruth",
    "rpl_GoogleASR": "GoogleASR",
    "rpl_WhisperX": "WhisperX",
}
for rpl_column, source_column in rpl_sources.items():
    df[rpl_column] = df[source_column].apply(remove_punctuation_and_lower)

In [8]:
# save the metadata df with the transcripts
output_csv = "./csv/small_scale_texts.csv"
# to_csv does not create missing parent directories; make sure ./csv exists so
# the notebook does not fail at the very last step on a fresh checkout
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv, header=True)
display(df)

Unnamed: 0,nums,ids,labels,percent_diff,show_uri,show_name,show_description,publisher,language,rss_link,...,episode_filename_prefix,GroundTruth,GroundTruth-max,GroundTruth-neutral,GroundTruth-min,GoogleASR,WhisperX,rpl_GroundTruth,rpl_GoogleASR,rpl_WhisperX
1,1,1ll5WGWjWANfGKHZfrpL5A,not-scripted,14.2,spotify:show:5HLsz7WFjW8hzJurMDdozi,Granger Smith Podcast,"American, Texan, father, husband, musician, Gr...",Granger Smith,['en'],https://anchor.fm/s/11170a28/podcast/rss,...,1ll5WGWjWANfGKHZfrpL5A,if you're ever in deep deep grief and you wond...,"If you're ever in deep, deep grief, and you wo...","If you're ever in deep, deep grief and you won...","If you're ever in deep, deep grief and you won...",If you're ever in deep deep grief and you wond...,"If you're ever in deep, deep grief and you won...",if you're ever in deep deep grief and you wond...,if you're ever in deep deep grief and you wond...,if you're ever in deep deep grief and you wond...
2,2,1wHRMsWVurmo56xd1AcSfe,scripted,1.24,spotify:show:1g056e2x0Y9AwW6CQF3qA5,Mythology,Myths endure for a reason. This episodic audio...,Parcast Network,['en'],https://feeds.megaphone.fm/mythology,...,1wHRMsWVurmo56xd1AcSfe,something to note all myths have many versions...,Something to note: all myths have many version...,Something to note: all myths have many version...,Something to note: all myths have many version...,Something to note all myths have many versions...,"Something to note, all myths have many version...",something to note all myths have many versions...,something to note all myths have many versions...,something to note all myths have many versions...
3,3,3i4qZ6FqzXdj8UiMeuaX3q,not-scripted,13.8,spotify:show:2z1LtWVnflRUONFAo0FADb,Gamecock Central Podcast Network,The Gamecock Central Podcast Network brings yo...,Gamecock Central Podcasts,['en'],https://www.spreaker.com/show/957147/episodes/...,...,3i4qZ6FqzXdj8UiMeuaX3q,hello and welcome back into another episode of...,"Hello, and welcome back into another episode o...",Hello and welcome back into another episode of...,Hello and welcome back into another episode of...,Hello and welcome back in to another episode o...,Hello and welcome back into another episode of...,hello and welcome back into another episode of...,hello and welcome back in to another episode o...,hello and welcome back into another episode of...
4,4,4YOq90opfiBsc3MwflzFbJ,not-scripted,9.05,spotify:show:4O6pIMQHq2GkVhwQ0KNdnH,The Frog and The Dragon,A podcast devoted to everything Magic: the Gat...,The Frog & The Dragon,['en'],https://anchor.fm/s/1338c918/podcast/rss,...,4YOq90opfiBsc3MwflzFbJ,hello everyone and welcome to the frog and the...,"Hello, everyone, and welcome to ""The Frog and ...","Hello everyone, and welcome to The Frog and th...","Hello everyone, and welcome to The Frog and Th...",Hello everyone and welcome to frog and the dra...,Hello everyone and welcome to the Frog and the...,hello everyone and welcome to the frog and the...,hello everyone and welcome to frog and the dra...,hello everyone and welcome to the frog and the...
5,5,5a0dxl6w7wh5SGAbNpxhu4,scripted,8.87,spotify:show:21ASCcEXgUlbFSmoqjroZm,Chompers,Make tooth time easy with this morning and nig...,Gimlet,['en'],https://feeds.megaphone.fm/chompers,...,5a0dxl6w7wh5SGAbNpxhu4,good morning we hope you're ready to have a su...,Good morning! We hope you're ready to have a s...,Good morning! We hope you're ready to have a s...,"Good morning, we hope you're ready to have a s...",Good morning. We hope you're ready to have a s...,Good morning. We hope you're ready to have a s...,good morning we hope you're ready to have a su...,good morning we hope you're ready to have a su...,good morning we hope you're ready to have a su...
6,6,5TpkTe5mlb5xx1UT8OvZ6l,scripted,5.44,spotify:show:4gds4Ip86RDAlAgH4PQuQs,Crime Over Coffee,Listen in each week while true crime enthusias...,Crime Over Coffee,['en'],https://anchor.fm/s/b402dc8/podcast/rss,...,5TpkTe5mlb5xx1UT8OvZ6l,good morning and welcome to another mini monda...,"Good morning, and welcome to another mini Mond...","Good morning, and welcome to another Mini Mond...","Good morning, and welcome to another Mini Mond...","Good morning, and welcome to another mini Mond...","Good morning, and welcome to another mini Mond...",good morning and welcome to another mini monda...,good morning and welcome to another mini monda...,good morning and welcome to another mini monda...
7,7,6jyaOoQ6aF3TB8z93Honak,not-scripted,12.39,spotify:show:6JqOLHQRh2G24ItUTlmxu9,Legends Cast - A Podcast About Legends of Rune...,"A podcast about the meta, cards, and community...",The Lift,['en'],https://anchor.fm/s/c4221b8/podcast/rss,...,6jyaOoQ6aF3TB8z93Honak,you're listening to legends cast a podcast abo...,"You're listening to Legends Cast, a podcast ab...","You're listening to Legends Cast, a podcast ab...","You're listening to Legends Cast, a podcast ab...",You're listening to Legends cast a podcast abo...,"You're listening to Legendscast, a podcast abo...",you're listening to legends cast a podcast abo...,you're listening to legends cast a podcast abo...,you're listening to legendscast a podcast abou...
8,8,6MaijKHa5MU0MsdS0yVZHE,not-scripted,10.36,spotify:show:0cTycvoAlE6LSppQHpgr48,Align By Design,Align by Design with Amy Allchurch is the podc...,Amy Allchurch,['en'],https://anchor.fm/s/de57c04/podcast/rss,...,6MaijKHa5MU0MsdS0yVZHE,welcome to align by design i'm amy allchurch a...,"Welcome to Align by Design; I'm Amy Allchurch,...","Welcome to Align by Design, I'm Amy Allchurch,...","Welcome to Align by Design, I'm Amy Allchurch,...",Welcome to 9 by Design. I'm eating all church ...,"Welcome to Align by Design. I'm Amy Alchurch, ...",welcome to align by design i'm amy allchurch a...,welcome to 9 by design i'm eating all church a...,welcome to align by design i'm amy alchurch an...
9,9,6URm69QBcvJpqoGiTE1nyy,not-scripted,25.67,spotify:show:40tQwCsD43wEfTZuDuZ66c,Let's All Be Serious,Imagine sitting around a table with three of y...,Banger,['en'],https://anchor.fm/s/f7a6444/podcast/rss,...,6URm69QBcvJpqoGiTE1nyy,so is that all they do for icy hot those lying...,"So, is that all they do for Icy Hot, those lyi...","So, is that all they do for Icy Hot? Those lyi...","So, is that all they do for Icy Hot, those lyi...",So that all they do for Icy Hot those Lyin mot...,"So is that all they do for Icy Hot, those lyin...",so is that all they do for icy hot those lying...,so that all they do for icy hot those lyin mot...,so is that all they do for icy hot those lying...
10,10,7n6zBPeHcH1t3MUvBNIvDa,scripted,5.31,spotify:show:5PFgy6lc8UvYIjDsaaO3dN,Crimes of Passion,"Every Wednesday, we tell a love story that tur...",Parcast Network,['en'],https://feeds.megaphone.fm/crimesofpassion,...,7n6zBPeHcH1t3MUvBNIvDa,welcome to podcast crime bites we're offering ...,"Welcome to Podcast Crime Bites, where we're of...",Welcome to Podcast Crime Bites. We're offering...,"Welcome to Podcast Crime Bites, we're offering...",Welcome to park asked crime bites. We're offer...,Welcome to ParCast Crime Bites. We're offering...,welcome to podcast crime bites we're offering ...,welcome to park asked crime bites we're offeri...,welcome to parcast crime bites we're offering ...
