In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm
from scipy import stats

orig_df = pd.read_csv("../csv/df.csv")
num_total = len(orig_df)

# Specify dtype for columns 1, 2, and 3
dtype_dict = {"parse": str, "orig_dys": str, "dys": str}

df0 = pd.read_csv("../csv/df-english-fisher-annotations-0.csv", dtype=dtype_dict)
df1 = pd.read_csv("../csv/df-english-fisher-annotations-1.csv", dtype=dtype_dict)

df = pd.concat([df0, df1], ignore_index=True)

# if no words were transcribed, replace nan's with empty string
df["transcript"] = df["transcript"].fillna("")

display(df.head())

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,show_uri,show_name,show_description,publisher,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,language,transcript,transcript_length,parse,orig_dys,dys
0,0,0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj,en,"Hello, hello, hello everyone. This is Katie an...",1716,(FRAG (INTJ (UNK hello)) (INTJ (UNK hello)) (I...,hello _ hello _ hello _ everyone _\nthis _ is ...,hello E hello E hello E everyone _ this _ is _...
1,1,2,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa,en,Welcome to Inside the 18. Today's episode is a...,2017,(ADJP (UNK welcome) (PP (UNK to) (PP (UNK insi...,welcome _ to _ inside _ the _\ntoday _ 's _ ep...,welcome _ to _ inside _ the _ today _ 's _ epi...
2,2,3,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.1892,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm,en,"Hey Cheese fans! Before we get started, I want...",1518,(FRAG (INTJ (UNK hey)) (NP (UNK cheese) (UNK f...,hey _ cheese _ fans _\nbefore _ we _ get _ sta...,hey E cheese _ fans _ before _ we _ get _ star...
3,3,4,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.78205,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG,en,"Sorry to interrupt the show, but I do have to ...",1707,(S (ADJP (UNK sorry) (S (VP (UNK to) (VP (UNK ...,sorry _ to _ interrupt _ the _ show _ but _ i ...,sorry _ to _ interrupt _ the _ show _ but _ i ...
4,4,5,spotify:show:5ljREb8VLogQLT7AKGwav1,UPSC Podcasts,Podcasts useful for UPSC aspirants! Mainly dis...,UPSC Podcast,https://anchor.fm/s/8afceec/podcast/rss,spotify:episode:0025w0gdgkl11Nzkmg1wnm,Tourism in India : Opportunities and Challenges,.,13.788,show_5ljREb8VLogQLT7AKGwav1,0025w0gdgkl11Nzkmg1wnm,en,This is All India Radio. In the program Spotli...,1755,(S (NP (UNK this)) (VP (UNK is) (NP (UNK all) ...,this _ is _ all _ india _ radio _\nin _ the _ ...,this _ is _ all _ india _ radio _ in _ the _ p...


In [2]:
import numpy as np
display(len(df[df["parse"].isna()]))

0

In [3]:
# check on progress
num_completed = len(df.dropna(subset=["parse"]))
print(num_completed)
print(num_total)
print(((num_completed/num_total)*100.0), "percent completed so far")

82601
82601
100.0 percent completed so far


In [4]:
# Function to count occurrences of a specified substring in a string
def count_substring_occurrences(s, substring):
    return s.count("("+substring)

# Apply the function to create a new column with dynamic names
def create_new_count_col(substring_to_count, column_to_search):
    new_column_name = f"{column_to_search}_{substring_to_count}_count"
    df[new_column_name] = df[column_to_search].apply(count_substring_occurrences, substring=substring_to_count)
    
for label in ["INTJ", "EDITED", "PRN", "ADJP", "ADVP", "NP", "PP", "S", "SBAR", "SBARQ", "SINV", "SQ", "VP", "WHADVP", "WHNP", "WHPP", "X"]:
    create_new_count_col(substring_to_count=label, column_to_search="parse")
    
pd.set_option('display.max_columns', None)
display(df)

display(df.describe())

df.to_csv("../csv/df-parse.csv", header=True)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,show_uri,show_name,show_description,publisher,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,language,transcript,transcript_length,parse,orig_dys,dys,parse_INTJ_count,parse_EDITED_count,parse_PRN_count,parse_ADJP_count,parse_ADVP_count,parse_NP_count,parse_PP_count,parse_S_count,parse_SBAR_count,parse_SBARQ_count,parse_SINV_count,parse_SQ_count,parse_VP_count,parse_WHADVP_count,parse_WHNP_count,parse_WHPP_count,parse_X_count
0,0,0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj,en,"Hello, hello, hello everyone. This is Katie an...",1716,(FRAG (INTJ (UNK hello)) (INTJ (UNK hello)) (I...,hello _ hello _ hello _ everyone _\nthis _ is ...,hello E hello E hello E everyone _ this _ is _...,19,12,8,29,104,609,136,405,109,5,1,4,435,15,37,0,0
1,1,2,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa,en,Welcome to Inside the 18. Today's episode is a...,2017,(ADJP (UNK welcome) (PP (UNK to) (PP (UNK insi...,welcome _ to _ inside _ the _\ntoday _ 's _ ep...,welcome _ to _ inside _ the _ today _ 's _ epi...,34,16,18,48,106,734,206,467,115,11,2,12,483,15,32,0,8
2,2,3,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.189200,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm,en,"Hey Cheese fans! Before we get started, I want...",1518,(FRAG (INTJ (UNK hey)) (NP (UNK cheese) (UNK f...,hey _ cheese _ fans _\nbefore _ we _ get _ sta...,hey E cheese _ fans _ before _ we _ get _ star...,33,23,16,31,90,532,127,365,82,7,0,7,389,15,17,0,1
3,3,4,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.782050,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG,en,"Sorry to interrupt the show, but I do have to ...",1707,(S (ADJP (UNK sorry) (S (VP (UNK to) (VP (UNK ...,sorry _ to _ interrupt _ the _ show _ but _ i ...,sorry _ to _ interrupt _ the _ show _ but _ i ...,66,20,7,53,105,626,145,383,86,5,0,6,426,13,19,0,3
4,4,5,spotify:show:5ljREb8VLogQLT7AKGwav1,UPSC Podcasts,Podcasts useful for UPSC aspirants! Mainly dis...,UPSC Podcast,https://anchor.fm/s/8afceec/podcast/rss,spotify:episode:0025w0gdgkl11Nzkmg1wnm,Tourism in India : Opportunities and Challenges,.,13.788000,show_5ljREb8VLogQLT7AKGwav1,0025w0gdgkl11Nzkmg1wnm,en,This is All India Radio. In the program Spotli...,1755,(S (NP (UNK this)) (VP (UNK is) (NP (UNK all) ...,this _ is _ all _ india _ radio _\nin _ the _ ...,this _ is _ all _ india _ radio _ in _ the _ p...,9,10,1,29,56,647,189,289,74,3,0,3,325,10,24,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82596,82596,105355,spotify:show:416U8ZhubKrFHq8ynOaxfH,The Top 10,"Each week, John Rocha and Matt Knost breakdown...",The Top 10,http://thetop10.podomatic.com/rss2.xml,spotify:episode:7zzQnjBXqDApvnm1hLPzVY,The Top 10 - Re-List - Steve Martin Moves,Thanks to our patreon members for their suppor...,51.025850,show_416U8ZhubKrFHq8ynOaxfH,7zzQnjBXqDApvnm1hLPzVY,en,"Hey guys, this is John Rocha again. And Matt N...",1881,(S (INTJ (UNK hey)) (INTJ (UNK guys)) (NP (UNK...,hey _ guys _ this _ is _ john _ rocha _ again ...,hey E guys E this _ is _ john _ rocha _ again ...,110,26,15,38,91,699,159,418,87,11,1,19,433,14,27,0,0
82597,82597,105356,spotify:show:5rgmBAzsJ5znpV2b4WNDsb,Let's Grab Coffee Podcast,"After connecting with someone, what's the next...",George Khalife,https://anchor.fm/s/9043d60/podcast/rss,spotify:episode:7zzRRsjuymax0YSczpi0SU,Let's Grab Coffee E45 with Ross Paquette | Gro...,Ross founded Maropost in 2011 as a customer-ce...,33.364750,show_5rgmBAzsJ5znpV2b4WNDsb,7zzRRsjuymax0YSczpi0SU,en,What's going on everyone? This is George Khali...,2005,(SBARQ (WHNP (UNK what)) (SQ (VP (UNK 's) (VP ...,what _ 's _ going _ on _ everyone _\nthis _ is...,what _ 's _ going _ on _ everyone _ this _ is ...,25,26,65,34,136,681,210,425,93,10,0,12,450,20,30,0,0
82598,82598,105357,spotify:show:56CjYLQWyMx1MkOEQmlubi,Coach Corey Wayne,Life & Peak Performance Coach. I Teach Self-Re...,Coach Corey Wayne,https://anchor.fm/s/4dd625c/podcast/rss,spotify:episode:7zzZJGsL8fwDOrduUkX91D,Maybe She Is Just Testing Me?,How to know if your woman is maybe just testin...,11.799950,show_56CjYLQWyMx1MkOEQmlubi,7zzZJGsL8fwDOrduUkX91D,en,"Hi, I'm Coach Cory Wayne and this is my video ...",1850,(S (INTJ (UNK hi)) (NP (UNK i)) (VP (UNK 'm) (...,hi _ i _ 'm _ coach _ cory _ wayne _ and _ thi...,hi E i _ 'm _ coach _ cory _ wayne _ and _ thi...,21,6,1,49,115,613,133,507,118,7,0,6,561,26,30,0,0
82599,82599,105358,spotify:show:7uddSH8MhaK3Q6YFlllbVZ,The Cricket Podcast,The best & funniest independent cricket podcas...,The Cricket Podcast,https://anchor.fm/s/9d3dcf0/podcast/rss,spotify:episode:7zzoT4r0Rhffyegk2HJ9N8,Ep 16: England In Danger,"In Episode 16, the boys evaluate England's per...",69.215350,show_7uddSH8MhaK3Q6YFlllbVZ,7zzoT4r0Rhffyegk2HJ9N8,en,I think it should never be permitted to happen...,1699,(S (NP (UNK i)) (VP (UNK think) (SBAR (S (NP (...,i _ think _ it _ should _ never _ be _ permitt...,i _ think _ it _ should _ never _ be _ permitt...,58,8,3,45,94,593,128,379,87,6,0,32,408,9,23,0,0


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,duration,transcript_length,parse_INTJ_count,parse_EDITED_count,parse_PRN_count,parse_ADJP_count,parse_ADVP_count,parse_NP_count,parse_PP_count,parse_S_count,parse_SBAR_count,parse_SBARQ_count,parse_SINV_count,parse_SQ_count,parse_VP_count,parse_WHADVP_count,parse_WHNP_count,parse_WHPP_count,parse_X_count
count,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0,82601.0
mean,41300.0,52733.532996,40.437903,1682.654641,50.89787,16.7101,10.840462,38.783477,87.32686,596.184489,143.622777,388.333592,92.281219,5.505527,0.365359,9.788417,425.756008,14.513904,27.174998,0.136342,0.345311
std,23844.99913,30406.734353,20.097331,267.513855,39.875698,9.584463,9.793306,12.398499,27.166828,98.502935,31.326437,85.953114,25.818001,4.499144,0.753722,7.575123,89.265225,6.674158,10.724439,0.473211,1.323469
min,0.0,0.0,10.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20650.0,26444.0,23.721117,1530.0,19.0,10.0,4.0,30.0,70.0,540.0,124.0,338.0,76.0,2.0,0.0,4.0,372.0,10.0,20.0,0.0,0.0
50%,41300.0,52777.0,38.614783,1702.0,42.0,15.0,9.0,38.0,87.0,600.0,143.0,393.0,92.0,5.0,0.0,8.0,430.0,14.0,26.0,0.0,0.0
75%,61950.0,79085.0,54.848433,1859.0,74.0,22.0,15.0,46.0,104.0,659.0,163.0,446.0,109.0,8.0,1.0,14.0,485.0,18.0,33.0,0.0,0.0
max,82600.0,105359.0,304.9539,2650.0,310.0,119.0,169.0,155.0,432.0,1278.0,318.0,743.0,221.0,63.0,25.0,79.0,816.0,103.0,245.0,10.0,110.0
