In [1]:
import pandas as pd
import pydub

In [1]:
# Two basic quality control strategies:
#  1. Ask workers to transcribe "gold standard" audio then assess performance (speed vs. accuracy)
#  2. Insert audio snippets with known transcriptions into assigned files at random then check once finished (accuracy)

In [2]:
# Strategy 1:
# - Identify "gold standard" file and accompanying transcription
# - Create function to assess accuracy of transcription

In [1]:
# Strategy 2:
# - Build dictionary of transcribed snippets (as context-free as possible)
# - Randomly decide how many to include in a file
# - Randomly select the appropriate number of snippets from the dictionary
# - Randomly select when/where to add snippets
# - Combine non-silent snippets and QC snippets, building dict of timing/labels for QC snippets
# - Assign file, receive student transcription
# - Look for QC snippets based on timing (need a window, much easier now with artificial silences)
# - Extract QC snippets from transcript
# - Save transcript without QC snippets
# - Assess error rate in QC snippet transcriptions
# - Save QC snippet error rate as metadata

# Build dict of {audio:transcription}

In [3]:
# Load the existing transcripts so that utterances on which different
# transcribers produced the same text can be identified further down.
transcripts_csv = 'transcripts.csv'
df = pd.read_csv(transcripts_csv)
df.head(5)

Unnamed: 0,feed,zone,year,month,day,time,sid,start,end,transcription,notes,file,transcriber,start_c,start_dt,end_c,end_dt,length,length_s
0,27730,Zone1,2018,8,12,932,RADIO,00.02.21.252,00.02.31.279,RADIOSHOP TESTING ONE TWO THREE FOUR FIVE FIVE...,,201808120932-28710-27730ah.xlsx,huthealex,00.02.21.252,1900-01-01 00:02:21.252000,00.02.31.279,1900-01-01 00:02:31.279000,0 days 00:00:10.027000000,10.027
1,27730,Zone1,2018,8,12,932,RADIO,00.02.38.109,00.02.39.417,ONE TWO ONE TWO,,201808120932-28710-27730ah.xlsx,huthealex,00.02.38.109,1900-01-01 00:02:38.109000,00.02.39.417,1900-01-01 00:02:39.417000,0 days 00:00:01.308000000,1.308
2,27730,Zone1,2018,8,12,932,UNIT,00.02.48.327,00.02.49.235,UNIT COMING IN,,201808120932-28710-27730ah.xlsx,huthealex,00.02.48.327,1900-01-01 00:02:48.327000,00.02.49.235,1900-01-01 00:02:49.235000,0 days 00:00:00.908000000,0.908
3,27730,Zone1,2018,8,12,932,UNIT,00.02.55.330,00.02.57.437,ZONE ONE IS ON CITY [WIDE] FIVE,,201808120932-28710-27730ah.xlsx,huthealex,00.02.55.330,1900-01-01 00:02:55.330000,00.02.57.437,1900-01-01 00:02:57.437000,0 days 00:00:02.107000000,2.107
4,27730,Zone1,2018,8,12,932,RADIO,00.03.04.003,00.03.09.017,OKAY THANKS UH THIS IS THE RADIO SHOP TESTING ...,,201808120932-28710-27730ah.xlsx,huthealex,00.03.04.003,1900-01-01 00:03:04.003000,00.03.09.017,1900-01-01 00:03:09.017000,0 days 00:00:05.014000000,5.014


In [23]:
# Group the rows by recording: one group per (feed, zone, date, time) file.
df_grouped_files = df.groupby(['feed','zone','year','month','day','time'])

# For every file transcribed by more than one person, collect the
# transcriptions on which *all* of its transcribers agree.
# Each entry of `same_text` is: the six grouping keys of the file followed by
# the agreed transcription strings (in the first transcriber's order).
same_text = []
for file_key, file_df in df_grouped_files:
    # Rows without a transcriber label cannot be attributed to anyone.
    transcribers = file_df['transcriber'].dropna().unique().tolist()
    if len(transcribers) < 2:
        continue

    # One Series of transcriptions per transcriber. Missing transcriptions are
    # dropped because `isin` treats NaN as equal to NaN, which would otherwise
    # report two blank cells as an "agreed" transcription.
    per_transcriber = [
        file_df.loc[file_df['transcriber'] == name, 'transcription'].dropna()
        for name in transcribers
    ]

    # Start from the first transcriber and keep only what every other
    # transcriber also produced (previously only the second was consulted).
    agreed = per_transcriber[0]
    for other in per_transcriber[1:]:
        agreed = agreed[agreed.isin(other)]

    same_text.append(list(file_key) + agreed.tolist())

# (files with more than one transcriber, total number of files)
len(same_text), len(df_grouped_files)

(62, 125)

In [22]:
# Inspect the result of the previous cell: each entry holds the six grouping
# keys of one file followed by the transcriptions its transcribers share.
same_text

[[27730,
  'Zone1',
  2018,
  8,
  4,
  2331,
  'SIXTEEN TWENTY TWO ROBERT',
  'TWENTY TWO',
  'SEVENTEEN TWENTY FOUR ROBERT',
  'TWENTY FOUR ROBERT',
  'THANK YOU',
  'THIRTEEN ROBERT',
  'SEVENTEEN THIRTEEN ROBERT',
  'SIXTEEN THIRTY FOUR ROBERT',
  'FOUR ROBERT',
  'NO',
  'SIXTEEN FOURTEEN ROBERT',
  'FOURTEEN ROBERT',
  'TEN FOUR',
  'SIXTEEN ELEVEN ROBERT',
  'ELEVEN ROBERT',
  'TWENTY THREE ROBERT',
  'TWENTY THREE ROBERT',
  'FIVE THREE SEVEN FOUR',
  'SIXTEEN THIRTY THREE ROBERT',
  'ONE SIX THREE THREE ROBERT',
  'NO THEY RETURNED TO QUARTERS',
  'NOW SEVENTEEN THIRTEEN ROBERT',
  'OH',
  'SIXTEEN TWENTY FOUR ROBERT',
  'ONE SIX TWO FOUR ROBERT',
  'ONE SIX TWO FOUR ROBERT',
  'SEVEN TWENTY THREE ROBERT',
  'SEVENTEEN NINETY FIVE ROBERT',
  'ONE SEVEN NINE FIVE ROBERT',
  'SEVENTEEN NINETY FIVE',
  'SIXTEEN THIRTY THREE ROBERT',
  'VERY GOOD THANK YOU',
  'SIXTEEN THIRTY THREE ROBERT',
  'THIRTY THREE ROBERT',
  'WONDERFUL',
  'TWENTY ONE ROBERT',
  'SIXTEEN THIRTY TWO ROBERT

In [None]:
def check_if_same_transcription(x):
    """Placeholder for the transcription-agreement check (not implemented yet).

    The cell originally contained only the ``def`` line, which is a syntax
    error and prevents the notebook from running top to bottom. Until the
    real logic is written, calling this function fails loudly instead.

    Parameters
    ----------
    x
        Input to check. Its expected type is not established anywhere in the
        visible notebook -- TODO confirm when implementing.

    Raises
    ------
    NotImplementedError
        Always, until the check is implemented.
    """
    raise NotImplementedError("check_if_same_transcription is not implemented yet")

In [None]:
# Container for the QC snippets; starts empty.
# NOTE(review): presumably maps audio -> transcription, per the plan above -- confirm.
qc_snippets_dict = dict()

In [None]:
# Find snippets to use for QC: take them from the existing transcriptions,
# selecting cases where the transcribers agree (perhaps only files with
# three transcribers).
qc_snippets_dict = dict()

In [None]:
# Lower bound on how many QC snippets a selection must contain.
min_num_snippets: int = 2

In [None]:
# Lower bound on the total number of words in the selected snippets,
# so that the %WER calculation has enough words to work with.
min_num_words: int = 10

In [None]:
# Select a combination of snippets until both min_num_snippets and min_num_words are satisfied


In [3]:
# Toy dictionary used to try out random sampling of (key, value) pairs.
test = {
    '1': 'test',
    '2': 'testing',
    '3': 'tested',
}

In [4]:
import random

In [13]:
# Draw two distinct (key, value) pairs at random from the toy dictionary.
toy_pairs = list(test.items())
random.sample(toy_pairs, k=2)

[('2', 'testing'), ('1', 'test')]

In [None]:
# Calculate %WER (word error rate) with the third-party `jiwer` package.
# NOTE(review): `gt` and `hyp` are not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Presumably
# `gt` is the known (ground-truth) snippet text and `hyp` the transcriber's
# text for it -- TODO define both before this cell.
from jiwer import wer
error = wer(gt, hyp)
