In [1]:
# Author:   Chris Graziul
# Created: 9/26/21
#
# Note:      Samples = frames here (frames are a set of samples in other contexts)
#
# Use:        Notebook used for developing VAD accuracy metrics and applying to pydub VAD data

In [1]:
import numpy as np
import pandas as pd
import pickle
import os

# Work from this directory so that the relative paths used later in the notebook
# ('transcripts/...', 'data/...') resolve against it.
os.chdir('/project/graziul/')

In [None]:
# Function to get frame error rate (FER) = false positives (FP) + false negatives (FN),
# expressed as a fraction of the total number of frames
def get_fer(v1,v2):
    """Return the frame error rate between two equal-length 0/1 frame vectors.

    A frame counts as a false positive when it is 0 in ``v1`` and 1 in ``v2``,
    and as a false negative when it is 1 in ``v1`` and 0 in ``v2``. The result
    is ``(FP + FN) / number_of_frames``.

    NOTE(review): normalising by the frame count is an assumption based on the
    word "rate"; drop the division if a raw error count is wanted.

    Args:
        v1: Sequence of 0/1 (or bool) values, treated as the reference.
        v2: Sequence of 0/1 (or bool) values, treated as the prediction.

    Returns:
        float in [0, 1].

    Raises:
        ValueError: If the vectors differ in shape or are empty.
    """
    ref = np.asarray(v1).astype(bool)
    hyp = np.asarray(v2).astype(bool)
    if ref.shape != hyp.shape:
        raise ValueError('v1 and v2 must have the same shape')
    if ref.size == 0:
        raise ValueError('v1 and v2 must not be empty')
    false_pos = np.count_nonzero(~ref & hyp)
    false_neg = np.count_nonzero(ref & ~hyp)
    return (false_pos + false_neg) / ref.size

In [9]:
def get_filenames(df):
    """Rewrite the ``file`` column of ``df`` to a bare ``.mp3`` file name.

    Each value is split on the pattern ``<digits>-<digits>-<digits>``; the
    first captured match, with ``.mp3`` appended, becomes the new value.

    Args:
        df: DataFrame with a string ``file`` column. Modified in place.

    Returns:
        The same DataFrame, or None (after printing a message) when the
        column could not be processed.
    """
    try:
        # Read the column from the frame that was passed in rather than from a
        # notebook-level variable, so the function works on any DataFrame.
        df['file'] = df['file'].str.split(r'(\d*-\d*-\d*)', expand=True)[1] + '.mp3'
        return df
    except Exception:
        # Best-effort: report and return None, but let KeyboardInterrupt /
        # SystemExit propagate.
        print('Error finding "file" variable in DataFrame')
        return None

def get_date(df, keep_date_info=False):
    """Add a ``date`` column formatted as ``YYYY_MM_DD``.

    The value is built from the ``year``, ``month`` and ``day`` columns, with
    month and day zero-padded to two characters.

    Args:
        df: DataFrame with ``year``, ``month`` and ``day`` columns. The
            ``date`` column is added to this frame in place.
        keep_date_info: When True, return ``df`` itself with the three source
            columns still present. When False (default), return a new frame
            without them.

    Returns:
        A DataFrame as described above, or None (after printing a message)
        when the source columns could not be processed.
    """
    try:
        year = df['year'].astype(str)
        month = df['month'].astype(str).str.zfill(2)
        day = df['day'].astype(str).str.zfill(2)
        df['date'] = year + '_' + month + '_' + day
        if keep_date_info:
            return df
        return df.drop(columns=['year', 'month', 'day']).copy()
    except Exception:
        # Best-effort: report and return None, but let KeyboardInterrupt /
        # SystemExit propagate.
        print('Error processing "year", "month", and/or "day" variables in DataFrame')
        return None

def dt_to_seconds(df):
    try:
        for i in ['start_dt', 'end_dt']:
            var_name = i.split('_')[0]
            df[var_name] = (df[i]-np.datetime64('1900-01-01T00:00:00.000000000')).dt.total_seconds()
        return df
    except:
        print('Error processing "start_dt" and/or "end_dt" variables in DataFrame')

def get_vad_vars(df):
    """Run the three clean-up helpers and keep only the columns used for VAD work.

    Applies, in order, ``get_filenames``, ``get_date`` and ``dt_to_seconds``,
    then returns a copy restricted to zone, date, time, file, start, end and
    transcriber.
    """
    wanted = ['zone','date','time','file','start','end','transcriber']
    prepared = dt_to_seconds(get_date(get_filenames(df)))
    return prepared[wanted].copy()

In [10]:
# Load the transcript export; start_dt/end_dt are parsed as datetimes so that
# dt_to_seconds can subtract a datetime origin from them.
df_transcripts = pd.read_csv('transcripts/transcripts2021_09_16.csv', parse_dates=['start_dt','end_dt'])
# Reduce to the columns used for VAD comparison. This rebinds df_transcripts,
# so rerun the read above before rerunning this line.
df_transcripts = get_vad_vars(df_transcripts)
df_transcripts.head()

Unnamed: 0,zone,date,time,file,start,end,transcriber
0,Zone1,2018_08_12,932,201808120932-28710-27730.mp3,141.252,151.279,huthealex
1,Zone1,2018_08_12,932,201808120932-28710-27730.mp3,158.109,159.417,huthealex
2,Zone1,2018_08_12,932,201808120932-28710-27730.mp3,168.327,169.235,huthealex
3,Zone1,2018_08_12,932,201808120932-28710-27730.mp3,175.33,177.437,huthealex
4,Zone1,2018_08_12,932,201808120932-28710-27730.mp3,184.003,189.017,huthealex


In [26]:
# Recording examined in the cells below: zone, date (YYYY_MM_DD) and audio file name
zone, date, file = 'Zone1', '2018_08_12', '201808120932-28710-27730.mp3'

In [56]:
def get_vad_transcriber(df, zone, date, file):
    """Collect each transcriber's [start, end] segments for one recording.

    Rows of ``df`` are filtered to the given ``zone``, ``date`` and ``file``.
    For every transcriber found, the ``start``/``end`` values (seconds) are
    converted to integer milliseconds.

    Args:
        df: DataFrame with ``zone``, ``date``, ``file``, ``transcriber``,
            ``start`` and ``end`` columns.
        zone: Value to match in the ``zone`` column.
        date: Value to match in the ``date`` column.
        file: Value to match in the ``file`` column.

    Returns:
        dict mapping transcriber -> list of ``[start_ms, end_ms]`` pairs, in
        row order. Empty (after printing a message) when no rows match.
    """
    selected = df[(df['zone'] == zone) & (df['date'] == date) & (df['file'] == file)]
    transcribers = selected['transcriber'].unique().tolist()
    vad_dict = {}
    if not transcribers:
        print("Error, no transcribers found")
        return vad_dict
    for transcriber in transcribers:
        segments = selected.loc[selected['transcriber'] == transcriber, ['start', 'end']]
        # Round before casting: astype(int) truncates, and float products such
        # as 1.001 * 1000 == 1000.9999999999999 would otherwise lose a millisecond.
        vad_dict[transcriber] = (1000 * segments).round().astype(int).values.tolist()
    return vad_dict

In [57]:
# Per-transcriber [start, end] segments for the zone/date/file chosen earlier
vad_transcriber_dict = get_vad_transcriber(df=df_transcripts, zone=zone, date=date, file=file)

In [59]:
# Show the result: transcriber -> list of [start, end] pairs in integer milliseconds
vad_transcriber_dict

{'huthealex': [[141252, 151279],
  [158109, 159417],
  [168327, 169235],
  [175330, 177437],
  [184003, 189017],
  [189435, 191542],
  [192169, 194330],
  [231787, 235020],
  [850490, 851798],
  [856303, 861135],
  [1022968, 1024348],
  [1026982, 1028326],
  [1028908, 1030106],
  [1041768, 1044602],
  [1073085, 1077917],
  [1077917, 1081804],
  [1083712, 1088525]],
 'mhayford': [[142922, 149048],
  [158287, 159240],
  [168308, 169234],
  [175399, 177426],
  [184403, 188939],
  [189466, 194388],
  [231627, 234432],
  [850712, 851720],
  [856564, 861018],
  [1022978, 1024450],
  [1027271, 1028263],
  [1028990, 1030029],
  [1035924, 1041405],
  [1042003, 1044617],
  [1073409, 1081933],
  [1083949, 1088403]]}

In [27]:
# Load the stored VAD results for this zone/date.
# NOTE: pickle.load can execute arbitrary code; only open files produced by this project.
vad_path = os.path.join('data', zone, date, date + 'vad_dict.pkl')
with open(vad_path, 'rb') as pkl_file:  # context manager closes the handle after loading
    vad_dict = pickle.load(pkl_file)
# NOTE(review): the -24 key is presumably the pydub silence threshold in dBFS — confirm
vad_pydub_slices = vad_dict[file]['pydub'][-24]['nonsilent_slices']
vad_pydub_slices

[[138745, 138793],
 [139556, 139862],
 [141264, 141265],
 [142018, 142136],
 [142951, 146356],
 [147119, 149542],
 [150932, 150934],
 [157276, 160008],
 [161419, 161423],
 [168539, 169606],
 [171162, 171165],
 [175166, 177388],
 [178131, 178293],
 [179870, 179874],
 [183899, 191433],
 [192390, 194725],
 [196299, 196303],
 [222502, 222548],
 [224035, 224038],
 [231900, 232406],
 [233021, 234784],
 [236364, 236367],
 [376729, 376772],
 [850966, 852337],
 [853882, 853886],
 [856479, 862014],
 [863151, 863857],
 [865413, 865417],
 [1022898, 1024871],
 [1026375, 1026378],
 [1027371, 1030559],
 [1032101, 1032104],
 [1035412, 1044984],
 [1046533, 1046536],
 [1072471, 1077797],
 [1078521, 1082468],
 [1084264, 1085522],
 [1087045, 1088404],
 [1088946, 1089112],
 [1090676, 1090680]]

In [24]:
# Next step: Convert slices for pydub, t1, t2 into three vectors of 0/1 (diff to get FP and FN)

{'201808112335-560782-27730.mp3': {'pydub': {-24: {'nonsilent_slices': [[0,
      6204],
     [7065, 7069],
     [7797, 8593],
     [9140, 9145],
     [47043, 49421],
     [50743, 63864],
     [64758, 64762],
     [65998, 66782],
     [67284, 67687],
     [254542, 255978],
     [256569, 256573],
     [257951, 263556],
     [264062, 264187],
     [264861, 269625],
     [271313, 271914],
     [272496, 272500],
     [385767, 387121],
     [387874, 387878],
     [389549, 389595],
     [390404, 390767],
     [391442, 391614],
     [392242, 395867],
     [397003, 406165],
     [406785, 406788],
     [408265, 410833],
     [419712, 421196],
     [421796, 421799],
     [423014, 423784],
     [424296, 431651],
     [432221, 433662],
     [434447, 434450],
     [435819, 436051],
     [436740, 437803],
     [445164, 447732],
     [448597, 449107],
     [449832, 449835],
     [508873, 508948],
     [510490, 511629],
     [512793, 515274],
     [515871, 523062],
     [524004, 524009],
     [525060,