## If you run this notebook for the first time or you use a new compute instance, you need to install all required libraries. 
You can open the console and use the following commands to install the required python library.

```bash
pip install -r requirements.txt 
pip install -U torch torchaudio --no-cache-dir
```

You also need to modify `output_root_folder` varialbe so it align with your intended output data folder path

In [1]:
import argparse
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics import recall_score, precision_score, \
    f1_score, average_precision_score, precision_recall_curve
from azureml.core import Workspace, Dataset
import mlflow
import matplotlib.pyplot as plt
# Connect to Workspace and reference Dataset



In [3]:
# Configure output folder path. 

# ! You need to modify this path to align with your local folder structure becuase it include {user name} here. 
output_root_folder="/mnt/batch/tasks/shared/LS_root/mounts/clusters/herman-gpu/code/Users/herman.wu/"
ml_experiment_name="orca-sound-resnet18-round1to3-FASTAI"
model_folder_name="models"
eval_pred_folder_name="eval_pred"
final_eval_folder_name="final_eval"

train_dataset_name="orcahello-audio-round1to3"
test_dataset_name="orcahello-audio-testdata-round1to3"
benchmark_dataset_name="orcahello-audio-perf-benchmark-data"
model_eval_test_dataset_name="orcahello-model-eval-testdata-01"
model_file_name="stg2-rn18.pkl"

In [4]:
# Connect to Workspace and reference Dataset
ws = Workspace.from_config()

model_eval_testdataset = ws.datasets[model_eval_test_dataset_name]

# Create mountcontext and mount the dataset
model_eva_testdata_mount_ctx = model_eval_testdataset.mount()  
model_eva_testdata_mount_ctx.start()  

# Get the mount point
test_dataset_mount_folder = model_eva_testdata_mount_ctx.mount_point
print(test_dataset_mount_folder)


Volume mount is not enabled. 
Falling back to dataflow mount.
/tmp/tmpdt56g99e


In [5]:
# List the files in the mount point
files = os.listdir(test_dataset_mount_folder)
print(files)

['.DS_Store', 'test - Copy.csv', 'test.tsv', 'wav-rpi', 'wav', 'wav2', 'wav3', 'wav4', 'wav5']


In [25]:

def quantize_interval_to_seconds(startTime, duration, maxDuration):
    """
    Returns a list of integers containing corresponding seconds. 
    If second N:N+1 contains part of the interval, N is counted. 
    """
    endTime = startTime + duration
    low = int(np.floor(startTime))
    high = min(int(np.ceil(endTime)), int(maxDuration))
    seconds = list(range(low, high+1))
    return seconds

def quantize_interval_df(df, startColumn, durationColumn, confColumn, maxDuration, threshold=None):
    '''
    Convert given intervals into a 1 second quantized examples for scoring 
    '''
    df = df.sort_values(startColumn).reset_index(drop=True)
    timeWindows = []
    for idx in range(df.shape[0]):
        startTime = df.loc[idx,startColumn]
        duration = df.loc[idx,durationColumn]
        confidence = 1.0 if confColumn is None else df.loc[idx, confColumn]
        
        for time_idx in quantize_interval_to_seconds(startTime, duration, maxDuration):
            timeWindows.append((time_idx, confidence))

    # unique operation merges overlapping windows 
    timeWindows = sorted(list(set(timeWindows)))
    idxs, confidences = zip(*timeWindows)

    if threshold is None:
        positiveIdxs = idxs
    else:
        positiveIdxs = [tup[0] for tup in timeWindows if tup[1] > threshold]
    
    ## Create dataframe quantized into 1-second time windows for scoring 
    quantized_df = pd.DataFrame({
        'timewindow': range(int(np.ceil(maxDuration))),
        'label': 0, 
        'confidence': 0.0
        })
    quantized_df.loc[positiveIdxs,'label'] = 1
    quantized_df.loc[idxs,'confidence'] = confidences

    return quantized_df

def score_quantized_examples(dataset, submissionQuantized, groundTruthQuantized, threshold):

    ## Evaluating
    precision, recall, thresholds = precision_recall_curve(groundTruthQuantized.label, submissionQuantized.confidence)
    class_prevalence = precision[0]

    metrics = dict()
    if threshold is not None:
        metrics['recall'] = round(recall_score(groundTruthQuantized.label, submissionQuantized.label), 3)
        metrics['precision'] = round(precision_score(groundTruthQuantized.label, submissionQuantized.label), 3)
        metrics['f1_score'] = round(f1_score(groundTruthQuantized.label, submissionQuantized.label), 3)

    metrics['AUPRC'] = round(average_precision_score(groundTruthQuantized.label, submissionQuantized.confidence), 3)

    auprc_curve = pd.DataFrame(dict(
        precision=precision,
        recall=recall,
        thresholds=[0.0, *thresholds]
    ))
    
    return metrics, auprc_curve, class_prevalence

def score_submission(testSetDir, submissionFile, threshold=None, verbose=False):
    # load Ground truth and submission 
    testData = pd.read_csv(Path(testSetDir)/"test.tsv", delimiter='\t')
    testWavDir = Path(testSetDir)/"wav2"
    submissionName = Path(submissionFile).stem
    # submissionData = pd.read_csv(submissionFile, sep='\t')
    submissionData = pd.read_csv(submissionFile)

    # iterate over (test set, wav file) -> aggregate scores appropriately
    # append for all wav files in a test set and score
    metrics_list = []
    auprc_curve_list = []
    for group in testData.groupby('dataset'):
        dataset, datasetGroundTruth = group

        print("\n###\nScoring dataset:", dataset)
        gt_list, sub_list = [], []
        total_duration = 0.0
        for wavGroup in tqdm(datasetGroundTruth.groupby('wav_filename')):
            wav_filename, groundTruth = wavGroup

            # retrieve intervals for this group (labels, submission)
            # TODO@Akash: remove this dependency to require the audio
            max_length = librosa.get_duration(filename=str(testWavDir/wav_filename))
            # Convert Ground Truth: HACK doesn't have confidence column, so passing dummy value  
            gt_list.append(quantize_interval_df(
                    groundTruth, 'start_time_s', 'duration_s', 'duration_s', max_length
                ))

            # Quantize submission file into 1-second time windows (timewindow, label, confidence)
            submission = submissionData.query('wav_filename == @wav_filename')
            sub_list.append(quantize_interval_df(
                submission, 'start_time_s', 'duration_s', 'confidence', max_length, threshold=threshold
                ))
            
            total_duration += max_length
        
        groundTruthQuantized = pd.concat(gt_list)
        submissionQuantized = pd.concat(sub_list)
        print("Total duration: {:.0f}:{:.0f}".format(total_duration//60, total_duration%60))
        if verbose:
            print("\nSnippet of converted/quantized ground truth file\n", groundTruthQuantized.head(3))
            print("\nSnippet of converted/quantized submission file\n", submissionQuantized.head(3))

        # score and aggregate results 
        metrics, auprc_curve, class_prevalence = score_quantized_examples(
            dataset, submissionQuantized, groundTruthQuantized, threshold
            )
        metrics_list.append({'dataset':dataset, **metrics})
        auprc_curve['dataset'] = dataset
        auprc_curve_list.append(auprc_curve)
    
    metrics = pd.DataFrame.from_records(metrics_list)
    overall = metrics.mean(numeric_only=True)
    overall['dataset'] = 'OVERALL'
    metrics = metrics.append(overall, ignore_index=True).round(3)
    
    return metrics, pd.concat(auprc_curve_list)



In [13]:
submissionFiles=[Path(os.path.join(output_root_folder,eval_pred_folder_name,"submission2SecFastAI.csv"))]


In [8]:
for submissionFile in submissionFiles:
    print(submissionFile)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/herman-gpu/code/Users/herman.wu/eval_pred/submission2SecFastAI.csv


In [9]:
# metricsFile = Path(submissionFiles[0]).parent/"metrics.tsv"
# resultsFile = Path(submissionFiles[0]).parent/"results.md"
# plotsFile = Path(submissionFiles[0]).parent/"{}.png".format("au_pr_curves")

metricsFile = Path(os.path.join(output_root_folder,final_eval_folder_name,"metrics.tsv"))
resultsFile = Path(os.path.join(output_root_folder,final_eval_folder_name,"results.md"))
plotsFile = Path(os.path.join(output_root_folder,final_eval_folder_name,"au_pr_curves.png"))

In [10]:
testSetDir=test_dataset_mount_folder
threshold=0.8

In [10]:
print(submissionFiles)

[PosixPath('/mnt/batch/tasks/shared/LS_root/mounts/clusters/herman-gpu/code/Users/herman.wu/eval_pred/submission2SecFastAI.csv')]


In [14]:
print(os.listdir("/mnt/batch/tasks/shared/LS_root/mounts/clusters/herman-gpu/code/Users/herman.wu/eval_pred/"))

['.amlignore', '.amlignore.amltmp', 'submission2SecFastAI.csv', 'test2Sec.csv']


In [12]:
pd.read_csv("/mnt/batch/tasks/shared/LS_root/mounts/clusters/herman-gpu/code/Users/herman.wu/eval_pred/submission2SecFastAI.csv")

Unnamed: 0,StartTime,Duration
0,21,1
1,22,1
2,23,1
3,51,1
4,52,1
...,...,...
511,1201,1
512,1202,1
513,1217,1
514,1218,1


In [24]:
testSetDir

'/tmp/tmpdt56g99e'

In [27]:
testData = pd.read_csv(Path(testSetDir)/"test.tsv", delimiter='\t')

In [28]:
testData.head(5)

Unnamed: 0,dataset,wav_filename,start_time_s,duration_s,location,date,pst_or_master_tape_identifier
0,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,52.172,1.118,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
1,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,54.877,1.104,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
2,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,69.701,2.691,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
3,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,72.765,0.795,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
4,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,73.51,0.925,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav


In [29]:
 for group in testData.groupby('dataset'):
        dataset, datasetGroundTruth = group

        print("\n###\nScoring dataset:", dataset)
        gt_list, sub_list = [], []
        total_duration = 0.0
        for wavGroup in tqdm(datasetGroundTruth.groupby('wav_filename')):
            wav_filename, groundTruth = wavGroup
            print(wav_filename)


###
Scoring dataset: podcast_test_round1
OS_7_05_2019_08_24_00_.wav

###
Scoring dataset: podcast_test_round2
OS_9_27_2017_08_14_00__0001.wav
OS_9_27_2017_08_19_00__0002.wav
OS_9_27_2017_08_25_00__0003.wav
OS_9_27_2017_08_25_00__0004.wav
OS_9_27_2017_08_30_00__0001.wav
OS_9_27_2017_08_30_00__0002.wav
OS_9_27_2017_08_30_00__0003.wav
OS_9_27_2017_08_35_00__0004.wav
OS_9_27_2017_09_02_00__0002.wav
OS_9_27_2017_09_34_00__0002.wav
OS_9_27_2017_10_28_00__0001.wav
OS_9_27_2017_10_49_00__0003.wav
OS_9_27_2017_11_00_00__0004.wav
OS_9_27_2017_11_05_00__0000.wav
OS_9_27_2017_11_26_00__0000.wav
OS_9_27_2017_11_26_00__0002.wav
OS_9_27_2017_11_32_00__0002.wav
OS_9_27_2017_11_32_00__0003.wav
OS_9_27_2017_11_42_00__0000.wav
OS_9_27_2017_11_53_00__0001.wav
OS_9_27_2017_11_58_00__0003.wav

###
Scoring dataset: podcast_test_round3
rpi-port-townsend_2019_11_14_12_33_00.wav
rpi-port-townsend_2019_11_14_12_34_00.wav
rpi-port-townsend_2019_11_14_12_47_00.wav
rpi-port-townsend_2019_11_14_12_48_00.wav
rpi-por

In [31]:
submissionData = pd.read_csv(submissionFile)
submissionData.head(5)

Unnamed: 0,StartTime,Duration
0,21,1
1,22,1
2,23,1
3,51,1
4,52,1


In [None]:
 for group in testData.groupby('dataset'):
        dataset, datasetGroundTruth = group

        print("\n###\nScoring dataset:", dataset)
        gt_list, sub_list = [], []
        total_duration = 0.0
        for wavGroup in tqdm(datasetGroundTruth.groupby('wav_filename')):
            wav_filename, groundTruth = wavGroup

            # retrieve intervals for this group (labels, submission)
            # TODO@Akash: remove this dependency to require the audio
            max_length = librosa.get_duration(filename=str(testWavDir/wav_filename))
            # Convert Ground Truth: HACK doesn't have confidence column, so passing dummy value  
            gt_list.append(quantize_interval_df(
                    groundTruth, 'start_time_s', 'duration_s', 'duration_s', max_length
                ))

            # Quantize submission file into 1-second time windows (timewindow, label, confidence)
            submission = submissionData.query('wav_filename == @wav_filename')
            sub_list.append(quantize_interval_df(
                submission, 'start_time_s', 'duration_s', 'confidence', max_length, threshold=threshold
                ))
            

In [26]:
# scoring multiple submission files 
metrics_list, auprc_list = [], []
for submissionFile in submissionFiles:
    print(submissionFile)
    metrics, auprc_curve = score_submission(testSetDir, submissionFile, threshold)
    # TODO@Akash: include class prevalence as a "no-skill" submission 
    metrics['submission'] = Path(submissionFile).stem
    metrics_list.append(metrics)
    auprc_curve['submission'] = Path(submissionFile).stem
    auprc_list.append(auprc_curve)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/herman-gpu/code/Users/herman.wu/eval_pred/submission2SecFastAI.csv

###
Scoring dataset: podcast_test_round1


UndefinedVariableError: name 'wav_filename' is not defined

In [23]:
! ls /tmp/tmpdt56g99e/wav2

OS_7_05_2019_08_24_00_.wav


In [20]:
! ls /tmp/tmpdt56g99e/wav

OS_9_27_2017_08_14_00__0001.wav  OS_9_27_2017_10_49_00__0003.wav
OS_9_27_2017_08_19_00__0002.wav  OS_9_27_2017_11_00_00__0004.wav
OS_9_27_2017_08_25_00__0003.wav  OS_9_27_2017_11_05_00__0000.wav
OS_9_27_2017_08_25_00__0004.wav  OS_9_27_2017_11_26_00__0000.wav
OS_9_27_2017_08_30_00__0001.wav  OS_9_27_2017_11_26_00__0002.wav
OS_9_27_2017_08_30_00__0002.wav  OS_9_27_2017_11_32_00__0002.wav
OS_9_27_2017_08_30_00__0003.wav  OS_9_27_2017_11_32_00__0003.wav
OS_9_27_2017_08_35_00__0004.wav  OS_9_27_2017_11_42_00__0000.wav
OS_9_27_2017_09_02_00__0002.wav  OS_9_27_2017_11_53_00__0001.wav
OS_9_27_2017_09_34_00__0002.wav  OS_9_27_2017_11_58_00__0003.wav
OS_9_27_2017_10_28_00__0001.wav


In [None]:
metrics = pd.concat(metrics_list)
metrics.to_csv(metricsFile, sep='\t', index=False)
print("Metrics written to", metricsFile)
print(metrics.set_index(['submission', 'dataset']))
metrics_table = metrics.pivot(index='dataset', columns='submission', values='AUPRC')
with open(resultsFile, 'w') as f:
    f.write(metrics_table.to_markdown())
    print("Results summary written to", resultsFile)

In [None]:
# aggregate and compile results from different submission files 
auprc_curve = pd.concat(auprc_list)
p = sns.FacetGrid(data=auprc_curve, col='dataset', row='submission', margin_titles=True)
p.map(sns.lineplot, 'recall', 'precision')
p.set(ylim=(0.,1.0))
p.set_titles(col_template="{col_name}", row_template="{row_name}")
plt.savefig(plotsFile)
print("Precision-Recall plots written to", plotsFile)

In [None]:

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-testSetDir', default=None, type=str, required=True)
    parser.add_argument('-submissionFiles', default=None, type=str, required=True)
    parser.add_argument('-threshold', default=None, type=float, required=False)
    args = parser.parse_args()

    submissionFiles = args.submissionFiles.split(',')
    metricsFile = Path(submissionFiles[0]).parent/"metrics.tsv"
    resultsFile = Path(submissionFiles[0]).parent/"results.md"
    plotsFile = Path(submissionFiles[0]).parent/"{}.png".format("au_pr_curves")

    # scoring multiple submission files 
    metrics_list, auprc_list = [], []
    for submissionFile in submissionFiles:
        metrics, auprc_curve = score_submission(args.testSetDir, submissionFile, args.threshold)
        # TODO@Akash: include class prevalence as a "no-skill" submission 
        metrics['submission'] = Path(submissionFile).stem
        metrics_list.append(metrics)
        auprc_curve['submission'] = Path(submissionFile).stem
        auprc_list.append(auprc_curve)
    
    metrics = pd.concat(metrics_list)
    metrics.to_csv(metricsFile, sep='\t', index=False)
    print("Metrics written to", metricsFile)
    print(metrics.set_index(['submission', 'dataset']))
    metrics_table = metrics.pivot(index='dataset', columns='submission', values='AUPRC')
    with open(resultsFile, 'w') as f:
        f.write(metrics_table.to_markdown())
        print("Results summary written to", resultsFile)
    
    # aggregate and compile results from different submission files 
    auprc_curve = pd.concat(auprc_list)
    p = sns.FacetGrid(data=auprc_curve, col='dataset', row='submission', margin_titles=True)
    p.map(sns.lineplot, 'recall', 'precision')
    p.set(ylim=(0.,1.0))
    p.set_titles(col_template="{col_name}", row_template="{row_name}")
    plt.savefig(plotsFile)
    print("Precision-Recall plots written to", plotsFile)