In [7]:
import pathlib
import os
import pandas as pd

In [8]:
# Locate the repository root by walking up from the current working directory
# until a directory whose name equals code_paths["repo_name"] is reached.
code_paths = {}
code_paths["repo_name"] = "miso-sound-annotate"

code_paths["repo_path"] = os.getcwd()
base_dir = os.path.basename(code_paths["repo_path"])
while base_dir != code_paths["repo_name"]:
    parent_dir = os.path.dirname(os.path.abspath(code_paths["repo_path"]))
    # os.path.dirname() of the filesystem root returns the root itself, so
    # without this guard the loop would spin forever whenever the notebook is
    # started from outside the repository.
    if parent_dir == code_paths["repo_path"]:
        raise FileNotFoundError(
            "Could not find a parent directory named "
            + repr(code_paths["repo_name"])
            + " above "
            + repr(os.getcwd())
        )
    code_paths["repo_path"] = parent_dir
    base_dir = os.path.basename(code_paths["repo_path"])


In [15]:
# Gather every *.txt file found directly inside the repository's "labels" folder.
package_dir = code_paths["repo_path"]

label_dir = pathlib.Path(package_dir) / "labels"
label_file_paths = [txt_path for txt_path in label_dir.glob("*.txt")]


In [85]:
# Parse every label file into one flat list of per-annotation records
# (one dict per labelled interval).
# NOTE(review): each file is assumed to be a headerless, tab-separated table
# with exactly three columns (start, stop, label) whose label text looks like
# "C<salience>-<name>" — confirm against the annotation export format.
all_records = []
for p in label_file_paths:
    anno_df = pd.read_table(p, header=None)
    anno_df.columns = ["Start", "Stop", "Label"]
    anno_df["Duration"] = anno_df["Stop"] - anno_df["Start"]
    # File identifier: the file stem up to (not including) "_labels".
    anno_df["File"] = pathlib.Path(p).stem.split("_labels")[0]

    raw_labels = list(anno_df["Label"].values)
    # Fail with the offending file and labels instead of a bare IndexError /
    # AttributeError from the string splitting below.
    malformed = [
        label
        for label in raw_labels
        if not (isinstance(label, str) and "C" in label and "-" in label)
    ]
    if malformed:
        raise ValueError(
            "Unexpected label format in " + str(p) + ": " + repr(malformed)
        )

    # Salience is the text between the first "C" and the following "-".
    anno_df["Salience"] = [label.split("C")[1].split("-")[0] for label in raw_labels]
    # Split only once so that label names which themselves contain a hyphen
    # are kept whole instead of being cut at the second hyphen.
    anno_df["Label"] = [label.split("-", 1)[1] for label in raw_labels]

    records = anno_df.to_dict("records")
    all_records.extend(records)

In [86]:
# One row per annotation record collected from the label files.
label_timing_df = pd.DataFrame(data=all_records)

In [93]:
# Per-label summary: total and mean duration, number of labelled instances,
# and number of distinct files in which the label occurs.
#
# "Duration" is selected *before* aggregating. Aggregating the whole frame
# also feeds the string columns ("File", "Salience") to sum()/mean(); recent
# pandas versions no longer drop such columns silently and mean() raises a
# TypeError on them.
durations_by_label = label_timing_df.groupby("Label")["Duration"]

# Keep one row per (Label, File) pair so that count() yields the number of
# files containing each label rather than the number of instances.
file_level_durations = label_timing_df.drop_duplicates(
    subset=["Label", "File"]
).groupby("Label")["Duration"]

summary_dfs = [
    durations_by_label.sum().to_frame("Total duration"),
    durations_by_label.mean().to_frame("Mean duration"),
    durations_by_label.count().to_frame("Count of label instances"),
    file_level_durations.count().to_frame("Count of files with label"),
]
summary_df = pd.concat(summary_dfs, axis=1).sort_values(
    by=["Count of files with label"], ascending=False
)

In [97]:
# Save the per-label summary as a Markdown table in the current working directory.
# The encoding is given explicitly so the written bytes do not depend on the
# platform's default locale encoding (label names may contain non-ASCII text).
with open("summary_annotations.md", "w", encoding="utf-8") as f:
    f.write(summary_df.reset_index().to_markdown())

In [118]:
# Render the summary (with "Label" turned back into a regular column) as a
# Markdown table string.
summary_with_label_column = summary_df.reset_index()
summary_table = summary_with_label_column.to_markdown()

In [103]:
import re

In [110]:
# Display the README text as it was read from disk.
# NOTE(review): in the cells shown, `old_readme` is only assigned in a cell
# further down (the README-update cell), so on a fresh top-to-bottom run this
# cell would raise NameError — consider moving it below that cell.
old_readme

'# miso-sound-annotate\n\nGoal: annotating 5+ instances of 10+ audio files (e.g. onset & offset of sound of interest, salience)\n\nSound Search Steps:\n- Populate "current_term_present" and "other_term_present" columns with "yes", "no", or "unsure". Populate "notes" with notable descriptions and any other present sounds. \n- Populate "background_noise_present" column. Use "yes", "no", or "unsure" labels. \n- Continue process for given class until 5 instances have "no" under "background_noise_present", then move to next class. \n\nAudacity Labeling Steps:\n- Download each of the 5 sounds from 10 classes. \n- Highlight and label each sound in audacity (including the "other_term_present" labels). \n- file>export>export labels\n- Save .txt file as "NumberAtTheEndOfURL_labels.txt" (Ex. "180150_labels.txt"). \n\nTaxonomy Labeling Steps:\n- Add each label created in Audacity to the JSON viewer or JSON file directly in GitHub. Use appropriate parent sounds. \n- Commit changes to GitHub.\n\n<!-

In [128]:
# Replace the text between the "start_sync_summary_table" and
# "stop_sync_summary_table" markers in README.md with the freshly rendered
# summary table.
readme_path = pathlib.Path(package_dir, "README.md")

# Read and write as UTF-8 explicitly so the round trip through this cell does
# not depend on the platform's default encoding.
with open(readme_path, encoding="utf-8") as f:
    old_readme = f.read()

start_str = "start_sync_summary_table"
stop_str = "stop_sync_summary_table"
# NOTE(review): presumably the markers sit inside HTML comments in the README;
# these two strings close the comment after the start marker and reopen one
# before the stop marker so the table itself is rendered — confirm in README.md.
end_comment_str = "\n-->\n"
start_comment_str = "\n<!---\n"

# Non-greedy match of everything between the first marker pair; DOTALL lets
# "." span line breaks.
marker_match = re.search(
    re.escape(start_str) + "(.*?)" + re.escape(stop_str), old_readme, re.DOTALL
)
if marker_match is None:
    # Fail with an explicit message (before touching the file) instead of an
    # IndexError from indexing an empty result list.
    raise ValueError(
        "Markers "
        + repr(start_str)
        + " / "
        + repr(stop_str)
        + " not found in "
        + str(readme_path)
    )
result = marker_match.group(1)

old_replace_content = start_str + result + stop_str
new_replace_content = start_str + end_comment_str + summary_table + start_comment_str + stop_str
new_readme = old_readme.replace(old_replace_content, new_replace_content)

with open(readme_path, "w", encoding="utf-8") as f:
    f.write(new_readme)

In [129]:
# Display the README text after the summary-table section was replaced
# (the same string that was written back to README.md).
new_readme

'# miso-sound-annotate\n\nGoal: annotating 5+ instances of 10+ audio files (e.g. onset & offset of sound of interest, salience)\n\nSound Search Steps:\n- Populate "current_term_present" and "other_term_present" columns with "yes", "no", or "unsure". Populate "notes" with notable descriptions and any other present sounds. \n- Populate "background_noise_present" column. Use "yes", "no", or "unsure" labels. \n- Continue process for given class until 5 instances have "no" under "background_noise_present", then move to next class. \n\nAudacity Labeling Steps:\n- Download each of the 5 sounds from 10 classes. \n- Highlight and label each sound in audacity (including the "other_term_present" labels). \n- file>export>export labels\n- Save .txt file as "NumberAtTheEndOfURL_labels.txt" (Ex. "180150_labels.txt"). \n\nTaxonomy Labeling Steps:\n- Add each label created in Audacity to the JSON viewer or JSON file directly in GitHub. Use appropriate parent sounds. \n- Commit changes to GitHub.\n\n<!-

In [117]:
# Display the text that was captured between the two sync markers in the
# README before it was replaced.
result

'\n-->\n\n<!---\n'

In [44]:
import plotly.express as px

# Box plot of annotation durations, one box per label.
px.box(data_frame=label_timing_df, x="Label", y="Duration")

In [43]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One histogram of annotation durations per label, stacked in a single column.
labels = label_timing_df["Label"].unique()
fig = make_subplots(rows=len(labels), cols=1)
for i, label in enumerate(labels):
    # .loc with a boolean mask selects the durations for this label in one step.
    label_durations = label_timing_df.loc[label_timing_df["Label"] == label, "Duration"]
    trace0 = go.Histogram(x=label_durations, nbinsx=10, name=label)
    # add_trace(row=, col=) replaces the deprecated Figure.append_trace;
    # subplot rows are 1-based, hence i + 1.
    fig.add_trace(trace0, row=i + 1, col=1)

fig.show()

In [42]:
# NOTE(review): looks like leftover debugging — displays the loop index `i`
# left behind by the histogram loop; consider deleting this cell.
i

0