In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

In [8]:
PROJECT_ROOT = os.path.join(os.getcwd(), '..')
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')
RAW_DATA_SUBDIR = os.path.join(DATA_DIR, 'raw')
PROCESSED_DATA_SUBDIR = os.path.join(DATA_DIR, 'processed')
sys.path.append(PROJECT_ROOT)

## DATA CLEANING

The code above has been refactored into reusable functions in the `parsing.py` module. Here's how to use them:

EXAMPLE ANNOTATION FILE FORMAT:

|Selection	| View          | Channel | Begin Time (s) | 	End Time (s)	| Low Freq (Hz)	| High Freq (Hz) | Inband Power (dB FS)	| Species	| Call type	| Rating    | Reference |
|-----------|---------------|---------|----------------|--------------------|---------------|----------------|----------------------|-----------|-----------|-----------|-----------|
|1	        | Spectrogram 1 | 1	      | 1.789365731	   | 1.892598370	    | 5401.800	    | 7438.600	     | -29.01	            | LW	    | AA	    | A	        |           |
|2	        | Spectrogram 1 | 1	      | 2.691422357	   | 2.794654995	    | 5401.800	    | 7438.600	     | -60.31	            | 	        | noise		|           | 1         |
|3	        | Spectrogram 1 | 1	      | 20.557990213   | 20.631727812	    | 6110.200	    | 8589.800	     | -33.82	            | LW	    | AA	    | A	        |           |
|4	        | Spectrogram 1 | 1	      | 26.138457751   | 26.212195350	    | 6110.200	    | 8589.800	     | -58.35	            | 	        | noise		|           | 3         |

COLUMNS TO KEEP:
Begin Time (s), End Time (s), Low Freq (Hz), High Freq (Hz), Inband Power (dB FS), Species, Call type, Rating


In [9]:
# Import the parsing utilities
from src.banana_net.utils.parsing import (
    load_and_process_annotations,
    print_dataset_summary,
    save_processed_dataset
)

all_annotations = []

# Process the same data using the new functions
for dir_name in os.listdir(RAW_DATA_SUBDIR):
    dir_path = os.path.join(RAW_DATA_SUBDIR, dir_name)
    print(f"Processing directory: {dir_path}")
    try:
        processed_dataset = load_and_process_annotations(
            dir_path,
        )
        all_annotations.append(processed_dataset)
    except Exception as e:
        print(f"Error processing {dir_path}: {e}")
        continue

# Combine all processed datasets into one DataFrame
processed_dataset = pd.concat(all_annotations, ignore_index=True)

Processing directory: /home/nhrot-fc/Programming/Python/banana_net/notebooks/../data/raw/bolivian_squirrel_monkey__SB
Processing directory: /home/nhrot-fc/Programming/Python/banana_net/notebooks/../data/raw/howler_monkey__AS
Error processing 20240120_053202.txt: "['Inband Power (dB FS)'] not in index"
Error processing 20240120_065845.txt: No columns to parse from file
Processing directory: /home/nhrot-fc/Programming/Python/banana_net/notebooks/../data/raw/large_headed_capuchin__SM
Processing directory: /home/nhrot-fc/Programming/Python/banana_net/notebooks/../data/raw/night_monkey__AA
Error processing 20240609_174227.txt: 'utf-8' codec can't decode byte 0x80 in position 37: invalid start byte
Error processing 20240609_174409.txt: 'utf-8' codec can't decode byte 0x80 in position 37: invalid start byte
Processing directory: /home/nhrot-fc/Programming/Python/banana_net/notebooks/../data/raw/peruvian_spider_monkey__AC
Error processing 20240409_064044.txt: No columns to parse from file
Proc

In [10]:
unique_species = processed_dataset['specie'].unique()
unique_call_types = processed_dataset['call_type'].unique()
print("\nUnique Species: ", unique_species)
print("\nUnique Call Types: ", unique_call_types)

# Load the audio files to check for typos
SPECIES_TO_CHECK = ['as---------------------as', 'l', '}', '#', '8', '-', 'an', 'sn']
df_check = processed_dataset[processed_dataset['specie'].isin(SPECIES_TO_CHECK)].copy(deep=True)

# Correct the typos in the species names using the directory column
# the directory names are the correct species names the last 2 characters of the directory name
df_check['specie'] = df_check['directory'].str[-2:].str.lower()

# Replace the corrected species names in the main DataFrame
for index, row in df_check.iterrows():
    processed_dataset.loc[index, 'specie'] = row['specie']
    
# Print the corrected DataFrame for verification
print("\nCorrected Unique Species: ", processed_dataset['specie'].unique())
print("\nCorrected Call Types: ", processed_dataset['call_type'].unique())


Unique Species:  ['sb' 'sm' 'as' 'ac' '}' 'as---------------------as' 'sn' 'lw' '#' 'aa'
 'an' 'cc' 'pt' '8' '-']

Unique Call Types:  ['lpc' 'ppc' 'pcc' 'spc' 'sc' 'cc' 'pcs' 'hic' 'phc' 'cp' 'ip' 'pp' 'hc'
 'pc' 'bc' 'chc' 'acc' 'whc' 'fs' 'fc' 'sic' 'gc' 'hm' 'hf' 'sqc'
 'whinnie' 'chcj' 'bp' 'ac' 'dc' 'd' 'cs' 'aa' 'a' 'b' 'ta' 'c' 'tt' 'tr'
 'vc' 'tj' 'tc' 'tf' 't' 'php' 'sqr' 'lw' 'tac' 'tca']

Corrected Unique Species:  ['sb' 'sm' 'as' 'ac' 'lw' 'aa' 'cc' 'pt']

Corrected Call Types:  ['lpc' 'ppc' 'pcc' 'spc' 'sc' 'cc' 'pcs' 'hic' 'phc' 'cp' 'ip' 'pp' 'hc'
 'pc' 'bc' 'chc' 'acc' 'whc' 'fs' 'fc' 'sic' 'gc' 'hm' 'hf' 'sqc'
 'whinnie' 'chcj' 'bp' 'ac' 'dc' 'd' 'cs' 'aa' 'a' 'b' 'ta' 'c' 'tt' 'tr'
 'vc' 'tj' 'tc' 'tf' 't' 'php' 'sqr' 'lw' 'tac' 'tca']


In [11]:
from Levenshtein import distance as levenshtein_distance


def find_potential_typos_per_species(df, species_list, distance_threshold=2):
    """
    Identifies potential typos in uncommon call types by comparing them
    to common call types within the same species using Levenshtein distance.
    """
    potential_typos_report = {}

    for species in species_list:
        species_df = df[df['specie'] == species]
        if species_df.empty:
            print(f"\nNo data for species: {species}")
            continue

        call_type_counts = species_df['call_type'].value_counts()
        uncommon_calls_series = call_type_counts[call_type_counts < 50]
        common_calls_list = call_type_counts[call_type_counts >= 50].index.tolist()

        potential_typos_report[species] = []

        if not common_calls_list:
            print(f"\nNo common call types (>= 200 occurrences) for species: {species} to compare against.")
            # Optionally, list uncommon ones if no common ones exist, though comparison isn't possible
            if not uncommon_calls_series.empty:
                print(f"Uncommon call types for {species} (cannot check for typos without common ones):")
                for call, count in uncommon_calls_series.items():
                    print(f"  - {call} (Count: {count})")
            continue

        if uncommon_calls_series.empty:
            print(f"\nNo uncommon call types (< 200 occurrences) for species: {species}.")
            continue

        print(f"\n--- Species: {species} ---")
        print(f"Common call types: {common_calls_list}")
        print("Checking uncommon call types for potential typos:")

        for uncommon_call, u_count in uncommon_calls_series.items():
            found_similarity = False
            for common_call in common_calls_list:
                dist = levenshtein_distance(str(uncommon_call), str(common_call))
                if 0 < dist <= distance_threshold: # 0 means identical, already handled by count logic
                    similarity_info = {
                        'uncommon_call': uncommon_call,
                        'uncommon_count': u_count,
                        'similar_to_common': common_call,
                        'levenshtein_distance': dist
                    }
                    potential_typos_report[species].append(similarity_info)
                    print(f"  Potential typo: '{uncommon_call}' (Count: {u_count}) is similar to common call '{common_call}' (Distance: {dist})")
                    found_similarity = True
            if not found_similarity:
                print(f"  Uncommon: '{uncommon_call}' (Count: {u_count}) - No close common call type found within threshold.")
                # Add it to report as checked but not similar if needed, or just ignore
                # For now, we only report actual potential typos.

    return potential_typos_report


In [12]:
levenshtein_threshold = 1

potential_typos_report = find_potential_typos_per_species(
    processed_dataset,
    processed_dataset['specie'].unique(),
    distance_threshold=levenshtein_threshold
)

print("\n\n--- Summary of Potential Typos ---")
any_potential_typos_found = False
for species, typos in potential_typos_report.items():
    if typos:
        any_potential_typos_found = True
        print(f"\nSpecies: {species}")
        for typo_info in typos:
            print(f"  - Uncommon '{typo_info['uncommon_call']}' (Count: {typo_info['uncommon_count']}) "
                    f"might be a typo of common '{typo_info['similar_to_common']}' "
                    f"(Distance: {typo_info['levenshtein_distance']})")
if not any_potential_typos_found:
    print("No potential typos found based on the criteria and threshold.")


--- Species: sb ---
Common call types: ['ppc', 'spc', 'pcc', 'sc']
Checking uncommon call types for potential typos:
  Potential typo: 'lpc' (Count: 49) is similar to common call 'ppc' (Distance: 1)
  Potential typo: 'lpc' (Count: 49) is similar to common call 'spc' (Distance: 1)
  Potential typo: 'pcs' (Count: 8) is similar to common call 'pcc' (Distance: 1)
  Potential typo: 'phc' (Count: 3) is similar to common call 'ppc' (Distance: 1)
  Potential typo: 'phc' (Count: 3) is similar to common call 'pcc' (Distance: 1)
  Potential typo: 'cc' (Count: 1) is similar to common call 'pcc' (Distance: 1)
  Potential typo: 'cc' (Count: 1) is similar to common call 'sc' (Distance: 1)
  Uncommon: 'hic' (Count: 1) - No close common call type found within threshold.

--- Species: sm ---
Common call types: ['cc', 'fs', 'hic', 'pc', 'sc', 'fc', 'whc']
Checking uncommon call types for potential typos:
  Potential typo: 'ppc' (Count: 11) is similar to common call 'pc' (Distance: 1)
  Potential typo: '

In [13]:
# Drop uncommon combinations species and call_type 
UNCOMMON_THRESHOLD = 200
processed_dataset = processed_dataset.groupby(['specie', 'call_type']).filter(
    lambda x: len(x) >= UNCOMMON_THRESHOLD
)

# Finally Order the dataset by specie -> recording_file -> begin_time
processed_dataset = processed_dataset.sort_values(
    by=['specie', 'recording_file', 'begin_time'],
    ascending=[True, True, True]
).reset_index(drop=True) 

# Save the processed dataset
save_processed_dataset(
    processed_dataset,
    os.path.join(PROCESSED_DATA_SUBDIR, 'processed_dataset.csv')
)

Dataset saved to: /home/nhrot-fc/Programming/Python/banana_net/notebooks/../data/processed/processed_dataset.csv


## DATA TRANSFORMATION

### Example: Process Weddell's Tamarin data

| index | begin_time | end_time	| low_freq	| high_freq	| inband_power	| species	| call_type |	recording_file      |
|-------|------------|----------|-----------|-----------|---------------|-----------|-----------|-----------------------|
| 0	    | 1.336330	 | 1.865889	| 7560.000	| 10080.000 |	-46.04	    | lw	    | cs        |	20240117_162607.wav |
| 1	    | 1.950137	 | 2.449607	| 7766.599	| 10801.822 |	-41.65	    | lw	    | cs        |	20240117_162607.wav |
| 2	    | 2.527837	 | 3.057396	| 8212.955	| 10444.737 |	-41.20	    | lw	    | cs        |	20240117_162607.wav |
| 3	    | 3.135626	 | 3.611026	| 8034.413	| 10266.194 |	-40.34	    | lw	    | cs        |	20240117_162607.wav |
| 4	    | 3.767486	 | 4.182708	| 7588.057	| 9552.024  |	-41.43	    | lw	    | cs        |	20240117_162607.wav |

YOLO-like tensor format:
Our system models detection as a regres-
sion problem. It divides the image into an S x S grid and for each
grid cell predicts B bounding boxes, confidence for those boxes,
and C class probabilities. These predictions are encoded as an
S x S x (B * 5 | C) tensor.

The 5 in the B * 5 term corresponds to the bounding box coordinates (x, y, width, height) and the confidence score.

In [14]:
DATASET_PATH = os.path.join(PROCESSED_DATA_SUBDIR, 'processed_dataset.csv')

call_data = pd.read_csv(DATASET_PATH)

call_data.head(10)

Unnamed: 0,begin_time,end_time,low_freq,high_freq,inband_power,specie,call_type,recording_file,directory
0,0.698818,0.772731,0.0,3739.187,-70.48,aa,gc,20240117_051057.wav,night_monkey__AA
1,1.211587,1.282141,70.551,4338.868,-72.05,aa,gc,20240117_051057.wav,night_monkey__AA
2,1.727301,1.796175,70.551,4338.868,-68.17,aa,gc,20240117_051057.wav,night_monkey__AA
3,3.330185,3.395699,105.826,4444.694,-71.47,aa,gc,20240117_051057.wav,night_monkey__AA
4,3.861571,3.940524,176.377,4338.868,-73.33,aa,gc,20240117_051057.wav,night_monkey__AA
5,4.382325,4.466317,176.377,4233.042,-73.62,aa,gc,20240117_051057.wav,night_monkey__AA
6,4.967517,5.033031,105.826,4127.216,-70.46,aa,gc,20240117_051057.wav,night_monkey__AA
7,6.105379,6.170894,70.551,4621.071,-72.8,aa,gc,20240117_051057.wav,night_monkey__AA
8,6.527022,6.597576,35.275,4303.593,-68.81,aa,gc,20240117_051057.wav,night_monkey__AA
9,6.936906,7.019218,35.275,5009.1,-67.18,aa,gc,20240117_051057.wav,night_monkey__AA
