In [1]:
from pathlib import Path
import csv
from itertools import groupby
import h5py
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import random
import matplotlib
from matplotlib import pyplot as plt

# Configuration & Utilities

In [2]:
# One seed for every global RNG the notebook relies on, so that a fresh
# "Restart & Run All" reproduces the same shuffles and model fits.
SEED = 42
random.seed(SEED)
# scikit-learn estimators created without an explicit random_state fall back
# to numpy's global RNG, so seed it as well.
np.random.seed(SEED)

# Ten hex colors for plotting (these appear to be matplotlib's "tab10" palette).
COLORS = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
]

# Root directory of the experiment; every other path below hangs off it.
EXPERIMENT_DIR = "/media/xtrem/data/experiments/nicolingua-0001-language-id"

# CSV file holding the per-file annotation tags.
ANNOTATIONS_PATH = f"{EXPERIMENT_DIR}/language-id-annotations/metadata.csv"

# Directories of extracted features, one per feature variant.
FEATURE_DIRS = [
    f"{EXPERIMENT_DIR}/{feature_name}"
    for feature_name in (
        "wav2vec_features-c",
        "wav2vec_features-z",
        "retrained-wav2vec_features-c",
        "retrained-wav2vec_features-z",
    )
]

In [3]:
# Label taxonomy used to turn raw annotation tags into classification labels.
#
# Shape: {group_id: {"group_name": str,
#                    "classes": {class_id: {"class_name": str, "tags": set}}}}
#
# Group and class ids are assigned by position in the spec list below. The
# number after each tag in the comments is the count noted for that tag by
# the original author.
classification_taxonomy = {
    group_id: {
        "group_name": group_name,
        "classes": {
            class_id: {"class_name": class_name, "tags": class_tags}
            for class_id, (class_name, class_tags) in enumerate(class_specs)
        },
    }
    for group_id, (group_name, class_specs) in enumerate([
        # utt-verbal-nod: 100, utt-multi-lingual-named-endity: 6, utt-multi-lingual: 53
        ("Utterance Characteristics", [
            ("Verbal Nod", {'utt-verbal-nod'}),
            ("Multilingual Utterance", {'utt-multi-lingual-named-endity', 'utt-multi-lingual'}),
        ]),

        # spkr-single: 108, spkr-multi: 159, spkr-mult: 1
        ("Speaker Count", [
            ("Single Speaker", {'spkr-single'}),
            ("Multiple Speakers", {'spkr-multi', 'spkr-mult'}),
        ]),

        # spkr-male: 236, spkr-female: 80
        ("Speakers Genders", [
            ("Male Speaker", {'spkr-male'}),
            ("Female Speaker", {'spkr-female'}),
        ]),

        # lng-arabic: 11, lng-french: 73, lng-guerze: 13, lng-koniaka: 19,
        # lng-maninka: 123, lng-pular: 34, lng-susu: 41, lng-unknown: 54
        # Not mapped to any class: lng-toma (1), lng-spanish (2), lng-kisi (8), lng-english (2)
        ("Language", [
            ("Arabic Language", {'lng-arabic'}),
            ("French Language", {'lng-french'}),
            ("Guerze Language", {'lng-guerze'}),
            ("Koniaka Language", {'lng-koniaka'}),
            ("Maninka Language", {'lng-maninka'}),
            ("Pular Language", {'lng-pular'}),
            ("Susu Language", {'lng-susu'}),
            ("Unknown Language", {'lng-unknown'}),
        ]),

        # ct-speech: 216, ct-song: 95, ct-laughter: 31
        ("Vocalization Type", [
            ("Speech", {'ct-speech'}),
            ("Song", {'ct-song'}),
            ("Laughter", {'ct-laughter'}),
        ]),

        # ct-telephone: 55, ct-noise: 57
        ("Channel Characteristics", [
            ("Telephone", {'ct-telephone'}),
            ("Noise", {'ct-noise'}),
        ]),

        # ct-tr-music: 5, ct-fg-music: 100, ct-bg-music: 64
        ("Music", [
            ("Foreground Music", {'ct-fg-music', 'ct-tr-music'}),
            ("Background Music", {'ct-bg-music'}),
        ]),

        # ct-edu-islam: 16
        # Not mapped to any class: ct-edu-covid (4)
        ("Topic", [
            ("Islamic Education", {'ct-edu-islam'}),
        ]),
    ])
}

# Load annotations

In [4]:
def get_classification_labels(tag_set, taxonomy=None):
    """Map a file's annotation tags to ``(group_id, class_id)`` labels.

    A class matches when at least one of its tags is present in ``tag_set``,
    so a single file may receive several classes from the same group.

    Args:
        tag_set: Set of tag strings attached to one audio file.
        taxonomy: Optional taxonomy dict shaped like the module-level
            ``classification_taxonomy`` (``{group_id: {"classes":
            {class_id: {"tags": set}}}}``). Defaults to that module-level
            dict when omitted.

    Returns:
        List of ``(group_id, class_id)`` tuples, in the taxonomy's iteration
        order. Empty when no class matches.
    """
    if taxonomy is None:
        # Resolved at call time so the function can be defined before the
        # taxonomy cell is (re-)run.
        taxonomy = classification_taxonomy

    return [
        (group_id, class_id)
        for group_id, group in taxonomy.items()
        for class_id, class_spec in group['classes'].items()
        if not tag_set.isdisjoint(class_spec['tags'])
    ]

In [5]:
# Raw tag set of every annotated file, keyed by the CSV's "file" column.
# Filled as a side effect while load_annotations() is being iterated.
tags_by_file = {}


def load_annotations(a_file_path, classification_taxonomy):
    """Yield ``(file, labels)`` for every row of an annotations CSV.

    The CSV must have a ``file`` column and a ``tags`` column whose value is a
    ``;``-separated list of tags. Each row's stripped tag set is also stored
    in the module-level ``tags_by_file`` dict.

    Args:
        a_file_path: Path of the annotations CSV to read.
        classification_taxonomy: Accepted for interface compatibility; the
            labels are computed by ``get_classification_labels``, which is
            called with the tag set only.

    Yields:
        ``(file, labels)`` tuples, where ``labels`` is whatever
        ``get_classification_labels`` returns for the row's tag set.
    """
    # Read the path handed in by the caller (previously the argument was
    # ignored in favor of the global ANNOTATIONS_PATH). newline="" is the
    # mode the csv module documents for files passed to a reader.
    with open(a_file_path, newline="") as annotations_file:
        for row in csv.DictReader(annotations_file):
            tag_set = {tag.strip() for tag in row['tags'].split(";")}
            tags_by_file[row['file']] = tag_set
            yield row['file'], get_classification_labels(tag_set)


# Materialize the generator: this reads every CSV row, fills tags_by_file,
# and gives a list of (file, labels) pairs that can be shuffled in place.
data = list(load_annotations(ANNOTATIONS_PATH, classification_taxonomy))
# Shuffle with the `random` module's global RNG (seeded in the configuration cell).
random.shuffle(data)
# Transpose the pairs into two parallel tuples: file names and their label lists.
audio_files, audio_labels = zip(*data)

In [6]:
# Inspect the per-file labels: one list of (group_id, class_id) pairs per file.
# NOTE(review): this echoes the entire tuple; consider a slice such as audio_labels[:5].
audio_labels

([(0, 0), (1, 1), (2, 0), (3, 6), (4, 0), (5, 0), (5, 1)],
 [(0, 0), (0, 1), (1, 1), (2, 0), (3, 1), (3, 4), (4, 0)],
 [(1, 0), (2, 0), (3, 1), (4, 0), (6, 1)],
 [(1, 0), (2, 1), (3, 7), (4, 0), (6, 1)],
 [(1, 1), (2, 0), (3, 0), (3, 1), (3, 6), (4, 0), (4, 1), (5, 1), (6, 0)],
 [(0, 0), (1, 1), (2, 0), (3, 5), (4, 0)],
 [(1, 1), (2, 0), (2, 1), (3, 1), (4, 0), (5, 1)],
 [(1, 0), (2, 0), (3, 6), (4, 0), (5, 1)],
 [(1, 0), (2, 0), (3, 6), (4, 0), (7, 0)],
 [(0, 1), (1, 0), (2, 0), (3, 1), (3, 4), (4, 0), (6, 1)],
 [(0, 1), (1, 0), (3, 1), (3, 7), (4, 1), (6, 0)],
 [(0, 1), (1, 1), (2, 0), (3, 1), (3, 4), (4, 0), (5, 0), (7, 0)],
 [(0, 0), (1, 1), (2, 0), (3, 3), (4, 0)],
 [(0, 0), (1, 1), (2, 0), (3, 4), (4, 0), (5, 1), (6, 0), (6, 1)],
 [(0, 0), (1, 1), (2, 0), (3, 5), (4, 0), (5, 0), (5, 1)],
 [(0, 1), (1, 1), (2, 0), (3, 1), (3, 4), (4, 0), (4, 2), (5, 0), (5, 1)],
 [(0, 1), (1, 0), (2, 0), (3, 1), (3, 2), (4, 0)],
 [(0, 1),
  (1, 1),
  (2, 0),
  (2, 1),
  (3, 1),
  (3, 3),
  (3, 4),

In [30]:
def classification_taxonomy_stats_to_markdown(taxonomy=None, labels=None):
    """Print a Markdown table with the number of files in each taxonomy class.

    One row is printed per class (groups and classes in ascending id order),
    followed by an underscore-filled row after each group.

    Args:
        taxonomy: Optional taxonomy dict shaped like the module-level
            ``classification_taxonomy``. Defaults to that dict.
        labels: Optional sequence with one collection of
            ``(group_id, class_id)`` pairs per file. Defaults to the
            module-level ``audio_labels``.

    Returns:
        None. The table is written to standard output.
    """
    if taxonomy is None:
        taxonomy = classification_taxonomy
    if labels is None:
        labels = audio_labels

    print(f"|Group|Class|Tags|Instances (out of {len(labels)})|")
    print("|--|--|--|--|")
    for group_id in sorted(taxonomy):
        group = taxonomy[group_id]
        str_group = f"{group_id}-{group['group_name']}"

        for class_id in sorted(group['classes']):
            the_class = group['classes'][class_id]
            str_class = f"{class_id}-{the_class['class_name']}"
            # Sort the tags so the cell text does not depend on set ordering.
            str_tags = ", ".join(sorted(the_class['tags']))
            # A file counts once for a class if that (group, class) pair is
            # among its labels.
            instance_count = sum(
                1 for file_labels in labels if (group_id, class_id) in file_labels
            )
            print(f"|{str_group}|{str_class}|{str_tags}|{instance_count}|")

        # Underscore row marks the end of a group in the rendered table.
        print("|_____|_____|_____|_____|")


# Print the per-class statistics table for the loaded annotations.
classification_taxonomy_stats_to_markdown()

|Group|Class|Tags|Tnstances (out of 300)|
|--|--|--|--|
|0-Utterance Characteristics|0-Verbal Nod|utt-verbal-nod|100|
|0-Utterance Characteristics|1-Multilingual Utterance|utt-multi-lingual, utt-multi-lingual-named-endity|59|
|_____|_____|_____|_____|
|1-Speaker Count|0-Single Speaker|spkr-single|108|
|1-Speaker Count|1-Multiple Speakers|spkr-mult, spkr-multi|160|
|_____|_____|_____|_____|
|2-Speakers Genders|0-Male Speaker|spkr-male|236|
|2-Speakers Genders|1-Female Speaker|spkr-female|80|
|_____|_____|_____|_____|
|3-Language|0-Arabic Language|lng-arabic|11|
|3-Language|1-French Language|lng-french|73|
|3-Language|2-Guerze Language|lng-guerze|13|
|3-Language|3-Koniaka Language|lng-koniaka|19|
|3-Language|4-Maninka Language|lng-maninka|123|
|3-Language|5-Pular Language|lng-pular|34|
|3-Language|6-Susu Language|lng-susu|41|
|3-Language|7-Unknown Language|lng-unknown|54|
|_____|_____|_____|_____|
|4-Vocalization Type|0-Speech|ct-speech|216|
|4-Vocalization Type|1-Song|ct-song|95|
|4-Voc

In [8]:
# Train test splits
- Use 5-fold cross-validation
- Shuffle the data and split it into 5 groups
- Repeat 5 times: train on 4 groups and validate on the remaining group, then report the mean and standard deviation of the classification accuracies



SyntaxError: invalid syntax (<ipython-input-8-9e90e567be57>, line 2)

# Multilabel classification neural network
- main convolution trunk
- convolution branches per label group
- Sigmoid output scores
- Binary cross entropy losses


In [None]:
# NOTE(review): presumably the number of training files per fold in 5-fold
# cross-validation over 300 annotated files (all files minus one held-out fifth) — confirm.
300 - 300/5