In [2]:
from pathlib import Path
import csv
from itertools import groupby
import h5py
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import random
import matplotlib
from matplotlib import pyplot as plt

# Configuration & Utilities

In [3]:
random.seed(42)
COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
ANNOTATIONS_PATH = "/media/xtrem/data/experiments/nicolingua-0001-language-id/language-id-annotations/metadata.csv"
FEATURE_DIRS = [
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/wav2vec_features-c',
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/wav2vec_features-z',
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/retrained-wav2vec_features-c',
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/retrained-wav2vec_features-z'
]

In [51]:
classification_taxonomy = {
    #('utt-verbal-nod', 100),
    #('utt-multi-lingual-named-endity', 6),
    #('utt-multi-lingual', 53),
    0: {
        "group_name": "Utterance Characteristics",
        "classes": {
            0: {
                "class_name": "utt-verbal-nod",
                "tags": {'utt-verbal-nod'}
            },
            1: {
                "class_name": "utt-multi-lingual",
                "tags": {'utt-multi-lingual-named-endity', 'utt-multi-lingual'}
            }
        }
    },
    
    
    #('spkr-single', 108),
    #('spkr-multi', 159),
    #('spkr-mult', 1),
    1: {
        "group_name": "Speaker Count",
        "classes": {
            0: {
                "class_name": "spkr-single",
                "tags": {'spkr-single'}
            },
            1: {
                "class_name": "spkr-multi",
                "tags": {'spkr-multi', 'spkr-mult'}
            }
        }
    },
    
    
    # ('spkr-male', 236),
    # ('spkr-female', 80),
    2: {
        "group_name": "Speakers Genders",
        "classes": {
            0: {
                "class_name": "spkr-male",
                "tags": {'spkr-male'}
            },
            1: {
                "class_name": "spkr-female",
                "tags": {'spkr-female'}
            }
        }
    },
    
    
    
    # ('lng-toma', 1),
    # ('lng-spanish', 2),
    # ('lng-kisi', 8),
    # ('lng-english', 2),
    
    #  ('lng-arabic', 11),
    #  ('lng-french', 73),
    #  ('lng-guerze', 13),
    #  ('lng-koniaka', 19),
    #  ('lng-maninka', 123),
    #  ('lng-pular', 34),
    #  ('lng-susu', 41),
    # ('lng-unknown', 54),
    3: {
        "group_name": "Language",
        "classes": {
            0: {
                "class_name": "lng-arabic",
                "tags": {'lng-arabic'}
            },
            1: {
                "class_name": "lng-french",
                "tags": {'lng-french'}
            },
            2: {
                "class_name": "lng-guerze",
                "tags": {'lng-guerze'}
            },
            3: {
                "class_name": "lng-koniaka",
                "tags": {'lng-koniaka'}
            },
            4: {
                "class_name": "lng-maninka",
                "tags": {'lng-maninka'}
            },
            5: {
                "class_name": "lng-pular",
                "tags": {'lng-pular'}
            },
            6: {
                "class_name": "lng-susu",
                "tags": {'lng-susu'}
            },
            7: {
                "class_name": "lng-unknown",
                "tags": {'lng-unknown'}
            }
        }
    },
    
    
    # ('ct-speech', 216),
    # ('ct-song', 95),
    # ('ct-laughter', 31),
    4: {
        "group_name": "Vocalization Type",
        "classes": {
            0: {
                "class_name": "ct-speech",
                "tags": {'ct-speech'}
            },
            1: {
                "class_name": "ct-song",
                "tags": {'ct-song'}
            },
            2: {
                "class_name": "ct-laughter",
                "tags": {'ct-laughter'}
            }
        }
    },
    
    
    # ('ct-telephone', 55),
    # ('ct-noise', 57),
    5: {
        "group_name": "Channel Characteristics",
        "classes": {
            0: {
                "class_name": "ct-telephone",
                "tags": {'ct-telephone'}
            },
            1: {
                "class_name": "ct-noise",
                "tags": {'ct-noise'}
            }
        }
    },
    
    
    # ('ct-tr-music', 5),
    # ('ct-fg-music', 100),
    # ('ct-bg-music', 64)
    6: {
        "group_name": "Music",
        "classes": {
            0: {
                "class_name": "ct-fg-music",
                "tags": {'ct-fg-music', 'ct-tr-music'}
            },
            1: {
                "class_name": "ct-bg-music",
                "tags": {'ct-bg-music'}
            }
        }
    },
    
    
    # ('ct-edu-islam', 16),
    # ('ct-edu-covid', 4),
    7: {
        "group_name": "Topic",
        "classes": {
            0: {
                "class_name": "ct-edu-islam",
                "tags": {'ct-edu-islam'}
            }
        }
    }
    
}

In [52]:
def to_user_friendly_feature_name(fv_name):
    name = fv_name \
        .replace("features-", "") \
        .replace("wav2vec_", "") \
        .replace("average", "avg") \
        .replace("timestep", "T") \
        .replace("c.", "Context") \
        .replace("z.", "Latent")
    return name

# Load annotations

In [53]:
def get_classification_labels(tag_set):
    labels = []
    for group_id in classification_taxonomy:
        for class_id in classification_taxonomy[group_id]['classes']:
            belongs = len(tag_set.intersection(
                classification_taxonomy[group_id]['classes'][class_id]['tags']
            )) > 0
            if belongs:
                labels.append((group_id, class_id))
    return labels

In [54]:
tags_by_file = {}
def load_annotations(a_file_path, a_specification):
    with open(ANNOTATIONS_PATH) as f:
        reader = csv.DictReader(f)
        for row in reader:
            tag_set = set([t.strip() for t in row['tags'].split(";")])
            tags_by_file[row['file']] = tag_set
            
            labels = get_classification_labels(tag_set)
            yield row['file'], labels


data = list(load_annotations(ANNOTATIONS_PATH, annotation_specification))
random.shuffle(data)
audio_files, audio_labels = zip(*data)

In [55]:
[d for d in data if len(d[1]) == 0]

[]

In [56]:
tags_by_file['0cb889b0-1152-41bf-abf9-cf82841f9d42.wav']

{'lng-unknown'}

## Inspect label counts

In [7]:
def inspect_label_counts():
    for label in annotation_specification:
        count = len([l for l in audio_labels if l == label])
        print("{:10} ({}): {}".format(
            annotation_specification[label]['label'],
            label, 
            count
        ))
inspect_label_counts()

maninka    (0): 114
susu       (1): 32
pular      (2): 28


## Inspect other tags

In [15]:
import random
tags_by_file = {}
tags_counts = {}

with open(ANNOTATIONS_PATH) as f:
    reader = csv.DictReader(f)
    for row in reader:
        tag_set = set([t.strip() for t in row['tags'].split(";")])
        tags_by_file[row['file']] = tag_set
        for tag in tag_set:
            if tag not in tags_counts:
                tags_counts[tag] = 0
            tags_counts[tag] += 1

sorted(tags_counts.items(), key=lambda i: i[0], reverse=True)


[('utt-verbal-nod', 100),
 ('utt-multi-lingual-named-endity', 6),
 ('utt-multi-lingual', 53),
 ('spkr-single', 108),
 ('spkr-multi', 159),
 ('spkr-mult', 1),
 ('spkr-male', 236),
 ('spkr-female', 80),
 ('lng-unknown', 54),
 ('lng-toma', 1),
 ('lng-susu', 41),
 ('lng-spanish', 2),
 ('lng-pular', 34),
 ('lng-maninka', 123),
 ('lng-koniaka', 19),
 ('lng-kisi', 8),
 ('lng-guerze', 13),
 ('lng-french', 73),
 ('lng-english', 2),
 ('lng-arabic', 11),
 ('ct-tr-music', 5),
 ('ct-telephone', 55),
 ('ct-speech', 216),
 ('ct-song', 95),
 ('ct-noise', 57),
 ('ct-laughter', 31),
 ('ct-fg-music', 100),
 ('ct-edu-islam', 16),
 ('ct-edu-covid', 4),
 ('ct-bg-music', 64)]