In [1]:
from pathlib import Path
import csv
from itertools import groupby
import h5py
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import random
import matplotlib
from matplotlib import pyplot as plt

# Configuration & Utilities

In [2]:
random.seed(42)
COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
ANNOTATIONS_PATH = "/media/xtrem/data/experiments/nicolingua-0001-language-id/language-id-annotations/metadata.csv"
FEATURE_DIRS = [
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/wav2vec_features-c',
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/wav2vec_features-z',
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/retrained-wav2vec_features-c',
    '/media/xtrem/data/experiments/nicolingua-0001-language-id/retrained-wav2vec_features-z'
]

In [3]:
annotation_specification = {
    0: {
        'id': 0,
        'label': "maninka",
        'required_tags': set(['ct-speech', 'lng-maninka']),
        'forbidden_tags':  set(['lng-susu', 'lng-pular'])
    },
    1: {
        'id': 1,
        'label': "susu",
        'required_tags': set(['ct-speech', 'lng-susu']),
        'forbidden_tags':  set(['lng-maninka', 'lng-pular'])
    },
    2: {
        'id': 2,
        'label': "pular",
        'required_tags': set(['ct-speech', 'lng-pular']),
        'forbidden_tags':  set(['lng-susu', 'lng-maninka'])
    }
}

In [4]:
def to_user_friendly_feature_name(fv_name):
    name = fv_name \
        .replace("features-", "") \
        .replace("wav2vec_", "") \
        .replace("average", "avg") \
        .replace("timestep", "T") \
        .replace("c.", "Context") \
        .replace("z.", "Latent")
    return name

# Load annotations

In [5]:
def load_annotations(a_file_path, a_specification):
    with open(ANNOTATIONS_PATH) as f:
        reader = csv.DictReader(f)
        for row in reader:
            tag_set = set([t.strip() for t in row['tags'].split(";")])
            for label in annotation_specification.keys():
                spec = annotation_specification[label]
                if spec['required_tags'].issubset(tag_set):
                    if spec['forbidden_tags'].isdisjoint(tag_set):
                        yield row['file'], label
                        break

data = list(load_annotations(ANNOTATIONS_PATH, annotation_specification))
audio_files, audio_labels = zip(*data)

## Inspect label counts

In [6]:
def inspect_label_counts():
    for label in annotation_specification:
        count = len([l for l in audio_labels if l == label])
        print("{:10} ({}): {}".format(
            annotation_specification[label]['label'],
            label, 
            count
        ))
inspect_label_counts()

maninka    (0): 114
susu       (1): 32
pular      (2): 28


## Balance data

In [7]:
count_per_class = 28
data = list(load_annotations(ANNOTATIONS_PATH, annotation_specification))
balanced_data = []
for label in annotation_specification:
    balanced_data.extend([d for d in data if d[1] == label][:count_per_class])
audio_files, audio_labels = zip(*balanced_data)

In [8]:
inspect_label_counts()

maninka    (0): 28
susu       (1): 28
pular      (2): 28


# Prepare 10 cross validation folds

In [9]:
TRAIN_PERCENT = .6
FOLD_COUNT = 10

n = len(audio_files)
n_train = int(np.ceil(n * .6))
n_test = n - n_train
all_indices = range(n)

cv_folds = {}
train_count_by_index = {i:0 for i in all_indices}
test_count_by_index = {i:0 for i in all_indices}

for fold_index in range(FOLD_COUNT):
    fold_rsampler = np.random.RandomState(seed=fold_index)
    train_index_set = set(fold_rsampler.choice(all_indices, n_train, replace=False))
    test_index_set = set(all_indices).difference(train_index_set)
        
    cv_folds[fold_index] = {
        'train_indices': train_index_set,
        'test_indices': train_index_set,
    }


# Load features

In [10]:
def load_features(audio_files, features_input_dir):
    id_list = []
    features_list = []

    for audio_file_name in audio_files:
        feature_file_name = audio_file_name.replace(".wav", ".h5context")
        feature_path = Path(features_input_dir) / feature_file_name
        with h5py.File(feature_path, 'r') as f:
            features_shape = f['info'][1:].astype(int)
            features = np.array(f['features'][:]).reshape(features_shape)
            # features = pool_feature_last_seq(features)
            features_list.append(features)
    return features_list

In [11]:
raw_features = {}
for feature_dir in FEATURE_DIRS:
    feature_name = Path(feature_dir).stem
    raw_features[feature_name] = load_features(audio_files, feature_dir)

## Inspect feature shapes

In [12]:
for feature_name in raw_features.keys():
    print("feature_name: {}. feature shape: {}".format(
        to_user_friendly_feature_name(feature_name),
        raw_features['wav2vec_features-c'][0].shape
    ))

feature_name: c. feature shape: (2998, 512)
feature_name: z. feature shape: (2998, 512)
feature_name: retrained-c. feature shape: (2998, 512)
feature_name: retrained-z. feature shape: (2998, 512)


## Extract feature vectors

In [13]:
def extract_last_timestep_features(raw_features):
    return raw_features[-1, :]

def extract_neuron_average_features(raw_features):
    return np.mean(raw_features, axis=0)

def identity(x):
    return x

feature_extractors = {
    'last_timestep': extract_last_timestep_features,
    'neuron_average': extract_neuron_average_features,
    'raw_features': identity
}

In [14]:
feature_vectors = {}
for feature_name in raw_features.keys():
    for feature_extractor_name in feature_extractors.keys():
        fv_name = f"{feature_name}__{feature_extractor_name}"
        feature_vectors[fv_name] = []
        for f in raw_features[feature_name]:
            feature_vectors[fv_name].append(
                feature_extractors[feature_extractor_name](f)
            )
            
        feature_vectors[fv_name] = np.array(feature_vectors[fv_name])

## Inspect feature vectors

In [15]:
for fvname in feature_vectors.keys():
    print(fvname, feature_vectors[fvname][0].shape)

wav2vec_features-c__last_timestep (512,)
wav2vec_features-c__neuron_average (512,)
wav2vec_features-c__raw_features (2998, 512)
wav2vec_features-z__last_timestep (512,)
wav2vec_features-z__neuron_average (512,)
wav2vec_features-z__raw_features (2998, 512)
retrained-wav2vec_features-c__last_timestep (512,)
retrained-wav2vec_features-c__neuron_average (512,)
retrained-wav2vec_features-c__raw_features (2998, 512)
retrained-wav2vec_features-z__last_timestep (512,)
retrained-wav2vec_features-z__neuron_average (512,)
retrained-wav2vec_features-z__raw_features (2998, 512)


# Classification Models

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_model_summary import summary
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

In [43]:
class LangIdConvNet(nn.Module):
    def __init__(self):
        super(LangIdConvNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=512, out_channels=16, kernel_size=3)
        self.drop1 = nn.Dropout(p=0.2)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3)
        self.drop2 = nn.Dropout(p=0.2)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=3)
        self.drop3 = nn.Dropout(p=0.2)
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.conv4 = nn.Conv1d(in_channels=16, out_channels=3, kernel_size=3)
        self.pool4 = nn.MaxPool1d(kernel_size=2, stride=2)
        
    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = F.elu(x)
        x = self.drop1(x)
        x = self.pool1(x)
        
        x = self.conv2(x)
        x = F.elu(x)
        x = self.drop2(x)
        x = self.pool2(x)
        
        x = self.conv3(x)
        x = F.elu(x)
        x = self.drop3(x)
        x = self.pool3(x)
        
        x = self.conv4(x)
        x = F.elu(x)
        x = self.pool4(x)
        
        x = torch.mean(x, dim=2)
        
        return x
        

# Train Classification Models

In [21]:
dataset = TensorDataset(torch.tensor(raw_features['wav2vec_features-c']), torch.tensor(audio_labels))
loader = DataLoader(dataset, batch_size=5)

In [None]:
model = LangIdConvNet()
x = torch.tensor(raw_features['wav2vec_features-c'][:10])
print(x.shape)
print(summary(LangIdConvNet(), x, show_input=False))

    

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(500):
    model.train()
    running_loss = 0
    for batch_idx, (x, y) in enumerate(loader):
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(running_loss / len(loader))
        
        
    

torch.Size([10, 2998, 512])
-----------------------------------------------------------------------
      Layer (type)        Output Shape         Param #     Tr. Param #
          Conv1d-1      [10, 16, 2996]          24,592          24,592
         Dropout-2      [10, 16, 2996]               0               0
       MaxPool1d-3      [10, 16, 1498]               0               0
          Conv1d-4      [10, 16, 1496]             784             784
         Dropout-5      [10, 16, 1496]               0               0
       MaxPool1d-6       [10, 16, 748]               0               0
          Conv1d-7       [10, 16, 746]             784             784
         Dropout-8       [10, 16, 746]               0               0
       MaxPool1d-9       [10, 16, 373]               0               0
         Conv1d-10        [10, 3, 371]             147             147
      MaxPool1d-11        [10, 3, 185]               0               0
Total params: 26,307
Trainable params: 26,307
No

In [25]:
for fold_index in cv_folds:
    x_train = 
    print(fold_index)

SyntaxError: invalid syntax (<ipython-input-25-28b029bbbd41>, line 2)

In [None]:
def get_sample_weights(y):
    count_per_class = {k: len(list(g)) for k, g in groupby(sorted(y))}
    class_count = len(count_per_class)
    weight_per_class = {k: 1/class_count/c for k, c in count_per_class.items()}
    return [weight_per_class[yi] for yi in y]

In [None]:
train_size = len(audio_labels) // 2

plt.figure(figsize=(30, 15))

for index, fv_name in enumerate(feature_vectors.keys()):
    X = feature_vectors[fv_name]
    
    x_train = X[:train_size]
    y_train = audio_labels[:train_size]

    x_test = X[train_size:]
    y_test = audio_labels[train_size:]

    svc = SVC(kernel="poly")
    svc.fit(x_train, y_train)

    train_acc = svc.score(x_train, y_train, sample_weight=get_sample_weights(y_train))
    test_acc = svc.score(x_test, y_test, sample_weight=get_sample_weights(y_test))
    print(fv_name, train_acc, test_acc)
    
    print(f'plt.subplot({2}, {int(np.ceil(len(feature_vectors.keys())/2))}, {index+1})')
    plt.subplot(2, int(np.ceil(len(feature_vectors.keys())/2)), index+1)
    
    projected_x = TSNE(n_components=2).fit_transform(feature_vectors['retrained-wav2vec_features-c__neuron_average'])
    plt.scatter(projected_x[:, 0], projected_x[:, 1], c=[COLORS[l] for l in audio_labels])
    title = to_user_friendly_feature_name(fv_name)
    
    title = f"{title} SVM({train_acc:.02%}, {test_acc:.02%})"
    
    plt.title(title)


print(train_acc, test_acc)

In [18]:
import torch