In [2]:
import os

import torch as th
from torch.utils.data import Dataset, Sampler, DataLoader
from transformers import BertTokenizer, BertModel

from sentence_transformers import SentenceTransformer

import numpy as np
import scipy.linalg as linalg

from tqdm.notebook import tqdm
import matplotlib as mp
import matplotlib.pyplot as plt
import matplotlib.colors as colors

from sklearn.manifold import TSNE, MDS, Isomap
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import seaborn as sns

import faiss

from examples.speech_to_text.data_utils import load_df_from_tsv

In [3]:
# --- Notebook configuration -------------------------------------------------
# Dataset root directory (CoVoST 2, judging by the path).
root = '/mnt/raid0/siqi/datasets/covost2'

# Language codes analysed below. The position of a code in this list is the
# integer label attached to that language's samples.
langs = [
    "fr", "de", "es", "fa", "it", "ru", "pt", "zh-CN", "tr", "ar",
    "et", "mn", "nl", "sv-SE", "lv", "sl", "ta", "ja", "id",
]

# Number of layers for which per-layer features are analysed.
n_layer = 12

# Torch device string.
device = 'cuda:1'

# Local directory holding cached artefacts (datasets, reference features).
os.makedirs('resources', exist_ok=True)

In [4]:
# Tag selecting which feature dump / cache / figure directory is used.
ver = 'std'

class MyDataset(Dataset):
    """Features of a single layer for several languages, labelled by language.

    For every language in ``langs`` all files found in
    ``<resource_dir>/<lang>/`` are loaded with ``th.load`` and the entry at
    index ``layer_id`` of each loaded object is appended, item by item, to
    ``self.data``. ``self.labels`` holds, for each item, the position of its
    language in ``langs``. Items of one language are therefore contiguous and
    languages appear in the order given by ``langs``.

    Args:
        langs: Sequence of language codes (sub-directory names).
        layer_id: Index of the layer to extract from every loaded file.
        resource_dir: Directory containing one sub-directory per language.
            Defaults to ``/mnt/raid0/siqi/analysis/resources-<ver>`` where
            ``ver`` is the module-level version tag.
    """

    def __init__(self, langs, layer_id, resource_dir=None):
        self.langs = langs
        self.layer_id = layer_id
        self.data = []
        self.labels = []
        if resource_dir is None:
            resource_dir = '/mnt/raid0/siqi/analysis/resources-{}'.format(ver)
        for lang_id, lang in enumerate(langs):
            lang_dir = os.path.join(resource_dir, lang)
            # os.listdir() returns entries in arbitrary order; sort them so the
            # sample order is identical on every build and for every layer.
            batch_files = sorted(os.listdir(lang_dir))
            for batch_file in tqdm(batch_files, desc='Layer {} Lang {}'.format(layer_id, lang)):
                # presumably each file holds one batch of per-layer features
                # (indexable by layer) -- TODO confirm against the dump script.
                x_per_layer = th.load(os.path.join(lang_dir, batch_file), map_location='cpu')
                layer_features = x_per_layer[layer_id]
                self.data.extend(layer_features)
                self.labels.extend([lang_id] * len(layer_features))

    def __getitem__(self, idx):
        """Return the ``(feature, language_label)`` pair at position ``idx``."""
        return self.data[idx], self.labels[idx]

    def __len__(self):
        """Return the total number of stored items across all languages."""
        return len(self.data)

# Build the per-layer datasets once and cache them on disk. Later runs reload
# the cache, so the cell works on a fresh checkout as well as on re-runs
# (previously the build step had to be un-commented by hand).
datasets_cache = 'resources/{}_datasets_per_layer.pt'.format(ver)
if os.path.exists(datasets_cache):
    # th.load unpickles MyDataset instances, so the class must be defined
    # before this point and the file must come from a trusted source.
    # NOTE(review): recent PyTorch releases may require weights_only=False to
    # load pickled custom classes -- confirm for the installed version.
    datasets = th.load(datasets_cache)
else:
    datasets = [MyDataset(langs, layer_id) for layer_id in range(n_layer)]
    th.save(datasets, datasets_cache)

In [5]:
# Turn every per-layer dataset into one (n_samples, ...) NumPy array and count
# how many samples each language contributes. The counts are taken from layer
# 0 only.
feature_sets = []
n_labels = [0] * len(langs)
for layer_id in range(n_layer):
    dataset = datasets[layer_id]
    count_labels = (layer_id == 0)
    layer_rows = []
    for ft, label in tqdm(dataset, desc='Layer {}'.format(layer_id)):
        layer_rows.append(ft.numpy())
        if count_labels:
            n_labels[label] += 1
    # Stack the per-sample arrays along a new leading axis.
    feature_sets.append(np.stack(layer_rows, axis=0))

Layer 0:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 1:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 2:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 3:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 4:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 5:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 6:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 7:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 8:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 9:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 10:   0%|          | 0/83360 [00:00<?, ?it/s]

Layer 11:   0%|          | 0/83360 [00:00<?, ?it/s]

In [6]:
# 2-D t-SNE projector shared by the cells below. t-SNE is stochastic, so the
# seed is pinned to make the embeddings (and the figures derived from them)
# reproducible across kernel restarts.
TSNE_SEED = 0
projector = TSNE(
    n_components=2,
    perplexity=60,
    init='pca',
    verbose=2,
    learning_rate='auto',
    random_state=TSNE_SEED,
)

In [8]:
# For each selected layer: embed all samples with t-SNE, split the embedding
# back into per-language chunks and save one KDE joint plot per language pair.
for layer_id in [11]:
    print('Processing Layer {}'.format(layer_id))
    layer_features = feature_sets[layer_id]

    # n_labels is rebound by the reference-feature cell further down; make sure
    # the counts in scope really describe this feature matrix before slicing.
    assert sum(n_labels) == len(layer_features), (
        'n_labels does not match feature_sets[{}]: {} vs {} samples'.format(
            layer_id, sum(n_labels), len(layer_features)))

    # Create the output directory before the (slow) fit, and tolerate re-runs.
    fig_dir = 'figures/{}/layer_{}'.format(ver, layer_id)
    os.makedirs(fig_dir, exist_ok=True)

    dots = projector.fit_transform(layer_features)

    # Samples are stored language by language, so consecutive slices of
    # length n_labels[k] recover each language's points.
    dots_per_lang = []
    cnt = 0
    for lang_id in range(len(langs)):
        dots_per_lang.append(dots[cnt:cnt+n_labels[lang_id]])
        cnt += n_labels[lang_id]

    # One figure per unordered pair (i < j).
    for i, lang_i in enumerate(langs):
        for j in range(i + 1, len(langs)):
            lang_j = langs[j]
            pair_dots = np.concatenate(
                [dots_per_lang[i], dots_per_lang[j]], axis=0
            ).astype(np.float64)
            df = pd.DataFrame({
                'x': pair_dots[:, 0],
                'y': pair_dots[:, 1],
                'lang': [lang_i] * n_labels[i] + [lang_j] * n_labels[j],
            })
            sns.jointplot(data=df, x='x', y='y', hue='lang', kind='kde')
            plt.savefig(os.path.join(fig_dir, '{}_{}_{}.pdf'.format(ver, lang_i, lang_j)))
            # Close the figure so figures do not accumulate in memory.
            plt.close()

Processing Layer 11
[t-SNE] Computing 181 nearest neighbors...
[t-SNE] Indexed 83360 samples in 0.041s...
[t-SNE] Computed neighbors for 83360 samples in 211.250s...
[t-SNE] Computed conditional probabilities for sample 1000 / 83360
[t-SNE] Computed conditional probabilities for sample 2000 / 83360
[t-SNE] Computed conditional probabilities for sample 3000 / 83360
[t-SNE] Computed conditional probabilities for sample 4000 / 83360
[t-SNE] Computed conditional probabilities for sample 5000 / 83360
[t-SNE] Computed conditional probabilities for sample 6000 / 83360
[t-SNE] Computed conditional probabilities for sample 7000 / 83360
[t-SNE] Computed conditional probabilities for sample 8000 / 83360
[t-SNE] Computed conditional probabilities for sample 9000 / 83360
[t-SNE] Computed conditional probabilities for sample 10000 / 83360
[t-SNE] Computed conditional probabilities for sample 11000 / 83360
[t-SNE] Computed conditional probabilities for sample 12000 / 83360
[t-SNE] Computed conditiona



[t-SNE] Iteration 50: error = 70.1111984, gradient norm = 0.0073721 (50 iterations in 33.699s)
[t-SNE] Iteration 100: error = 71.8012390, gradient norm = 0.0048727 (50 iterations in 22.683s)
[t-SNE] Iteration 150: error = 72.1767960, gradient norm = 0.0037976 (50 iterations in 23.704s)
[t-SNE] Iteration 200: error = 72.7978821, gradient norm = 0.0023572 (50 iterations in 25.217s)
[t-SNE] Iteration 250: error = 73.3558197, gradient norm = 0.0035249 (50 iterations in 27.215s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 73.355820
[t-SNE] Iteration 300: error = 2.4213743, gradient norm = 0.0002752 (50 iterations in 37.242s)
[t-SNE] Iteration 350: error = 2.2863576, gradient norm = 0.0001205 (50 iterations in 43.432s)
[t-SNE] Iteration 400: error = 2.2237241, gradient norm = 0.0000829 (50 iterations in 41.758s)
[t-SNE] Iteration 450: error = 2.1755471, gradient norm = 0.0000709 (50 iterations in 39.767s)
[t-SNE] Iteration 500: error = 2.1359429, gradient norm = 0.000

  cset = contour_func(


In [9]:
# Load the cached reference features of every language (one .npy file each),
# remember how many rows each language has, and concatenate everything along
# the sample axis in the order given by `langs`.
per_lang_features = [
    np.load('resources/{}.npy'.format(lang))
    for lang in tqdm(langs)
]
n_labels = [feats.shape[0] for feats in per_lang_features]
all_features = np.concatenate(per_lang_features, axis=0)

  0%|          | 0/19 [00:00<?, ?it/s]

In [10]:
# Embed the reference features with t-SNE, split the embedding back into
# per-language chunks and save one KDE joint plot per language pair.
print('Processing Reference')

# n_labels is also defined by the per-layer cell above; make sure the counts in
# scope really describe all_features before slicing the embedding with them.
assert sum(n_labels) == all_features.shape[0], (
    'n_labels does not match all_features: {} vs {} samples'.format(
        sum(n_labels), all_features.shape[0]))

# Create the output directory before the (slow) fit, and tolerate re-runs.
fig_dir = 'figures/ref'
os.makedirs(fig_dir, exist_ok=True)

dots = projector.fit_transform(all_features)

# Features were concatenated language by language, so consecutive slices of
# length n_labels[k] recover each language's points.
dots_per_lang = []
cnt = 0
for lang_id in range(len(langs)):
    dots_per_lang.append(dots[cnt:cnt+n_labels[lang_id]])
    cnt += n_labels[lang_id]

# One figure per unordered pair (i < j).
for i, lang_i in enumerate(langs):
    for j in range(i + 1, len(langs)):
        lang_j = langs[j]
        pair_dots = np.concatenate(
            [dots_per_lang[i], dots_per_lang[j]], axis=0
        ).astype(np.float64)
        df = pd.DataFrame({
            'x': pair_dots[:, 0],
            'y': pair_dots[:, 1],
            'lang': [lang_i] * n_labels[i] + [lang_j] * n_labels[j],
        })
        sns.jointplot(data=df, x='x', y='y', hue='lang', kind='kde')
        plt.savefig(os.path.join(fig_dir, 'ref_{}_{}.pdf'.format(lang_i, lang_j)))
        # Close the figure so figures do not accumulate in memory.
        plt.close()

Processing Layer 11
[t-SNE] Computing 181 nearest neighbors...
[t-SNE] Indexed 81360 samples in 0.030s...


KeyboardInterrupt: 