In [1]:
import os
from matplotlib import pyplot as plt
import time
import numpy as np
import pandas as pd
import math
import glob
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torchaudio
import torchaudio.functional as torchaudio_F
import torchaudio.transforms as torchaudio_T
from torchaudio.backend.soundfile_backend import load
import tqdm

from src.dataset_v2 import *
from src.model import CPC, CPC_classifier_v3

In [2]:
# Audio lives under the current user's home directory; the CSV split files
# sit next to the notebook.
home_dir = os.path.expanduser('~')
wav_path = os.path.join(home_dir, 'Jupyter-data/Switchboard-DA')
trainset_path, testset_path = './train.csv', './test.csv'

# The label set returned for the training split is passed to the test-split
# call so both frames are built against the same `top_labels`.
df_train, top_labels = build_dataset(wav_path, trainset_path, numeric_only=False)
df_test, _ = build_dataset(wav_path, testset_path, top_labels=top_labels, numeric_only=False)

[('x', 29998), ('sd', 19364), ('b', 10028), ('sv', 7428), ('+', 4919), ('%', 4294), ('aa', 3014), ('ba', 1112), ('qy', 1043), ('ny', 729)]


## Get CPC embedding

In [None]:
# Build the CPC network, restore the weights stored in the checkpoint file,
# then move the model to the GPU (the last expression also displays it).
model = CPC(decoder_heads=12)
cpc_state_dict = torch.load('./libri-182794-fw3-h12-ep4.pth')
model.load_state_dict(cpc_state_dict)
model.cuda()

In [3]:
# Extraction settings: which split to embed and how many clips per batch.
df_target_dataset = df_test
batch_size = 16

# shuffle=False keeps the batches in dataset order, which the later
# column-wise concat with `df_target_dataset` relies on.
dataset = Switchboard_Dataset_v1(df_target_dataset)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=padding_tensor_extractor_v1,
)
print(len(dataset))

15139


In [21]:
# Get CPC embedding
# For every clip, keep the context vector c_n at the clip's last frame
# (batches are padded, so the last time step of the batch is not the last
# frame of a shorter clip).
model.eval()

c_t_collect = []

with torch.no_grad():
    for x, x_len in tqdm.tqdm(dataloader):
        phi_n, c_n = model(x.to('cuda'))

        # Last-frame index = floor(x_len * 100 / 8000) - 1.
        # NOTE(review): 8000 / 100 presumably are the sample rate and the
        # context frame rate -- confirm against the model's downsampling.
        # Integer arithmetic is used on purpose: the float form
        # (x_len / 8000 * 100) truncates e.g. 2320 -> 28.999... -> 28 and
        # lands one frame early.
        n_frames = np.asarray(x_len).astype(np.int64) * 100 // 8000
        # A clip shorter than one frame would give -1, which silently wraps
        # to the last (padded) time step; clamp it to the first frame.
        t = np.maximum(n_frames - 1, 0)

        # Gather c_n[i, t[i], :] for the whole batch in a single indexing op.
        rows = torch.arange(c_n.shape[0], device=c_n.device)
        cols = torch.as_tensor(t, dtype=torch.long, device=c_n.device)
        c_t = c_n[rows, cols].float().cpu()

        c_t_collect.append(c_t)

# One row per clip, in dataloader order.
tensot_c_t = torch.cat(c_t_collect)

100%|█████████████████████████████████████████| 947/947 [00:31<00:00, 30.25it/s]


In [22]:
# Attach the extracted CPC features to the utterance metadata and save them.
meta_columns = ['dialog_id', 'speaker', 'da_tag', 'start_time', 'end_time', 'label']
feature_frame = pd.DataFrame(tensot_c_t.numpy())

# Guard against stale notebook state (features computed for another split).
if len(feature_frame) != len(df_target_dataset):
    raise ValueError(
        f'feature rows ({len(feature_frame)}) do not match '
        f'dataset rows ({len(df_target_dataset)})'
    )

# pd.concat(axis=1) aligns on index labels, not on position.  The feature
# frame has a fresh 0..n-1 index, so the metadata index is reset as well;
# otherwise a non-default index would misalign rows and introduce NaNs.
df_extracted_feature = pd.concat(
    [df_target_dataset[meta_columns].reset_index(drop=True), feature_frame],
    axis=1,
)
df_extracted_feature.to_csv('./feature_cpc_182794_test.csv', index=False)

## Get CLF embedding

In [3]:
# Instantiate the classifier, restore its checkpoint, and put it on the GPU
# in evaluation mode.
model_clf = CPC_classifier_v3(phi_dim=256, c_dim=128, rnn_num_layers=1)
clf_state_dict = torch.load('./sw-clfv3-vF-step160k.pth')
model_clf.load_state_dict(clf_state_dict)
model_clf.cuda().eval()
print('ready')

ready


In [7]:
class CLF_Head_no_softmax(nn.Module):
    """MLP head made of two Linear -> BatchNorm1d -> ReLU stages.

    Every stage keeps the feature width at ``input_dim``; there is no final
    projection or softmax layer, so the module returns the activations of
    the second stage.

    Args:
        input_dim: Width of the input features and of both stages.
        output_dim: Accepted for signature compatibility; not used.
    """

    def __init__(self, input_dim=128, output_dim=10):
        super().__init__()

        # Layers land at Sequential indices 0-5, so parameter/buffer names
        # are ``mlp.0.*``, ``mlp.1.*``, ``mlp.3.*`` and ``mlp.4.*``.
        stages = []
        for _ in range(2):
            stages.extend([
                nn.Linear(input_dim, input_dim),
                nn.BatchNorm1d(input_dim),
                nn.ReLU(inplace=True),
            ])
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` (expected shape ``(batch, input_dim)``) through the MLP."""
        return self.mlp(x)

decoder = CLF_Head_no_softmax()

# Copy the trained weights (parameters and BatchNorm buffers) from the
# classifier into the decoder.  Entries are matched by their exact
# state-dict name; classifier entries the decoder does not have are ignored.
state_dict_clf = model_clf.state_dict()
decoder_keys = list(decoder.state_dict().keys())
state_dict_decoder = {
    name: state_dict_clf[name] for name in decoder_keys if name in state_dict_clf
}

# Fail loudly instead of silently keeping randomly initialised layers when a
# name does not match (e.g. the head lives under a submodule prefix in
# `model_clf`).
missing_keys = [name for name in decoder_keys if name not in state_dict_decoder]
if missing_keys:
    raise RuntimeError(
        f'decoder entries not found in model_clf: {missing_keys}; '
        'check the parameter-name prefix used by the classifier'
    )

decoder.load_state_dict(state_dict_decoder)
decoder.cuda()
# The decoder contains BatchNorm1d layers: without eval() they normalise with
# per-batch statistics and keep updating their running stats during feature
# extraction.  eval() returns the module, so the cell still displays it.
decoder.eval()

CLF_Head_no_softmax(
  (mlp): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=128, out_features=128, bias=True)
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
)

In [10]:
# Choose the split to embed; swap the two assignments to process the
# training set instead.
# df_target_dataset = df_train
df_target_dataset = df_test
batch_size = 32

# shuffle=False keeps the batches in dataset order.
dataset = Switchboard_Dataset_trainer_v3(df_target_dataset)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=padding_tensor_trainer_v3,
)

print(len(dataset))

15139


In [16]:
# Get clf result
# Force inference mode on both modules, mirroring the CPC extraction cell.
# `decoder` contains BatchNorm1d layers: in training mode they would
# normalise with per-batch statistics (so an embedding would depend on the
# other clips in its batch) and would update their running stats.
model_clf.eval()
decoder.eval()

c_t_collect = []

with torch.no_grad():
    for x, y in tqdm.tqdm(dataloader):
        # Channel 0 and channel 1 of the input go through separate encoders;
        # their outputs are concatenated along dim 1.
        phi_n = model_clf.encoder(x[:, 0:1, :].to('cuda'))
        ind_n = model_clf.encoder2(x[:, 1:2, :].to('cuda'))
        phi_cat = torch.cat((phi_n, ind_n), 1)

        # Swap the last two axes before the autoregressive module, then feed
        # the first entry of its second output to the decoder.
        c_n, h_n = model_clf.auto_regressive(phi_cat.permute(0, 2, 1))
        embed = decoder(h_n[0])

        c_t_collect.append(embed.cpu())

# One row per clip, in dataloader order.
tensot_c_t = torch.cat(c_t_collect)

100%|█████████████████████████████████████████| 474/474 [00:35<00:00, 13.23it/s]


In [17]:
# Attach the extracted classifier features to the utterance metadata and save.
meta_columns = ['dialog_id', 'speaker', 'da_tag', 'start_time', 'end_time', 'label']
feature_frame = pd.DataFrame(tensot_c_t.numpy())

# Guard against stale notebook state (features computed for another split).
if len(feature_frame) != len(df_target_dataset):
    raise ValueError(
        f'feature rows ({len(feature_frame)}) do not match '
        f'dataset rows ({len(df_target_dataset)})'
    )

# pd.concat(axis=1) aligns on index labels, not on position.  The feature
# frame has a fresh 0..n-1 index, so the metadata index is reset as well;
# otherwise a non-default index would misalign rows and introduce NaNs.
df_extracted_feature = pd.concat(
    [df_target_dataset[meta_columns].reset_index(drop=True), feature_frame],
    axis=1,
)
df_extracted_feature.to_csv('./feature_clfvF_160k_test.csv', index=False)