In [1]:
import pandas as pd
import numpy as np
import pickle as pkl

In [2]:
!nvidia-smi

Sun Mar 26 14:05:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| 36%   60C    P0   149W / 350W |    402MiB / 24576MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load GoEmotions and Audio Datasets (CREMA, TESS, ETC)

In [3]:
train_audio = pkl.load(open('./data/c4ai_clip/train_audio.pkl', "rb"))[['path', 'label']]
test_audio = pkl.load(open('./data/c4ai_clip/test_audio.pkl', "rb"))[['path', 'label']]
train_text = pkl.load(open('./data/c4ai_clip/train_text.pkl', "rb"))[['text', 'grouped_label']]
test_text = pkl.load(open('./data/c4ai_clip/test_text.pkl', "rb"))[['text', 'grouped_label']]

In [4]:
#!unzip ./data/c4ai_clip/audio_emo_resampled.zip -d ./audio

In [5]:
def norm_labels(x):
    if x == "afraid":
        return "fear"
    elif x == "angry":
        return "anger"
    elif x == "disgusted":
        return "disgust"
    elif x == "sad":
        return "sadness"
    else:
        return x

In [6]:
train_audio["label"] = train_audio["label"].apply(norm_labels)
test_audio["label"] = test_audio["label"].apply(norm_labels)

In [7]:
test_audio

Unnamed: 0,path,label
11545,./audio/audio_emo/crema.man.sad.465.wav,sadness
1970,./audio/audio_emo/ravdass.man.sad.74.wav,sadness
6326,./audio/audio_emo/tess.woman.surprised.370.wav,surprise
11579,./audio/audio_emo/tess.woman.neutral.110.wav,neutral
9502,./audio/audio_emo/crema.woman.happy.586.wav,joy
...,...,...
8732,./audio/audio_emo/crema.man.angry.530.wav,anger
5386,./audio/audio_emo/tess.woman.happy.322.wav,joy
83,./audio/audio_emo/crema.man.afraid.455.wav,fear
6755,./audio/audio_emo/crema.woman.disgusted.239.wav,disgust


## Load Meld

In [8]:
def load_split_meld(split):
    assert split in ['train', 'test', 'dev']
    
    df = pd.read_csv(f"./meld_raw/{split}_splits/{split}_sent_emo.csv")
    df['path'] = df.apply(lambda x: f"./meld_raw/{split}_splits/audios/dia{x['Dialogue_ID']}_utt{x['Utterance_ID']}.wav".strip('\n'), axis=1)
    
    return (
        df[['path', 'Emotion']]
        .rename(columns={'Emotion':'label'})
    )

In [9]:
df_meld_train = load_split_meld('train')
df_meld_dev = load_split_meld('dev')
df_meld_test = load_split_meld('test')

In [10]:
df_meld_dev

Unnamed: 0,path,label
0,./meld_raw/dev_splits/audios/dia0_utt0.wav,sadness
1,./meld_raw/dev_splits/audios/dia0_utt1.wav,surprise
2,./meld_raw/dev_splits/audios/dia1_utt0.wav,neutral
3,./meld_raw/dev_splits/audios/dia1_utt1.wav,joy
4,./meld_raw/dev_splits/audios/dia1_utt2.wav,sadness
...,...,...
1104,./meld_raw/dev_splits/audios/dia113_utt9.wav,sadness
1105,./meld_raw/dev_splits/audios/dia113_utt10.wav,sadness
1106,./meld_raw/dev_splits/audios/dia113_utt11.wav,sadness
1107,./meld_raw/dev_splits/audios/dia113_utt12.wav,sadness


## Load Iemocap

In [11]:
def load_iemocap(path):
    def normalize_labels(label):
        if label == 'neu':
            return 'neutral'
        elif label == 'sad' or label == 'fru':
            return 'sadness'
        elif label == 'fea':
            return 'fear'
        elif label == 'dis':
            return 'disgust'
        elif label == 'sur':
            return 'surprise'
        elif label == 'ang':
            return 'anger'
        elif label == 'hap' or label == 'exc':
            return 'joy'
        else:
            return 'xxx'
        
    df = pd.read_csv(f'{path}/df_iemocap.csv')[['wav_file', 'emotion']]
    df['label'] = df['emotion'].apply(normalize_labels)
    print(df['label'].unique())
    df = df[df['label'] != 'xxx']
    df = df[['wav_file', 'label']]
    df['wav_file'] = df['wav_file'].apply(lambda x: f"{path}/audios/{x}.wav")
    df['split'] = df['wav_file'].apply(lambda x: 'train' if not '05' in x.split('_')[0][:-1] else 'test')
    df = df.rename(columns={'wav_file':'path'})
    train, test = df[df['split'] == 'train'], df[df['split'] == 'test']
    return train, test

In [12]:
#!unzip /content/drive/MyDrive/iemocap.zip -d .

In [13]:
#df = pd.read_csv(f'./df_iemocap.csv')[['wav_file', 'emotion']]

In [14]:
#df["emotion"].unique()

In [15]:
df_iemocap_train, df_iemocap_test = load_iemocap('.')

['neutral' 'xxx' 'sadness' 'anger' 'joy' 'surprise' 'fear' 'disgust']


In [16]:
df_iemocap_train['label'].unique()

array(['neutral', 'sadness', 'anger', 'joy', 'surprise', 'fear',
       'disgust'], dtype=object)

## Join datasets (audio)

In [17]:
# df_iemocap_train, df_iemocap_test, df_meld_train, df_meld_dev, df_meld_test
#train_audio 
#test_audio
#train_text 
#test_text

In [18]:
from os.path import exists

#file_exists = exists(path_to_file)

In [19]:
df_train_audio = pd.concat([df_iemocap_train[['path', 'label']], df_meld_train, train_audio], axis=0)

In [20]:
#df_train_audio = df_iemocap_train[['path', 'label']]

In [21]:
df_dev_audio = pd.concat([df_meld_dev, test_audio], axis=0)

In [22]:
#df_dev_audio = df_iemocap_test[['path', 'label']]
df_dev_audio = test_audio

In [23]:
len(df_train_audio)

25213

In [24]:
len(df_dev_audio)

2337

In [25]:
aud = []
for f in df_train_audio["path"]:
    if not exists(f):
        aud.append(f)
df_train_audio = df_train_audio[~df_train_audio['path'].isin(aud)]

In [26]:
aud = []
for f in df_dev_audio["path"]:
    if not exists(f):
        aud.append(f)
df_dev_audio = df_dev_audio[~df_dev_audio['path'].isin(aud)]

In [27]:
len(df_dev_audio)

2337

In [28]:
df_train_audio['label'].unique()

array(['neutral', 'sadness', 'anger', 'joy', 'surprise', 'fear',
       'disgust'], dtype=object)

In [29]:
df_dev_audio['label'].unique()

array(['sadness', 'surprise', 'neutral', 'joy', 'disgust', 'anger',
       'fear'], dtype=object)

In [30]:
train_text['grouped_label'].unique()

array(['neutral', 'anger', 'fear', 'surprise', 'joy', 'sadness',
       'disgust'], dtype=object)

In [31]:
print(len(train_text), len(df_train_audio), len(test_text), len(df_dev_audio))


43410 25212 5427 2337


In [32]:
from sklearn.preprocessing import LabelEncoder

lab_encoder = LabelEncoder()
lab_encoder.fit(df_train_audio['label'].unique())

In [33]:
import torch
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm

In [34]:
import torch
import torch.nn as nn

class SupConLoss(nn.Module):
    """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.
    It also supports the unsupervised contrastive loss in SimCLR"""
    def __init__(self, temperature=0.2, contrast_mode='all',
                 base_temperature=0.2):
        super(SupConLoss, self).__init__()
        self.temperature = temperature
        self.contrast_mode = contrast_mode
        self.base_temperature = base_temperature

    def forward(self, features, labels=None, mask=None, temperature=None, base_temperature=None):
        """Compute loss for model. If both `labels` and `mask` are None,
        it degenerates to SimCLR unsupervised loss:
        https://arxiv.org/pdf/2002.05709.pdf
        Args:
            features: hidden vector of shape [bsz, n_views, ...].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
        Returns:
            A loss scalar.
        """
        if temperature == None:
            temperature = self.temperature
        if base_temperature == None:
            base_temperature = self.base_temperature
        device = (torch.device('cuda')
                  if features.is_cuda
                  else torch.device('cpu'))

        if len(features.shape) < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...],'
                             'at least 3 dimensions are required')
        if len(features.shape) > 3:
            features = features.view(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError('Num of labels does not match num of features')
            mask = torch.eq(labels, labels.T).float().to(device)
        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)
        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)

        # loss
        loss = - (temperature/base_temperature) * mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss

In [35]:
class EmbeddingPropagation(nn.Module):
    """Embedding Propagation"""
    def __init__(self, in_dim=2048, emb_dim=384, graph_emb=128, k=128, num_classes=7, eps=1e-8, keep_rate=0.8, alpha=0.9):
        super(EmbeddingPropagation, self).__init__()
        self.k = k
        self.keep_rate = keep_rate
        self.num_classes = num_classes
        self.g_enc = nn.Sequential(nn.Linear(in_dim, graph_emb), nn.GELU(), nn.Linear(graph_emb, graph_emb))
        self.sigma_enc = nn.Sequential(nn.Linear(graph_emb, graph_emb//4), nn.GELU(), nn.Linear(graph_emb//4, 1))
        self.eps = eps
        self.alpha = nn.Parameter(torch.tensor([alpha]).cuda(0), requires_grad=True)
        #self.alpha = nn.Parameter(torch.tensor([alpha]), requires_grad=True)

    def graph_construction(self, inputs, k=128, keep_rate=0.6):
        # Get Graph Embeddings
        emb_all = self.g_enc(inputs)
        N, d    = emb_all.shape[0], emb_all.shape[1]

        self.sigmas = self.sigma_enc(emb_all)
        
        # Get adjacency matrix
        emb_all_sigma = emb_all / (self.sigmas+self.eps) # N*d
        W = torch.cdist(emb_all_sigma, emb_all_sigma)
        W = torch.exp(-W/2)

        # Keep topk nodes for neighborhood
        _, indices = torch.topk(W, k)

        mask = torch.zeros_like(W)
        mask = mask.scatter(1, indices, 1)
        mask = ((mask+torch.t(mask))>0).type(torch.float32)      # union, kNN graph
        W = W * mask

        # Dropout edges
        if self.training == True and keep_rate < 1.0:
            dropout_mask = torch.rand(*W.shape, requires_grad=True).cuda() < self.keep_rate
            #dropout_mask = torch.rand(*W.shape, requires_grad=True) < self.keep_rate
            W = W * dropout_mask
            
        # Graph Adjacency matrix normalization
        D = W.sum(0)
        D_sqrt_inv = torch.sqrt(1.0/(D+self.eps))
        D1 = torch.unsqueeze(D_sqrt_inv,1).repeat(1,N)
        D2 = torch.unsqueeze(D_sqrt_inv,0).repeat(N,1)
        S = D1*W*D2

        return S

    def forward(self, inputs):

        N = inputs.shape[0]

        S = self.graph_construction(inputs, k=self.k, keep_rate=self.keep_rate)
        x = torch.matmul(self.alpha*torch.inverse(torch.eye(N).cuda(0)-(1.0-self.alpha)*S + self.eps), inputs)

        return x, S

In [36]:
#text_enc

In [37]:
#!pip install transformers
from transformers import AutoTokenizer, AutoModel

class TextEncoder(nn.Module):

    def __init__(self, model_name, max_len):
        super(TextEncoder, self).__init__()

        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        _ = self.tokenizer.add_tokens(['[NAME]', '[RELIGION]'], special_tokens=True)
        self.encoder = AutoModel.from_pretrained(model_name)
        self.encoder.resize_token_embeddings(len(self.tokenizer))
        #self.encoder = text_enc
 
    def forward(self, sentences):

        x = self.tokenizer(sentences, padding='max_length', truncation=True, return_tensors='pt', max_length=self.max_len)
        x = {
            "input_ids":x["input_ids"].to(0),
            "attention_mask":x["attention_mask"].to(0)
        }
        x = self.encoder(**x)[0]
        x = x[:, 0, :]

        return x

In [38]:
#model_text = TextEncoder('sentence-transformers/paraphrase-MiniLM-L3-v2', max_len=60)
#model_text.to(0)
#model_text(["hello everyone", "i am a person ok"]).shape

In [39]:
import torch
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torchaudio
import numpy as np
from tqdm import tqdm

def load_audio(path):
    wavform, _ = torchaudio.load(path)
    output = torch.mean(wavform, dim=0)
    return np.array(output, dtype=float)

class AudioEncoder(nn.Module):

    def __init__(self, model_name):
        super(AudioEncoder, self).__init__()
        self.encoder = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, output_hidden_states=True)
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
 
    def forward(self, audio_paths):
        
        audios = list(map(load_audio, audio_paths))
        
        inputs = self.feature_extractor(audios, sampling_rate=16000, padding=True, return_tensors="pt")
        
        input_values = inputs["input_values"].to(0)
        x = torch.mean(self.encoder(input_values=input_values).hidden_states[-1], dim=1)
        return x

## MFCC Extractor and KMeans Hidden units

In [40]:
import torch
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torchaudio
import numpy as np
from tqdm import tqdm
import functools
import math

def get_feats(x, sr):
    #x = torch.from_numpy(x).float()
    x = x.view(1, -1)

    mfccs = torchaudio.compliance.kaldi.mfcc(
        waveform=x,
        sample_frequency=sr,
        use_energy=True,
    )  # (time, freq)
    mfccs = mfccs.transpose(0, 1)  # (freq, time)
    deltas = torchaudio.functional.compute_deltas(mfccs)
    ddeltas = torchaudio.functional.compute_deltas(deltas)
    concat = torch.cat([mfccs, deltas, ddeltas], dim=0)
    concat = concat.transpose(0, 1).contiguous()  # (freq, time)
    return concat

def mfcc_feature_extractor(path, desired_sr=16000):
    #with torch.no_grad():
    waveform, sample_rate = torchaudio.load(path, normalize=True, channels_first=True)
    if len(waveform.shape) == 2:
        waveform = torch.mean(waveform, dim=0).unsqueeze(dim=0)
    
    if sample_rate != desired_sr:
        transform = torchaudio.transforms.Resample(sample_rate, desired_sr)
        waveform = transform(waveform)

    mfcc = get_feats(waveform, desired_sr)
    return mfcc

In [41]:
def get_data_cluster(path):
    mfcc_audio = mfcc_feature_extractor(path)
    return mfcc_audio

In [42]:
get_data_cluster("./audio/audio_emo/tess.woman.sad.6.wav").shape

torch.Size([276, 39])

In [43]:
X = torch.cat(list(map(get_data_cluster, df_train_audio["path"])), dim=0)

In [44]:
X.shape

torch.Size([8176936, 39])

In [45]:
from sklearn.cluster import MiniBatchKMeans
N_CLUSTERS = 200
kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS,
                          random_state=0,
                          batch_size=2048,
                          max_iter=4000,
                          n_init=3).fit(X)

In [46]:
#import pickle
#pickle.dump(kmeans, open("/content/drive/MyDrive/c4ai_audio_text_clip_model/kmeans_100_clusters.pkl", 'wb'))

## Add mask to Transformer, try learned positional embeddings

In [47]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class AudioEncoderMFCCHU(nn.Module):

    def __init__(self, clusterization_model, emb_size=N_CLUSTERS+1, pad_idx=N_CLUSTERS, emb_dim=384, n_layers=3, padd_trunk=300, nheads=8, dropout=0.1):
        super(AudioEncoderMFCCHU, self).__init__()

        self.clusterization_model = clusterization_model
        self.embedding = nn.Embedding(emb_size, emb_dim, max_norm=True, padding_idx=pad_idx)
        self.emb_size = emb_size
        self.padd_trunk = padd_trunk
        self.pad_idx = pad_idx
        self.pos_encoder = PositionalEncoding(emb_dim, dropout)
        self.emb_dim = emb_dim

        self.transf_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=nheads)
        self.transf_enc = nn.TransformerEncoder(self.transf_layer, num_layers=n_layers, norm=nn.LayerNorm(emb_dim))


    def preprocess_audio(self, path):
        mfcc = mfcc_feature_extractor(path)
        token_ids = torch.Tensor(self.clusterization_model.predict(mfcc)).long().to(0)
        if len(token_ids) >= self.padd_trunk:
            token_ids = token_ids[:self.padd_trunk]
        else:
            repeat = torch.Tensor([self.pad_idx]*(self.padd_trunk-len(token_ids))).long().to(0)
            token_ids = torch.cat([token_ids,repeat], dim=0)
        
        return token_ids.unsqueeze(dim=0)

    def forward(self, audio_paths):
        
        with torch.no_grad():
            tokens = list(map(self.preprocess_audio, audio_paths))
            tks_tensor = torch.cat(tokens, axis=0) #.to(0)
            assert len(tks_tensor) == len(audio_paths)
        
        tks_tensor_lens = 1/torch.sum(tks_tensor != self.pad_idx, dim=-1)
        tks_tensor_lens = tks_tensor_lens.unsqueeze(dim=0).T

        emb = self.embedding(tks_tensor) * math.sqrt(self.emb_dim)
        emb = self.pos_encoder(emb)

        x = self.transf_enc(emb)
        #x = x[:,0,:]
        x = tks_tensor_lens*torch.sum(x, dim=1)
        return x

In [48]:
#ss (2x1) * (2x384)

In [49]:
model_audio = AudioEncoderMFCCHU(kmeans)
model_audio.to(0)

AudioEncoderMFCCHU(
  (embedding): Embedding(201, 384, padding_idx=200, max_norm=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transf_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
    )
    (linear1): Linear(in_features=384, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=384, bias=True)
    (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transf_enc): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384

In [50]:
#model_audio = AudioEncoderMFCCHU(kmeans)
#model_audio.to(0)

In [51]:
with torch.no_grad():
    print(model_audio(["./audio/audio_emo/tess.woman.sad.6.wav", "./audio/audio_emo/tess.woman.sad.98.wav"]).shape)

torch.Size([2, 384])


In [52]:
#model_audio(["./audio/audio_emo/tess.woman.sad.6.wav", "./audio/audio_emo/tess.woman.sad.98.wav"]).shape

In [53]:
#sss

In [54]:
class AudioTextCLIP(nn.Module):

    def __init__(self, text_encoder, audio_encoder, freeze_text_enc=False, freeze_audio_enc=False, in_features_text=384, in_features_audio=16, proj_size=128, rate=0.1, hidden_size=384, num_classes=7, use_graph_aug=False):
        super(AudioTextCLIP, self).__init__()

        self.audio_encoder = audio_encoder
        self.text_encoder = text_encoder

        if freeze_text_enc:
            for i, (name, param) in enumerate(list(self.text_encoder.named_parameters())):
                param.requires_grad = False
        
        if freeze_audio_enc:
            for i, (name, param) in enumerate(list(self.audio_encoder.named_parameters())):
                param.requires_grad = False

        self.use_graph_aug = use_graph_aug
        if use_graph_aug:
            self.graph_enc = EmbeddingPropagation(in_dim=hidden_size)
        self.proj = nn.Linear(hidden_size, proj_size)
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.rate = rate

        self.text_proj = nn.Sequential(nn.Linear(in_features_text, hidden_size), nn.GELU(), nn.Linear(hidden_size, hidden_size))
        self.audio_proj = nn.Sequential(nn.Linear(in_features_audio, hidden_size), nn.GELU(), nn.Linear(hidden_size, hidden_size))
        self.mods_proj = lambda x: x
 
    def forward(self, inp):

        sentences, audio_paths = inp

        if sentences != None and audio_paths != None:
            text_emb = self.text_encoder(sentences)
            audio_emb = self.audio_encoder(audio_paths)
            x = torch.cat([self.mods_proj(self.text_proj(text_emb)), self.mods_proj(self.audio_proj(audio_emb))], dim=0)
        elif sentences != None:
            text_emb = self.text_encoder(sentences)
            x = self.mods_proj(self.text_proj(text_emb))
        else:
            audio_emb = self.audio_encoder(audio_paths)
            x = self.mods_proj(self.audio_proj(audio_emb))

        # Graph Creation
        if self.use_graph_aug:
            x_neighboor_info, A = self.graph_enc(x)

        # Projection without neighboor info
        x1 = F.gelu(self.linear1(F.dropout(x, p=self.rate, training=self.training)))
        x1 = self.linear2(x1)
        x1 = F.normalize(x1, dim=-1)
        x1 = F.normalize(self.proj(x1), dim=-1)

        # Projection with neighboor info
        if self.use_graph_aug:
            x2 = F.gelu(self.linear1(F.dropout(x_neighboor_info, p=self.rate, training=self.training)))
        else:
            x2 = F.gelu(self.linear1(F.dropout(x, p=self.rate, training=self.training)))
        clf_emb = self.linear2(x2)
        x2 = F.normalize(clf_emb, dim=-1)
        x2 = F.normalize(self.proj(x2), dim=-1)

        if self.use_graph_aug:
            return x1, x2, clf_emb, A
        else:
            return x1, x2, clf_emb, None

In [55]:
print(len(train_text), len(df_train_audio), len(test_text), len(df_dev_audio))

43410 25212 5427 2337


In [56]:
df_train_audio['label'].unique()

array(['neutral', 'sadness', 'anger', 'joy', 'surprise', 'fear',
       'disgust'], dtype=object)

In [57]:
train_audio_repeated = pd.concat([df_train_audio, df_train_audio,df_train_audio,df_train_audio,df_train_audio,df_train_audio,df_train_audio, df_train_audio,df_train_audio,df_train_audio,df_train_audio,df_train_audio], axis=0).sample(frac=1).reset_index(drop=True)
test_audio_repeated = pd.concat([df_dev_audio, df_dev_audio,df_dev_audio,df_dev_audio,df_dev_audio,df_dev_audio], axis=0).sample(frac=1).reset_index(drop=True)

train_ds = torch.utils.data.TensorDataset(torch.Tensor(list(range(len(train_text)))))
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=768, shuffle=True)

test_ds = torch.utils.data.TensorDataset(torch.Tensor(list(range(len(test_text)))))
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=256, shuffle=False)

In [58]:
#gc.collect()

In [59]:
#gc.collect()

In [60]:
import torchaudio
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
import gc
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
gc.enable()

In [61]:

#!pip install faiss-cpu --no-cache

In [62]:
#!sudo apt install libomp-dev

In [63]:
import numpy as np
import faiss


class FaissKNeighbors:
    def __init__(self, k=5):
        self.index = None
        self.y = None
        self.k = k

    def fit(self, X, y):
        self.index = faiss.IndexFlatL2(X.shape[1])
        self.index.add(X.astype(np.float32))
        self.y = y

    def predict(self, X):
        distances, indices = self.index.search(X.astype(np.float32), k=self.k)
        votes = self.y[indices]
        predictions = np.array([np.argmax(np.bincount(x)) for x in votes])
        return predictions

In [64]:
#torch.cuda.empty_cache()
#opt.zero_grad(set_to_none=True)
#del opt
#gc.collect()
#del supcon_model

In [None]:
use_graph_aug = False
#torch._dynamo.config.verbose=True
supcon_model = AudioTextCLIP(TextEncoder('sentence-transformers/paraphrase-MiniLM-L3-v2', max_len=60), 
                             model_audio, in_features_text=384, in_features_audio=384, hidden_size=512, proj_size=128, use_graph_aug=use_graph_aug)
supcon_loss = SupConLoss(temperature=0.3, contrast_mode='all', base_temperature=0.34)
supcon_model.to(0)
#supcon_model = torch.compile(supcon_model)
scaler = torch.cuda.amp.GradScaler()

step = 0
e = 0
patience = 9999
early_stop_flag = 0
old_acc = -float('inf')
opt = torch.optim.Adam(supcon_model.parameters(),lr=5e-5,betas=(0.9,0.98), eps=1e-8,weight_decay=1e-3)
epochs = 9999
nb_steps = 20

while e < epochs:
    supcon_model.train()
    epoch_loss = 0.0
    proj_val = []
    targets_val = []

    proj_train = []
    targets_train = []

    preds = []
    support = []

    for i, batch_indices in enumerate(tqdm(train_loader, total=len(train_loader))):

        sentences = list(train_text.iloc[batch_indices[0]]["text"])
        audio_paths = list(train_audio_repeated.iloc[batch_indices[0]]["path"])
        y_text, y_audio = torch.Tensor(lab_encoder.transform(list(train_text.iloc[batch_indices[0]]["grouped_label"]))), torch.Tensor(lab_encoder.transform(list(train_audio_repeated.iloc[batch_indices[0]]["label"])))

        target = torch.cat([y_text, y_audio])

        x = [sentences, audio_paths]
        
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            out1, out2, wide, _ = supcon_model(x)

            out1 = out1.unsqueeze(dim=1)
            out2 = out2.unsqueeze(dim=1)
            out = torch.cat([out1,out2], dim=1)

            loss = supcon_loss(out, labels=target)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(supcon_model.parameters(), 1.0)
        scaler.step(opt)
        scaler.update()
        opt.zero_grad(set_to_none=True)
        
        epoch_loss += loss.item()
        proj_train.append(np.array(F.normalize(wide.detach(), dim=-1).cpu()))
        targets_train.append(np.array(target.cpu()))

        del out1
        del out2
        #del out
        gc.collect()
        torch.cuda.empty_cache()
        #break

    proj_train = np.concatenate(proj_train, axis=0)
    targets_train = np.concatenate(targets_train, axis=0)


    #clf = LogisticRegression(max_iter=500, random_state=None, n_jobs=8)
    #mean = np.mean(proj_train,axis=0)
    #std = np.std(proj_train,axis=0)

    #proj_train = F.normalize(wide.cpu(), dim=-1)

    #clf = KNeighborsClassifier(n_neighbors=128)
    #clf.fit(proj_train, targets_train)
    #clf.fit(proj_train, targets_train)
    clf = FaissKNeighbors(k=128)
    clf.fit(proj_train, np.array(targets_train, dtype=np.int))
    #clf.predict(wide)

    epoch_loss = epoch_loss/len(train_loader)
    supcon_model.eval()
    preds_audios = []
    preds_texts = []
    targets_audios = []
    targets_texts = []
    
    for i, batch_indices in enumerate(tqdm(test_loader, total=len(test_loader))):
        with torch.no_grad():

            sentences = list(test_text.iloc[batch_indices[0]]["text"])
            audio_paths = list(test_audio_repeated.iloc[batch_indices[0]]["path"])
            y_text, y_audio = torch.Tensor(lab_encoder.transform(list(test_text.iloc[batch_indices[0]]["grouped_label"]))), torch.Tensor(lab_encoder.transform(list(test_audio_repeated.iloc[batch_indices[0]]["label"])))

            target = torch.cat([y_text, y_audio])

            x = [sentences, audio_paths]

            _, _, wide, _ = supcon_model(x)

            wide = np.array(F.normalize(wide.cpu(), dim=-1))

            #wide = wide / np.array([np.linalg.norm(wide, ord=2, axis=1)]).T
            
            pred = clf.predict(wide)

            preds_text = clf.predict(wide[:len(sentences)])
            preds_audio = clf.predict(wide[len(sentences):])

            assert len(wide) == len(pred)

            preds.append(pred)
            preds_audios.append(preds_audio)
            preds_texts.append(preds_text)
            proj_val.append(wide)
            targets_val.append(np.array(target.cpu()))

            targets_texts.append(np.array(target.cpu())[:len(sentences)])
            targets_audios.append(np.array(target.cpu())[len(sentences):])

            del x, target
            gc.collect()
            torch.cuda.empty_cache()

    proj_val = np.concatenate(proj_val, axis=0)
    targets_val = np.concatenate(targets_val, axis=0)
    preds = np.array(np.concatenate(preds, axis=0))

    preds_texts = np.concatenate(preds_texts, axis=0)
    targets_texts = np.concatenate(targets_texts, axis=0)
    preds_audios = np.concatenate(preds_audios, axis=0)
    targets_audios = np.concatenate(targets_audios, axis=0)

    audio_f1 = f1_score(targets_audios, preds_audios, average='macro')
    audio_acc = accuracy_score(targets_audios, preds_audios)

    text_f1 = f1_score(targets_texts, preds_texts, average='macro')
    text_acc = accuracy_score(targets_texts, preds_texts)

    curr_acc = f1_score(targets_val, preds, average='macro')
    curr_acc2 = accuracy_score(targets_val, preds)

    print(f'Text - KNN F1: {text_f1} Acc: {text_acc}')
    print(f'Audio - KNN F1: {audio_f1} Acc: {audio_acc}')
    print(f'General - KNN F1: {curr_acc} Acc: {curr_acc2}')
    if use_graph_aug:
        print(f"Alpha: {float(supcon_model.state_dict()['graph_enc.alpha'].item())}")

    idx = np.random.randint(len(proj_val), size=2000)

    # A[idx,:]
    proj_val_samp = proj_val[idx, :]
    targets_val_samp = targets_val[idx]
    #tsne = TSNE(n_components=2, learning_rate='auto', init='pca', perplexity=5).fit_transform(proj_val_samp)

    #sns.scatterplot(x=tsne[:, 0], y=tsne[:, 1], hue=targets_val_samp, palette='tab10')
    #plt.show()

    print(f'Epoch: {e + 1} - Train Loss: {epoch_loss}')
    e += 1
    
    if old_acc > curr_acc:
        early_stop_flag += 1
    else:
        #torch.save(supcon_model.state_dict(), f'/content/drive/MyDrive/pytorch_model_AudioTextCLIP_vSpec_epoch_{e}_with_200.bin')
        #torch.save(supcon_model.state_dict(), '/content/drive/MyDrive/pytorch_model_AudioTextCLIP_v2.bin')
        #torch.save(supcon_model.state_dict(), '/content/drive/MyDrive/pytorch_model_goemotions_minilm_l3.bin')
        old_acc = curr_acc
        early_stop_flag = 0
    torch.save(supcon_model.state_dict(), f'./pytorch_model_AudioTextCLIPvFinal_epoch_{e}_only_meld.bin')

100%|████████████████████████████████████████████████████████████████| 57/57 [03:35<00:00,  3.77s/it]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  clf.fit(proj_train, np.array(targets_train, dtype=np.int))
100%|████████████████████████████████████████████████████████████████| 22/22 [00:27<00:00,  1.24s/it]


Text - KNN F1: 0.1356409284206297 Acc: 0.4044591855537129
Audio - KNN F1: 0.07546821028401089 Acc: 0.14188317670904735
General - KNN F1: 0.12754305591361792 Acc: 0.2731711811313801
Epoch: 1 - Train Loss: 7.084063571796083


 56%|███████████████████████████████████▉                            | 32/57 [02:00<01:34,  3.77s/it]

In [None]:
gc.collect()

In [None]:
ss

In [None]:
import scipy
from collections import Counter

class KNN:
    def __init__(self, k):
        self.k = k
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
    def distance(self, X1, X2):
        distance = scipy.spatial.distance.euclidean(X1, X2)
    
    def predict(self, X_test):
        final_output = []
        for i in range(len(X_test)):
            d = []
            votes = []
            for j in range(len(self.X_train)):
                dist = scipy.spatial.distance.euclidean(self.X_train[j] , X_test[i])
                d.append([dist, j])
            d.sort()
            d = d[0:self.k]
            for d, j in d:
                votes.append(self.y_train[j])
            ans = Counter(votes).most_common(1)[0][0]
            final_output.append(ans)
            
        return final_output
    
    def score(self, X_test, y_test):
        predictions = self.predict(X_test)
        return (predictions == y_test).sum() / len(y_test)

In [None]:
ss

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import sklearn

clf = KNN(k=128)
clf.fit(proj_train, np.array(targets_train, dtype=np.int))

In [None]:
clf.predict(wide)

In [None]:

!pip install faiss

In [None]:
!pip install faiss-cpu --no-cache

In [None]:
#proj_train, np.array(targets_train, dtype=np.int)

clf = FaissKNeighbors(k=128)
clf.fit(proj_train, np.array(targets_train, dtype=np.int))
clf.predict(wide)

In [None]:
clf.predict(wide[197].reshape(1, -1))

In [None]:
wide[196]

In [None]:
tsne = TSNE(n_components=2, learning_rate='auto', init='pca', perplexity=10).fit_transform(proj_val_samp)

In [None]:
proj_train.shape

In [None]:
wide.shape

In [None]:
gc.collect()

In [None]:
np.sum(proj_train, axis=1)

In [None]:
wide.shape

In [None]:
np.array([np.linalg.norm(wide, ord=2, axis=1)]).T.shape

In [None]:
np.linalg.norm(wide, ord=1, axis=1)

In [None]:
ss

In [None]:
#torch.save(supcon_model.state_dict(), '/content/drive/MyDrive/pytorch_model_AudioTextCLIP_v2_1.bin')

In [None]:
supcon_model.load_state_dict(torch.load('/content/drive/MyDrive/pytorch_model_AudioTextCLIP.bin'))

In [None]:
test = supcon_model([["I Hate you, i believe you are shit!", "you are my best friend, love you!"],None])[2]
torch.dot(F.normalize(test[0, :], dim=0), F.normalize(test[1, :], dim=0))

In [None]:
test = supcon_model([["The best man ever, keep the good work!", "you are my best friend, love you!"],None])[2]
torch.dot(F.normalize(test[0, :], dim=0), F.normalize(test[1, :], dim=0))

In [None]:
test = supcon_model([["I Hate you, i believe you are shit!", "you should not be alive"],None])[2]
torch.dot(F.normalize(test[0, :], dim=0), F.normalize(test[1, :], dim=0))

In [None]:
#torch.save(supcon_model.state_dict(), '/content/drive/MyDrive/pytorch_model_AudioTextCLIP.bin')

https://arxiv.org/pdf/1410.6903.pdf

In [None]:
proj_val.shape

In [None]:
targets_val.shape

In [None]:
patrickvonplaten/tiny-wav2vec2-no-tokenizer

In [None]:
!pip install transformers

In [None]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

In [None]:
model = Wav2Vec2ForSequenceClassification.from_pretrained("patrickvonplaten/tiny-wav2vec2-no-tokenizer")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/tiny-wav2vec2-no-tokenizer")

In [None]:
model