In [None]:
from transformers import AutoModel, AutoTokenizer 
import torch
import pickle 
import numpy as np
import pandas as pd 
import re
from tqdm import tqdm
import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from torch import nn
import os
import torch.nn.functional as F
import torch.optim as optim
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from Attention_Augmented_Conv2d.attention_augmented_conv import AugmentedConv
use_cuda = torch.cuda.is_available()
from sklearn.metrics import f1_score
device = torch.device('cuda' if use_cuda else 'cpu')

In [None]:
# Load the pretrained BERTweet encoder. output_hidden_states=True makes the forward
# pass also return the per-layer outputs that the cells below read via features[2].
bertweet = AutoModel.from_pretrained("vinai/bertweet-base",output_hidden_states=True)
# NOTE(review): max_length / padding / truncation / return_attention_mask look like
# encode-time options rather than loader options — confirm they have any effect here.
# Later cells pass truncation and max_length again on every tokenizer.encode call.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",  add_special_tokens=True,
                                                max_length=20, pad_to_max_length=True,normalization=True, truncation=True, padding= True, return_attention_mask=True)

In [None]:
# Training tweets; later cells read the `text` and `label` columns of this frame.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as is.
data = pd.read_csv(r'D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/data/final_sarc_trainingset_twitter.csv')

In [None]:
# Shuffle the rows once. The seed is fixed because the train/validation split made
# further down is purely index-based, so an unseeded shuffle would change the split
# on every kernel restart.
data = data.sample(frac=1, random_state=42)
# Remove hashtags (a '#' followed by any run of characters other than whitespace or ':').
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would silently leave every hashtag in place.
data['text'] = data['text'].str.replace(r'#([^\s:]+)', '', regex=True)

In [None]:
# Second tweet corpus, used below only to compare tweet-length distributions (its `text` column).
# NOTE(review): absolute, machine-specific path.
irony = pd.read_csv('D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/data/final_training_irony.csv')

### Try with one sentence 

In [None]:
# Split every tweet on single spaces, giving one list of words per tweet for each corpus.
text = [tweet.split(' ') for tweet in data.text]
text_irony = [tweet.split(' ') for tweet in irony.text]

In [None]:
# Number of space-separated words in each tweet, per corpus.
count = list(map(len, text))
count_irony = list(map(len, text_irony))

In [None]:
pd.DataFrame([count, count_irony]).T.describe()

In [None]:
count.sort()

In [None]:
pd.Series(count).quantile(0.9)

In [None]:

# Set the tick-label sizes before anything is drawn: rcParams are only guaranteed to
# apply to axes created after the change, and the axes for this figure are created
# by the first histplot call below.
plt.rc('xtick',labelsize=15)
plt.rc('ytick',labelsize=15)
plt.figure(figsize = (15,10))
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
# Both histograms are drawn on the same axes; the legend labels follow the drawing order.
sns.histplot(np.array(count), color='red')
sns.histplot(np.array(count_irony))
plt.xlim(0,60)
plt.legend(['sarcasm', 'irony'], prop={"size":15})
# Vertical marker at 20 words, the cut-off length used when padding token embeddings.
plt.axvline(20, 0,linestyle='--', color = 'blue')
plt.text(19,3750,'Cut-off length, third quantile',rotation=90, fontsize = 15)
plt.xlabel('Length tweets', fontsize = 20)
plt.ylabel('Count', fontsize = 20)
plt.title('Distribution tweets length, divided by sarcasm and irony', fontsize = 25)
# NOTE(review): absolute, machine-specific output path.
plt.savefig('D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/Code/Plots/distribution_word_length.eps', format='eps')

In [None]:
# Encode one example tweet (the row labelled 58445) into a 1 x n_tokens tensor of token ids,
# truncated to at most 25 tokens.
input_ids = torch.tensor([tokenizer.encode(data.text[58445], max_length=25, truncation=True)])

In [None]:
# Forward pass without gradient tracking; the cells below read the per-layer outputs from features[2].
with torch.no_grad():
    features = bertweet(input_ids)

In [None]:
# features[2] holds one tensor per layer, each n_batches x n_tokens x 768 features.
# Position 0 is the word-embedding output; positions 1..12 are the encoder layers,
# with position 12 being the output of the last hidden encoder layer.
(
    encoder_embedding,
    encoder_1, encoder_2, encoder_3, encoder_4,
    encoder_5, encoder_6, encoder_7, encoder_8,
    encoder_9, encoder_10, encoder_11, encoder_12,
) = features[2][:13]

In [None]:
features[2][1].size()

In [None]:
def zero_padding(tensor, max_len=20):
    """Pad or truncate a batch of token embeddings to a fixed number of tokens.

    Args:
        tensor: tensor of shape (batch, n_tokens, n_features).
        max_len: number of token positions in the result. Defaults to 20, the
            value that used to be hard-coded.

    Returns:
        Tensor of shape (batch, max_len, n_features). Inputs with at least
        ``max_len`` tokens keep their first ``max_len`` tokens; shorter inputs
        are right-padded with all-zero token vectors.
    """
    length = tensor.size(1)
    if length >= max_len:
        return tensor[:, :max_len]
    # F.pad reads the pad tuple from the last dimension backwards: (0, 0) leaves the
    # feature axis untouched, (0, max_len - length) appends zero rows on the token axis.
    # Padding the 3-D tensor directly (instead of squeezing it first) keeps single-token
    # inputs and batches larger than one working.
    return F.pad(tensor, pad=(0, 0, 0, max_len - length), mode='constant', value=0)

In [None]:
# Stack the sentence embeddings of layers 1-4 along a new dimension 1.
# NOTE(review): sentence_emb_1..4 are not defined by any earlier cell in this notebook;
# they are first assigned inside the extraction loop further down, so this cell fails
# on a fresh Restart & Run All — confirm where it is meant to live.
sub_layers_initial = torch.stack((sentence_emb_1, sentence_emb_2, sentence_emb_3, sentence_emb_4), dim= 1) 

### Concatenate the vectors from the last four layers. 
Concatenating the outputs of the last four hidden layers appears to achieve higher accuracy than using the other layers.

In [None]:
# Concatenate four layers along the feature axis for the single example sentence:
# each token vector has 768 values per layer, so every row of `cat_vec` has 4 x 768 = 3,072 values.
cat_vec = torch.cat((encoder_9[0],encoder_10[0], encoder_11[0], encoder_12[0]), dim=1)    
# Same construction for the first four encoder layers, for comparison.
cat_vec_first = torch.cat((encoder_1[0],encoder_2[0], encoder_3[0], encoder_4[0]), dim=1) 

In [None]:
# Cosine similarity between the token-averaged "first four layers" vector and the
# token-averaged "last four layers" vector. F.cosine_similarity is used directly
# because the `cos` module is only created in a later cell, so calling it here
# raises NameError on a fresh kernel.
F.cosine_similarity(torch.mean(cat_vec_first, dim = 0).view(1,-1), torch.mean(cat_vec, dim = 0).view(1,-1))

In [None]:
# Element-wise sum of the token vectors of encoder layers 9-12 (example sentence only).
sum_last_four = encoder_12[0] + encoder_11[0] + encoder_10[0] + encoder_9[0]

In [None]:
# Element-wise sum of the token vectors of encoder layers 1-4 (example sentence only).
sum_first_four =  encoder_1[0] + encoder_2[0] + encoder_3[0] + encoder_4[0]

In [None]:
# Element-wise sum of the token vectors of encoder layers 5-8 (example sentence only).
sum_middle_four = encoder_5[0] + encoder_6[0] + encoder_7[0] + encoder_8[0]

In [None]:
# Average the summed last-four-layer vectors over the token axis and reshape to a single row.
mean_last_four = torch.mean(sum_last_four, dim = 0).view(1,-1)

### How sentence embeddings are created: 

In [None]:
# Sentence embedding = average over all n tokens of the last hidden layer.
sentence_embedding = torch.mean(encoder_12[0], dim=0)

###### Generate 12 sentence-embedding tensors, one per encoder layer output

In [None]:
cos = torch.nn.CosineSimilarity()

In [None]:
# Sentence-level counterpart of the token-level concatenation above: join the
# sentence embeddings of four layers along the feature axis.
# NOTE(review): sentence_emb_* are not defined by any earlier cell in this notebook
# (they are first assigned inside the extraction loop below), and these assignments
# overwrite the token-level cat_vec / cat_vec_first — confirm the intended cell order.
cat_vec = torch.cat((sentence_emb_9,sentence_emb_10, sentence_emb_11, sentence_emb_12), dim=1)    
cat_vec_first = torch.cat((sentence_emb_1 ,sentence_emb_2, sentence_emb_3, sentence_emb_4), dim=1) 

### Extract all features from the dataset 

### Extract each sentence embedding vector for each hidden state layer

In [None]:
# Renumber the shuffled rows 0..n-1 so that positional and label-based access agree.
# drop=True keeps the cell idempotent: without it the old index is inserted as an
# `index` column, and running the cell a second time fails because that column
# already exists. The loops below only read the `text` and `label` columns.
data = data.reset_index(drop=True)

In [None]:
# One 1 x n_tokens tensor of token ids per tweet, each truncated to at most 70 tokens.
input_ids = [
    torch.tensor([tokenizer.encode(tweet, truncation=True, max_length=70)])
    for tweet in data.text
]

In [None]:
# For every tweet: run BERTweet once, average each encoder layer's token vectors into a
# sentence embedding, group the 12 layers into three stacks of four (initial / middle /
# last) and save each stack as its own .pt file. The row's label is encoded in the file name.
# NOTE(review): the target directory is assumed to exist; torch.save does not create it.
with torch.no_grad():
    
    for i in tqdm(range(len(input_ids))):
        
        features = bertweet(input_ids[i]) #extract sentence embedding 1 x 768 for each document
        
        # Mean over the token axis (dim=1), reshaped to a single 1 x 768 row per layer.
        sentence_emb_1 = torch.mean(features[2][1], dim=1).view(1, -1) #layer 1 
        sentence_emb_2 = torch.mean(features[2][2], dim=1).view(1, -1)
        sentence_emb_3 = torch.mean(features[2][3], dim=1).view(1, -1)
        sentence_emb_4 = torch.mean(features[2][4], dim=1).view(1, -1)
        sentence_emb_5 = torch.mean(features[2][5], dim=1).view(1, -1)
        sentence_emb_6 = torch.mean(features[2][6], dim=1).view(1, -1)
        sentence_emb_7 = torch.mean(features[2][7], dim=1).view(1, -1)
        sentence_emb_8 = torch.mean(features[2][8], dim=1).view(1, -1)
        sentence_emb_9 = torch.mean(features[2][9], dim=1).view(1, -1)
        sentence_emb_10 = torch.mean(features[2][10], dim=1).view(1, -1)
        sentence_emb_11 = torch.mean(features[2][11], dim=1).view(1, -1)
        sentence_emb_12 = torch.mean(features[2][12], dim=1).view(1, -1) #layer 12
        # B x C x H x W, 1 x 4 x 1 x 768: the four layers of a group act as the channels
        sub_layers_initial = torch.stack((sentence_emb_1, sentence_emb_2, sentence_emb_3, sentence_emb_4), dim= 1).reshape(1,4,1,768)  #add batch dimension
        sub_layers_middle = torch.stack((sentence_emb_5, sentence_emb_6, sentence_emb_7, sentence_emb_8), dim= 1).reshape(1,4,1,768)
        sub_layers_last = torch.stack((sentence_emb_9, sentence_emb_10, sentence_emb_11, sentence_emb_12), dim= 1).reshape(1,4,1,768)
        
        # File name pattern: id_<row position>_<group>_<label>.pt
        torch.save(sub_layers_initial.float().clone(), '../data/new_approach/train/sarcasm/id_{}_init_{}.pt'.format(i, data.label.iloc[i]))
        torch.save(sub_layers_middle.float().clone(), '../data/new_approach/train/sarcasm/id_{}_middle_{}.pt'.format(i, data.label.iloc[i]))
        torch.save(sub_layers_last.float().clone(), '../data/new_approach/train/sarcasm/id_{}_last_{}.pt'.format(i, data.label.iloc[i]))

### Word embedding layers

In [None]:
# Word-level features: for every tweet, pad/truncate the token embeddings of encoder
# layers 9-12 with zero_padding, stack the four layers along dimension 1 and save the
# result together with the label as separate .pt files.
# y_target is initialised here but not filled in this cell.
y_target = []
#batch_last = torch.zeros((len(input_ids),4,768,50))
    
with torch.no_grad():
    
    for i in tqdm(range(len(input_ids))):
        
        features = bertweet(input_ids[i]) 
        sentence_emb_9 = zero_padding(features[2][9])
        sentence_emb_10 = zero_padding(features[2][10])
        sentence_emb_11 = zero_padding(features[2][11])
        sentence_emb_12 = zero_padding(features[2][12]) 
        
        sub_layers_last = torch.stack((sentence_emb_9, sentence_emb_10, sentence_emb_11, sentence_emb_12), dim= 1)
        # Index-based split: positions 0..15000 go to validation, the rest to train.
        # Features and labels share the same file name ("<i>_.pt") in sibling directories.
        if i <= 15000:
            torch.save(sub_layers_last.float().clone(), '../data/new_approach/validation/sarcasm_word/features/{}_.pt'.format(i))
            torch.save(torch.tensor(data.label.iloc[i], dtype = torch.float), '../data/new_approach/validation/sarcasm_word/labels/{}_.pt'.format(i))
        else:
            torch.save(sub_layers_last.float().clone(), '../data/new_approach/train/sarcasm_word/features/{}_.pt'.format(i))
            torch.save(torch.tensor(data.label.iloc[i], dtype = torch.float), '../data/new_approach/train/sarcasm_word/labels/{}_.pt'.format(i)) 

## Proposed method

In [None]:
class simple_attention(nn.Module):
    """Attention-augmented convolution stack followed by a bidirectional GRU and a dense head.

    The network takes a single 4-channel input tensor and returns one
    unnormalised score (logit) per sample; no sigmoid is applied in ``forward``.

    NOTE(review): the reshape to 3328 features in ``forward`` fixes the input
    geometry the network can accept — confirm it matches the tensors produced
    by the feature-extraction cells.
    """

    def __init__(self):
        super(simple_attention, self).__init__()
        
        self.conv_att = AugmentedConv(in_channels=4, out_channels=256, kernel_size=2, dk=3, dv=3, Nh=3, relative=False, stride=2)
        # NOTE(review): the 3-D pooling layers are applied to 4-D activations, and conv1 /
        # conv2 expect half the channels emitted by the layer before the pooling — presumably
        # the first stride entry (2) is what halves the channel axis; confirm.
        self.pooling1 = nn.AvgPool3d(kernel_size=(1,1,1), stride = (2,1,2))
        self.conv1 = nn.Conv2d(in_channels = 128, out_channels=64, kernel_size=1, stride = 1)
        self.pooling2 =  nn.AvgPool3d(kernel_size=(1,1,2), stride = (2,1,2))
        self.conv2 = nn.Conv2d(in_channels = 32, out_channels=16, kernel_size=1, stride = 2)
        self.max_pool = nn.MaxPool3d(kernel_size=(1,1,3), stride = (1,1,3))
        # Bidirectional GRU: 2 x 64 = 128 output features, matching fc1's input size.
        self.bgru = nn.GRU(input_size=3328, hidden_size=64, num_layers=1, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64,32)
        self.fc3 = nn.Linear(32,16)
        self.fc4 = nn.Linear(16,1)
    
    def forward(self, input1):
        """Run the network on ``input1`` and return a tensor of logits of shape (batch, 1, 1)."""
        conv_atten = self.conv_att(input1)
        conv_pooled1 = self.pooling1(conv_atten)
        conv_simple = self.conv1(conv_pooled1)
        conv_pooled2 = self.pooling2(conv_simple)
        conv_simple = self.conv2(conv_pooled2)
        conv_max = self.max_pool(conv_simple)
        # Each sample becomes a length-1 sequence of 3328 features for the GRU.
        flatten = torch.flatten(conv_max).reshape(conv_max.size(0), 1, 3328)
        # nn.GRU returns a tuple (output, h_n); only the per-step output feeds the dense
        # head. Passing the whole tuple to fc1 raised a TypeError.
        gru_out, _ = self.bgru(flatten)
        dense1 = F.relu(self.fc1(gru_out))
        dense2 = F.relu(self.fc2(dense1))
        dense3 = F.relu(self.fc3(dense2))
        output = self.fc4(dense3)
    
        return output

In [None]:
# class ConvolutionalAttention_boosted(nn.Module):
    
#     def __init__(self):
        
#         super(ConvolutionalAttention_boosted, self).__init__()
        
#         self.conv_att = AugmentedConv(in_channels=4, out_channels=256, kernel_size=3, dk=84, dv=12, Nh=12, relative=False, stride=1)
#         self.pooling1 = nn.AvgPool3d(kernel_size=(1,1,1), stride = (2,1,2))
#         self.drop4 = nn.Dropout(0.2)
#         self.conv1 = nn.Conv2d(in_channels = 128, out_channels=128, kernel_size=1, stride = 2)
#         self.drop3 = nn.Dropout(0.3)
#         self.pooling2 = nn.AvgPool3d(kernel_size=(1,1,2), stride = (1,1,2))
#         self.conv2 =  nn.Conv2d(in_channels = 128, out_channels=128, kernel_size=1, stride = 2)
#         self.max = nn.MaxPool3d(kernel_size=(1,1,2), stride = (1,1,2))
#         self.conv3 = nn.Conv2d(in_channels = 128, out_channels=64, kernel_size=1, stride = 2)
#         self.max2 = nn.MaxPool3d(kernel_size=(64,1,1), stride = (16,1,1))
#         self.bgru = nn.GRU(input_size=24, hidden_size=32, num_layers=1, batch_first=True, bidirectional=True)
#         self.drop2 = nn.Dropout(0.3)
#         self.fc1 = nn.Linear(192, 32)
#         self.drop1 =  nn.Dropout(0.5)
#         self.fc_out = nn.Linear(32, 1)
        
#     def forward(self, input1, input2, input3):
        
#         sub_layer_conv = self.conv_att(input1)
#         midd_layer_conv = self.conv_att(input2)
#         high_layer_conv = self.conv_att(input3)

#         sub_layer_pool = self.pooling1(sub_layer_conv)
#         midd_layer_pool = self.pooling1(midd_layer_conv)
#         high_layer_pool = self.pooling1(high_layer_conv)
        
#         drop_pool_sub = self.drop4(sub_layer_pool)
#         drop_pool_mid = self.drop4(sub_layer_pool)
#         drop_pool_last = self.drop4(sub_layer_pool)
        
#         sub_layer_conv = self.conv1(drop_pool_sub)
#         midd_layer_conv = self.conv1(drop_pool_mid)
#         high_layer_conv = self.conv1(drop_pool_last)
        
#         drop_sub = self.drop3(sub_layer_conv)
#         drop_mid = self.drop3(midd_layer_conv)
#         drop_las = self.drop3(high_layer_conv)
        
#         sub_layer = self.pooling2(drop_sub)
#         midd_layer = self.pooling2(drop_mid)
#         high_layer = self.pooling2(drop_las)
        
                               
#         sub_conv2 = self.conv2(sub_layer)
#         mid_conv2 = self.conv2(midd_layer)
#         high_conv2 = self.conv2(high_layer)
                               
                               
#         sub_max = self.max(sub_conv2)
#         mid_max = self.max(mid_conv2)
#         high_max = self.max(high_conv2)
                               
#         sub_conv3 = self.conv3(sub_max)
#         mid_conv3 = self.conv3(mid_max)
#         hig_conv3 = self.conv3(high_max)
        
#         sub_max2 = self.max2(sub_conv3)
#         mid_max2 = self.max2(mid_conv3)
#         high_max2 = self.max2(hig_conv3)

#         gru1_out, gru1_hidden = self.bgru(sub_max2.reshape(sub_max2.size(0),1,sub_max2.size(-1)))
#         gru2_out, gru2_hidden = self.bgru(mid_max2.reshape(mid_max2.size(0),1,mid_max2.size(-1)))
#         gru3_out, gru3_hidden = self.bgru(high_max2.reshape(high_max2.size(0),1,high_max2.size(-1)))
        
#         combined = torch.cat((gru1_out, gru2_out, gru3_out), dim=2)
        
#         drop_comb = self.drop2(combined)
#         dense1 = F.relu(self.fc1(drop_comb))
#         drop = self.drop1(dense1)
#         dense2 = self.fc_out(drop)
        
#         return dense2

In [None]:
class ConvolutionalAttention(nn.Module):
    """Three-branch model over the initial, middle and last groups of encoder layers.

    Each of the three inputs goes through the same (weight-shared) stack of
    attention-augmented convolution, pooling, dropout, 1x1 convolution and a
    bidirectional GRU. The three GRU outputs are concatenated and fed to a
    small dense head that returns one unnormalised score (logit) per sample.

    NOTE(review): conv_att is declared with out_channels=32 while conv1 expects
    128 input channels, and every branch is reshaped to 768 features before the
    GRU — confirm that the declared layer sizes are mutually consistent.
    """

    def __init__(self):
        
        super(ConvolutionalAttention, self).__init__()
        
        self.conv_att = AugmentedConv(in_channels=4, out_channels=32, kernel_size=3, dk=42, dv=6, Nh=6, relative=False, stride=1)
        self.pooling1 = nn.AvgPool3d(kernel_size=(1,1,1), stride = (2,1,1))
        self.drop4 = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(in_channels = 128, out_channels=128, kernel_size=1, stride = 2)
        self.drop3 = nn.Dropout(0.3)
        self.pooling2 = nn.AvgPool3d(kernel_size=(1,1,1), stride = (2,1,1))
        # conv2 is declared but not used in forward().
        self.conv2 =  nn.Conv2d(in_channels = 64, out_channels=128, kernel_size=1, stride = 2)
        # Bidirectional GRU: 2 x 128 = 256 output features per branch.
        self.bgru = nn.GRU(input_size=768, hidden_size=128, num_layers=1, batch_first=True, bidirectional=True)
        self.drop2 = nn.Dropout(0.3)
        # 3 branches x 256 GRU features = 768 inputs.
        self.fc1 = nn.Linear(768, 128)
        self.drop1 =  nn.Dropout(0.5)
        self.fc_out = nn.Linear(128, 1)
        
    def forward(self, input1, input2, input3):
        """Return logits of shape (batch, 1, 1).

        Args:
            input1: features built from the initial group of encoder layers.
            input2: features built from the middle group of encoder layers.
            input3: features built from the last group of encoder layers.
        """
        
        sub_layer_conv = self.conv_att(input1)
        midd_layer_conv = self.conv_att(input2)
        high_layer_conv = self.conv_att(input3)

        sub_layer_pool = self.pooling1(sub_layer_conv)
        midd_layer_pool = self.pooling1(midd_layer_conv)
        high_layer_pool = self.pooling1(high_layer_conv)
        
        # Each branch must keep its own pooled features. All three dropouts used to be
        # applied to sub_layer_pool, so input2 and input3 never influenced the output.
        drop_pool_sub = self.drop4(sub_layer_pool)
        drop_pool_mid = self.drop4(midd_layer_pool)
        drop_pool_last = self.drop4(high_layer_pool)
        
        sub_layer_conv = self.conv1(drop_pool_sub)
        midd_layer_conv = self.conv1(drop_pool_mid)
        high_layer_conv = self.conv1(drop_pool_last)
        
        drop_sub = self.drop3(sub_layer_conv)
        drop_mid = self.drop3(midd_layer_conv)
        drop_las = self.drop3(high_layer_conv)
        
        sub_layer = self.pooling2(drop_sub)
        midd_layer = self.pooling2(drop_mid)
        high_layer = self.pooling2(drop_las)
        
        # Each sample becomes a length-1 sequence of 768 features; only the GRU output
        # (not the final hidden state) is used downstream.
        gru1_out, gru1_hidden = self.bgru(sub_layer.reshape(sub_layer.size(0),1,768))
        gru2_out, gru2_hidden = self.bgru(midd_layer.reshape(midd_layer.size(0),1,768))
        gru3_out, gru3_hidden = self.bgru(high_layer.reshape(high_layer.size(0),1,768))
        
        combined = torch.cat((gru1_out, gru2_out, gru3_out), dim=2)
        
        drop_comb = self.drop2(combined)
        dense1 = F.relu(self.fc1(drop_comb))
        drop = self.drop1(dense1)
        dense2 = self.fc_out(drop)
        
        return dense2

In [None]:
# class ConvolutionalAttention_light(nn.Module):
    
#     def __init__(self):
        
#         super(ConvolutionalAttention_light, self).__init__()
        
#         self.conv_att = AugmentedConv(in_channels=4, out_channels=256, kernel_size=3, dk=42, dv=6, Nh=6, relative=False, stride=1)
#         self.pooling1 = nn.AvgPool3d(kernel_size=(1,1,1), stride = (2,1,1))
#         #self.drop4 = nn.Dropout(0.2)
#         self.conv1 = nn.Conv2d(in_channels = 128, out_channels=128, kernel_size=1, stride = 2)
#         self.drop3 = nn.Dropout(0.4)
#         self.pooling2 = nn.AvgPool3d(kernel_size=(1,1,1), stride = (2,1,1))
#         self.conv_att2 = AugmentedConv(in_channels=64, out_channels=64, kernel_size=3, dk=42, dv=6, Nh=6, relative=False, stride=1)
#         self.max = nn.MaxPool3d(kernel_size=(1,1,2), stride = (64,1,4))
#         self.bgru = nn.GRU(input_size=96, hidden_size=64, num_layers=1, batch_first=True, bidirectional=True)
#         self.drop2 = nn.Dropout(0.4)
#         self.fc1 = nn.Linear(128, 64)
#         #self.fc2 = nn.Linear(128, 64)
#         #self.fc3 = nn.Linear(64,32)
#         self.drop1 =  nn.Dropout(0.5)
#         self.fc_out = nn.Linear(64, 1)
        
#     def forward(self, input1):
        
#         sub_layer_conv = self.conv_att(input1)

#         sub_layer_pool = self.pooling1(sub_layer_conv)
        
#         drop_pool_sub = self.drop3(sub_layer_pool) 
        
#         sub_layer_conv = self.conv1(sub_layer_pool)
        
#         drop_sub = self.drop3(sub_layer_conv)
        
#         sub_layer = self.pooling2(drop_sub)
        
#         sub_layer_att = self.conv_att2(sub_layer)
        
#         max_pool = self.max(sub_layer_att)

#         gru1_out, gru1_hidden = self.bgru(max_pool.reshape(max_pool.size(0),1,96))
        
#         drop_comb = self.drop2(gru1_out)
#         dense1 = F.relu(self.fc1(drop_comb))
#         drop = self.drop1(dense1)
#         #layer = F.relu(self.fc2(drop))
#         #drop_f = self.drop1(layer)
#         #layer2 = F.relu(self.fc3(drop))
#         #drop_f2 = self.drop1(layer2)
#         dense2 = self.fc_out(drop)
        
#         return dense2

In [None]:
# Instantiate the single-input model and move its parameters to the selected device.
# NOTE(review): the "All layers" training / validation / test cells below call mymodel
# with three tensors, which matches ConvolutionalAttention.forward rather than
# simple_attention.forward — confirm which model each experiment is meant to use.
mymodel = simple_attention()
mymodel.to(device)

In [None]:
pytorch_total_params = sum(p.numel() for p in mymodel.parameters())

## Dataloader

In [None]:
# Parent of the current working directory.
ROOT_DIR = os.path.dirname(os.path.abspath('.')) 
# Build the path with os.path.join instead of hard-coded backslashes so that it is valid
# on every OS. The trailing '' keeps the trailing separator that later string
# concatenations (data_dir + ...) rely on; on Windows the result is unchanged.
data_dir = os.path.join(ROOT_DIR, 'data', 'new_approach', 'train', 'sarcasm_word', '')

In [None]:
# Sub-directories holding one tensor file per sample. os.path.join replaces the
# hard-coded backslashes so the paths are valid on every OS; the trailing '' keeps
# the trailing separator of the previous string-built paths.
features_dir = os.path.join(data_dir, 'features', '')
label_dir = os.path.join(data_dir, 'labels', '')

In [None]:
def sort_number(elem):
    """Return the integer id encoded in a tensor file name such as ``'123_.pt'``.

    Used as a sort key so that files are ordered numerically (1, 2, 10) rather
    than lexicographically (1, 10, 2).

    Args:
        elem: file name ending in ``<digits>_.pt``.

    Returns:
        The digits immediately before ``_.pt`` as an int.

    Raises:
        ValueError: if ``elem`` does not end in ``<digits>_.pt``.
    """
    # Raw string with an escaped dot and at least one digit: the previous pattern used
    # an invalid "\d" escape in a normal string, let "." match any character and let
    # "\d*" match an empty id.
    match = re.search(r'(\d+)_\.pt$', elem)
    if match is None:
        raise ValueError('Unexpected tensor file name: {!r}'.format(elem))
    return int(match.group(1))

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset of per-sample tensors stored as individual files in two directories.

    Feature files and label files are each sorted by the numeric id in their
    file name (see ``sort_number``) and then paired by position.

    Args:
        root_feat: directory containing one feature file per sample.
        root_label: directory containing one label file per sample.

    Raises:
        ValueError: if the two directories do not contain the same number of files,
            because positional pairing would then silently mismatch samples and labels.
    """

    def __init__(self, root_feat, root_label):
        self.files = os.listdir(root_feat)
        self.labels = os.listdir(root_label)
        self.files.sort(reverse=False, key=sort_number)
        self.labels.sort(reverse=False, key=sort_number)
        if len(self.files) != len(self.labels):
            raise ValueError(
                'Found {} feature files in {!r} but {} label files in {!r}'.format(
                    len(self.files), root_feat, len(self.labels), root_label))
        
        self.root_feat = root_feat
        self.root_label = root_label
        
    def __len__(self):
        """Number of samples, i.e. number of feature files."""
        return len(self.files)
    
    def __getitem__(self, idx):
        """Load and return the ``(features, label)`` pair at position ``idx``."""
        sample = torch.load(os.path.join(self.root_feat, self.files[idx])) # load the features of this sample
        label = torch.load(os.path.join(self.root_label, self.labels[idx]))
        return sample, label

In [None]:
# Wrap the on-disk feature/label files and iterate over them one sample at a time,
# in shuffled order, loading in the main process.
dataset = MyDataset(root_feat=features_dir, root_label=label_dir)
trainloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
    num_workers=0,
)

In [None]:
data_iter = iter(trainloader)

In [None]:
a, b = next(data_iter)

In [None]:
# def find_file(data_dir, idx):
#     for i in os.listdir(data_dir):
#         if i.startswith('id_{}_init_'.format(idx)):
#             tensor_init = torch.load(data_dir+i)
#             y = int(re.findall('(\w{4,10})_(\d)', i)[0][1])
#         if i.startswith('id_{}_middle_'.format(idx)):
#             tensor_middle = torch.load(data_dir+i)
#         if i.startswith('id_{}_last_'.format(idx)):
#             tensor_last = torch.load(data_dir+i)
#     yield tensor_init, tensor_middle, tensor_last, y

In [None]:
# def Dataloader(data_dir, batch_size):
#     folder = os.listdir(data_dir)
#     n_batches_per_epoch = int(len(folder)/3)//batch_size
#     for i in range(n_batches_per_epoch):
#         idx = list(range(int(len(folder)/3))[batch_size*i:batch_size*(i+1)])
#         idx_batch = range(batch_size)
#         batch_initial = torch.zeros((batch_size,4,1,768))
#         batch_middle = torch.zeros((batch_size,4,1,768))
#         batch_last = torch.zeros((batch_size,4,1,768))
#         y_target = []
#         for j in idx_batch:
#             try:
#                 tensor_init, tensor_middle, tensor_last, y = next(find_file(data_dir, idx[j]))
#                 y_target.append(y)
#                 batch_initial[j,:] = tensor_init
#                 batch_middle[j,:] = tensor_middle
#                 batch_last[j,:] = tensor_last
#             except StopIteration:
#                 batch_initial = batch_initial[:j,:]
#                 batch_middle = batch_middle[:j,:]
#                 batch_last = batch_last[:j,:]
#                 break
#         ground_truth = torch.tensor(y_target, dtype = torch.float)
#         y_target = []
#         yield batch_initial, batch_middle, batch_last, torch.unsqueeze(ground_truth,1)

In [None]:
# #data loader with tenor on ram 
# def ramloader(batch_size, ground_truth, batch_initial,batch_middle,batch_last):
#     n_batches_per_epoch = ground_truth.shape[0]//batch_size
#     for i in range(n_batches_per_epoch):
#         idx = list(range(ground_truth.shape[0])[batch_size*i:batch_size*(i+1)])
#         try:
#             y_target = ground_truth[idx]
#             batch_init = batch_initial[idx,:]
#             batch_mid = batch_middle[idx,:]
#             batch_la = batch_last[idx, :]
#         except StopIteration:
#             batch_init = batch_initial[:idx[-1]+1,:]
#             batch_mid = batch_middle[:idx[-1]+1,:]
#             batch_la = batch_last[:idx[-1]+1,:]
#             break    
#         yield batch_init, batch_mid, batch_la, torch.unsqueeze(y_target,1)

In [None]:
# def ramloader_light(batch_size, ground_truth,batch_middle):
#     n_batches_per_epoch = ground_truth.shape[0]//batch_size
#     for i in range(n_batches_per_epoch):
#         idx = list(range(ground_truth.shape[0])[batch_size*i:batch_size*(i+1)])
#         try:
#             y_target = ground_truth[idx]
#             batch_mid = batch_middle[idx,:]
#         except StopIteration:
#             batch_mid = batch_middle[:idx[-1]+1,:]
#             break    
#         yield batch_mid, torch.unsqueeze(y_target,1)

In [None]:
# import time 
# start_time = time.time()
# batch_initial, batch_middle, batch_last, y = next(Dataloader(data_dir, 16))
# print("--- Batch 16: %s seconds ---" % (time.time() - start_time))

In [None]:
# start_time = time.time()
# batch_initial, batch_middle, batch_last, y = next(Dataloader(data_dir, 32))
# print("--- Batch 32: %s seconds ---" % (time.time() - start_time))

In [None]:
# start_time = time.time()
# batch_initial, batch_middle, batch_last, y = next(Dataloader(data_dir, 64))
# print("--- Batch 64: %s seconds ---" % (time.time() - start_time))

### Load all on ram

In [None]:
# In-memory variant of the sentence-embedding extraction: instead of writing one file
# per tweet, fill three preallocated tensors (initial / middle / last layer groups)
# with one 4 x 1 x 768 slice per tweet, and collect the labels in y_target.
y_target = []
batch_initial = torch.zeros((len(input_ids),4,1,768))
batch_middle = torch.zeros((len(input_ids),4,1,768))
batch_last = torch.zeros((len(input_ids),4,1,768))
    
with torch.no_grad():
    
    for i in tqdm(range(len(input_ids))):
        
        features = bertweet(input_ids[i]) #extract sentence embedding 1 x 768 for each document
        
        # Mean over the token axis (dim=1), reshaped to a single 1 x 768 row per layer.
        sentence_emb_1 = torch.mean(features[2][1], dim=1).view(1, -1) #layer 1 
        sentence_emb_2 = torch.mean(features[2][2], dim=1).view(1, -1)
        sentence_emb_3 = torch.mean(features[2][3], dim=1).view(1, -1)
        sentence_emb_4 = torch.mean(features[2][4], dim=1).view(1, -1)
        sentence_emb_5 = torch.mean(features[2][5], dim=1).view(1, -1)
        sentence_emb_6 = torch.mean(features[2][6], dim=1).view(1, -1)
        sentence_emb_7 = torch.mean(features[2][7], dim=1).view(1, -1)
        sentence_emb_8 = torch.mean(features[2][8], dim=1).view(1, -1)
        sentence_emb_9 = torch.mean(features[2][9], dim=1).view(1, -1)
        sentence_emb_10 = torch.mean(features[2][10], dim=1).view(1, -1)
        sentence_emb_11 = torch.mean(features[2][11], dim=1).view(1, -1)
        sentence_emb_12 = torch.mean(features[2][12], dim=1).view(1, -1) #layer 12
        # B x C x H x W, 1 x 4 x 1 x 768: the four layers of a group act as the channels
        sub_layers_initial = torch.stack((sentence_emb_1, sentence_emb_2, sentence_emb_3, sentence_emb_4), dim= 1).reshape(1,4,1,768)  #add batch dimension
        sub_layers_middle = torch.stack((sentence_emb_5, sentence_emb_6, sentence_emb_7, sentence_emb_8), dim= 1).reshape(1,4,1,768)
        sub_layers_last = torch.stack((sentence_emb_9, sentence_emb_10, sentence_emb_11, sentence_emb_12), dim= 1).reshape(1,4,1,768)
              
        # Store this tweet's three stacks at row i of the preallocated tensors.
        batch_initial[i,:] = sub_layers_initial
        batch_middle[i,:] = sub_layers_middle
        batch_last[i,:] = sub_layers_last
        
        y_target.append(data.label.iloc[i])

# Labels as a float tensor, in the same row order as the feature tensors.
ground_truth = torch.tensor(y_target, dtype = torch.float)  

In [None]:
# torch.save(batch_initial.float().clone(), '../data/new_approach/train/sarcasm/init_layer.pt')
# torch.save(batch_middle.float().clone(), '../data/new_approach/train/sarcasm/middle_layer.pt')
# torch.save(batch_last.float().clone(), '../data/new_approach/train/sarcasm/last_layer.pt')
# torch.save(ground_truth.float().clone(), '../data/new_approach/train/sarcasm/y_train.pt')

## Training phase

In [None]:
# Load the cached feature tensors and labels from disk, replacing the in-memory versions.
# NOTE(review): the cell that would write these four files is commented out above,
# so they must already exist on disk for this cell to run.
batch_initial = torch.load('../data/new_approach/train/sarcasm/init_layer.pt')
batch_middle = torch.load( '../data/new_approach/train/sarcasm/middle_layer.pt')
batch_last = torch.load('../data/new_approach/train/sarcasm/last_layer.pt')
ground_truth = torch.load('../data/new_approach/train/sarcasm/y_train.pt')

In [None]:
# Training split: every row from position 15713 onwards (the first 15713 rows are
# held out for validation in the next cell).
batch_initial_train =  batch_initial[15713:]
batch_middle_train = batch_middle[15713:]
batch_last_train = batch_last[15713:]
ground_truth_train = ground_truth[15713:]

In [None]:
# Validation split: the first 15713 rows of each tensor.
batch_initial_val = batch_initial[:15713]
batch_middle_val = batch_middle[:15713]
batch_last_val = batch_last[:15713]
ground_truth_val = ground_truth[:15713]

In [None]:
# Binary cross-entropy on raw logits (the models do not apply a sigmoid themselves),
# optimised with Adam over all parameters of mymodel.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(mymodel.parameters(), lr=0.001)

In [None]:
from sklearn.metrics import accuracy_score
def get_accuracy(output, actual):
    """
    Return the accuracy of the model on the input data and actual ground truth.

    Args:
        output: tensor of raw logits, any shape; one value per sample.
        actual: tensor of 0/1 ground-truth labels with the same number of elements.

    Returns:
        Fraction of samples whose thresholded prediction (sigmoid(output) > 0.5)
        equals the label, as a Python float.

    Raises:
        ValueError: if ``output`` and ``actual`` hold a different number of elements.
    """
    prob = torch.sigmoid(output)
    # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a batch of one
    # into a 0-d tensor, which accuracy_score rejected.
    pred = (prob > 0.50).reshape(-1).cpu()
    target = actual.reshape(-1).cpu()
    if pred.numel() != target.numel():
        raise ValueError(
            'output has {} elements but actual has {}'.format(pred.numel(), target.numel()))
    matches = pred.to(target.dtype) == target
    accuracy = float(matches.float().mean())
    return accuracy

In [None]:
def normalize_pred(pred):
    """Flatten a list of per-batch prediction tensors into one 1-D numpy array.

    Args:
        pred: list of tensors, one per batch. 0-d tensors (a batch of a single
            sample after squeeze) are accepted and treated as one prediction.

    Returns:
        1-D numpy array with all predictions in batch order; an empty array when
        ``pred`` is empty.
    """
    if len(pred) == 0:
        return np.empty(0)
    # atleast_1d lets 0-d tensors take part in the concatenation; detach().cpu() makes
    # the conversion safe for tensors that still live on the GPU or carry gradients.
    numpy_list = [np.atleast_1d(i.detach().cpu().numpy()) for i in pred]
    numpy_1vec = np.concatenate(numpy_list).ravel()
    return numpy_1vec

### All layers features

In [None]:
# Train for 10 epochs on the three layer-group tensors with batches of 4.
# NOTE(review): ramloader only exists as a commented-out cell in this notebook and must
# be defined before this cell runs; mymodel must be a model whose forward takes three inputs.
mymodel.train()
accuracy_epoch = []
loss_epoch = []
# Number of full batches per epoch; constant, so computed once outside the loops.
stepsize = int(ground_truth_train.shape[0]//4)
for epoch in range(10):  # loop over the dataset multiple times
    trainloader = ramloader(4, ground_truth_train, batch_initial_train,batch_middle_train,batch_last_train)
    accuracy_step = []
    loss_step = []
    # The loop variable is `batch`, not `data`: reusing `data` would overwrite the tweets
    # DataFrame and break every earlier cell that is re-run afterwards.
    for i, batch in enumerate(trainloader):

        layer_init = batch[0].to(device)
        layer_middle = batch[1].to(device)
        layer_high = batch[2].to(device)
        labels = batch[3].to(device)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = mymodel(layer_init, layer_middle, layer_high)
        loss = criterion(outputs, torch.unsqueeze(labels, -1))
        loss.backward()
        optimizer.step() 
        accuracy = get_accuracy(outputs, labels)
        accuracy_step.append(accuracy)
        # Store a plain float: appending the loss tensor keeps its autograd graph alive
        # for every step of every epoch.
        loss_value = loss.item()
        loss_step.append(loss_value)
        print('Epoch {}, Step {}/{}, Loss: {}, Accuracy: {}'.format(epoch,i,stepsize, loss_value, accuracy), end = '\r')
    mean_accuracy = np.mean(accuracy_step)
    accuracy_epoch.append(mean_accuracy)
    loss_epoch.append(loss_step)
print('Finished Training')

In [None]:
# Evaluate on the validation split. eval() switches off dropout so that predictions are
# deterministic; the training cells call mymodel.train() again themselves.
mymodel.eval()
with torch.no_grad():
    prediction_val = []
    valoader = ramloader(16, ground_truth_val, batch_initial_val, batch_middle_val, batch_last_val)
    accuracy_step = []
    loss_step = []
    # `batch`, not `data`, so that the tweets DataFrame is not overwritten.
    for i, batch in enumerate(valoader):

        layer_init = batch[0].to(device)
        layer_mid = batch[1].to(device)
        layer_last = batch[2].to(device)
        labels = batch[3].to(device)

        outputs = mymodel(layer_init, layer_mid, layer_last)
        accuracy = get_accuracy(outputs, labels)
        # Hard 0/1 predictions at a 0.5 probability threshold, kept for the report below.
        prob = torch.sigmoid(outputs)
        pred = torch.squeeze((prob > 0.50).type(torch.FloatTensor))
        prediction_val.append(pred.cpu())
        accuracy_step.append(accuracy)
    mean_accuracy = np.mean(accuracy_step)

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped, precision
# and recall are exchanged and the support column counts predictions instead of labels.
val_pred = normalize_pred(prediction_val)
# Only full batches are predicted, so align the labels to the number of predictions
# (15712 for the 15713-row validation split with batches of 16).
print(classification_report(ground_truth_val[:len(val_pred)], val_pred))

### Less features

In [None]:
# Train for 10 epochs on the last-layer-group tensor only, with batches of 32.
# NOTE(review): ramloader_light only exists as a commented-out cell in this notebook and
# must be defined before this cell runs; mymodel must be a single-input model.
mymodel.train()
accuracy_epoch = []
loss_epoch = []
# Number of full batches per epoch. Computed from the training split (it used to be
# derived from the full ground_truth, which overstated the step count).
stepsize = int(ground_truth_train.shape[0]//32)
for epoch in range(10):  # loop over the dataset multiple times
    trainloader = ramloader_light(32, ground_truth_train, batch_last_train)
    accuracy_step = []
    loss_step = []
    # The loop variable is `batch`, not `data`: reusing `data` would overwrite the tweets
    # DataFrame and break every earlier cell that is re-run afterwards.
    for i, batch in enumerate(trainloader):

        layer_init = batch[0].to(device)
        labels = batch[1].to(device)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = mymodel(layer_init)
        loss = criterion(outputs, torch.unsqueeze(labels, -1))
        loss.backward()
        optimizer.step() 
        accuracy = get_accuracy(outputs, labels)
        accuracy_step.append(accuracy)
        # Store a plain float: appending the loss tensor keeps its autograd graph alive
        # for every step of every epoch.
        loss_value = loss.item()
        loss_step.append(loss_value)
        print('Epoch {}, Step {}/{}, Loss: {}, Accuracy: {}'.format(epoch,i,stepsize, loss_value, accuracy), end = '\r')
    mean_accuracy = np.mean(accuracy_step)
    accuracy_epoch.append(mean_accuracy)
    loss_epoch.append(loss_step)
print('Finished Training')

In [None]:
# Evaluate the single-input model on the validation split. eval() switches off dropout
# so that predictions are deterministic.
mymodel.eval()
with torch.no_grad():
    prediction_val = []
    valoader = ramloader_light(16, ground_truth_val,batch_last_val)
    accuracy_step = []
    loss_step = []
    # `batch`, not `data`, so that the tweets DataFrame is not overwritten.
    for i, batch in enumerate(valoader):

        layer_init = batch[0].to(device)
        labels = batch[1].to(device)

        outputs = mymodel(layer_init)
        accuracy = get_accuracy(outputs, labels)
        # Hard 0/1 predictions at a 0.5 probability threshold, kept for the report below.
        prob = torch.sigmoid(outputs)
        pred = torch.squeeze((prob > 0.50).type(torch.FloatTensor))
        prediction_val.append(pred.cpu())
        accuracy_step.append(accuracy)
    mean_accuracy = np.mean(accuracy_step)

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped, precision
# and recall are exchanged and the support column counts predictions instead of labels.
val_pred = normalize_pred(prediction_val)
# Only full batches are predicted, so align the labels to the number of predictions
# (15712 for the 15713-row validation split with batches of 16).
print(classification_report(ground_truth_val[:len(val_pred)], val_pred))

### Test set

In [None]:
# Held-out test tweets; the cells below read its `text` and `labels` columns
# (note: `labels`, whereas the training frame uses `label`).
test = pd.read_csv('../data/Riloff_twitter/riloff_sarc_train_test.csv')

In [None]:
# One 1 x n_tokens tensor of token ids per test tweet, each truncated to at most 50 tokens.
# This replaces the training-set input_ids defined earlier.
input_ids = [
    torch.tensor([tokenizer.encode(tweet, truncation=True, max_length=50)])
    for tweet in test.text
]

In [None]:
# with torch.no_grad():
    
#     for i in tqdm(range(len(input_ids))):
        
#         features = bertweet(input_ids[i]) 
#         sentence_emb_9 = zero_padding(features[2][9])
#         sentence_emb_10 = zero_padding(features[2][10])
#         sentence_emb_11 = zero_padding(features[2][11])
#         sentence_emb_12 = zero_padding(features[2][12]) 

#         sub_layers_last = torch.stack((sentence_emb_9, sentence_emb_10, sentence_emb_11, sentence_emb_12), dim= 1)
        
#         torch.save(sub_layers_last.float().clone(), '../data/new_approach/test/sarcasm_word/features/{}_.pt'.format(i))
#         torch.save(torch.tensor(test.labels.iloc[i], dtype = torch.float), '../data/new_approach/test/sarcasm_word/labels/{}_.pt'.format(i)) 

In [None]:
# Same sentence-embedding extraction as for the training data, applied to the test
# tweets. The three feature tensors reuse the names batch_initial / batch_middle /
# batch_last and therefore replace the training tensors held in memory.
y_test = []
batch_initial = torch.zeros((len(input_ids),4,1,768))
batch_middle = torch.zeros((len(input_ids),4,1,768))
batch_last = torch.zeros((len(input_ids),4,1,768))
    
with torch.no_grad():
    
    for i in tqdm(range(len(input_ids))):
        
        features = bertweet(input_ids[i]) #extract sentence embedding 1 x 768 for each document
        
        # Mean over the token axis (dim=1), reshaped to a single 1 x 768 row per layer.
        sentence_emb_1 = torch.mean(features[2][1], dim=1).view(1, -1) #layer 1 
        sentence_emb_2 = torch.mean(features[2][2], dim=1).view(1, -1)
        sentence_emb_3 = torch.mean(features[2][3], dim=1).view(1, -1)
        sentence_emb_4 = torch.mean(features[2][4], dim=1).view(1, -1)
        sentence_emb_5 = torch.mean(features[2][5], dim=1).view(1, -1)
        sentence_emb_6 = torch.mean(features[2][6], dim=1).view(1, -1)
        sentence_emb_7 = torch.mean(features[2][7], dim=1).view(1, -1)
        sentence_emb_8 = torch.mean(features[2][8], dim=1).view(1, -1)
        sentence_emb_9 = torch.mean(features[2][9], dim=1).view(1, -1)
        sentence_emb_10 = torch.mean(features[2][10], dim=1).view(1, -1)
        sentence_emb_11 = torch.mean(features[2][11], dim=1).view(1, -1)
        sentence_emb_12 = torch.mean(features[2][12], dim=1).view(1, -1) #layer 12
        # B x C x H x W, 1 x 4 x 1 x 768: the four layers of a group act as the channels
        sub_layers_initial = torch.stack((sentence_emb_1, sentence_emb_2, sentence_emb_3, sentence_emb_4), dim= 1).reshape(1,4,1,768)  #add batch dimension
        sub_layers_middle = torch.stack((sentence_emb_5, sentence_emb_6, sentence_emb_7, sentence_emb_8), dim= 1).reshape(1,4,1,768)
        sub_layers_last = torch.stack((sentence_emb_9, sentence_emb_10, sentence_emb_11, sentence_emb_12), dim= 1).reshape(1,4,1,768)
              
        # Store this tweet's three stacks at row i of the preallocated tensors.
        batch_initial[i,:] = sub_layers_initial
        batch_middle[i,:] = sub_layers_middle
        batch_last[i,:] = sub_layers_last
        
        y_test.append(test.labels.iloc[i])

# Test labels as a float tensor, in the same row order as the feature tensors.
ground_test = torch.tensor(y_test, dtype = torch.float)  

In [None]:
# Evaluate the three-input model on the test set. eval() switches off dropout so that
# predictions are deterministic.
mymodel.eval()
with torch.no_grad():
    prediction_val = []
    valoader = ramloader(16, ground_test,batch_initial, batch_middle, batch_last)
    accuracy_step = []
    loss_step = []
    # `batch`, not `data`, so that the tweets DataFrame is not overwritten.
    for i, batch in enumerate(valoader):

        layer_init = batch[0].to(device)
        layer_mid = batch[1].to(device)
        layer_last = batch[2].to(device)
        labels = batch[3].to(device)

        outputs = mymodel(layer_init, layer_mid, layer_last)
        accuracy = get_accuracy(outputs, labels)
        # Hard 0/1 predictions at a 0.5 probability threshold, kept for the report below.
        prob = torch.sigmoid(outputs)
        pred = torch.squeeze((prob > 0.50).type(torch.FloatTensor))
        prediction_val.append(pred.cpu())
        accuracy_step.append(accuracy)
    mean_accuracy = np.mean(accuracy_step)

### Less features test

In [None]:
# Evaluate the single-input model on the test set, two samples per batch. eval()
# switches off dropout so that predictions are deterministic.
mymodel.eval()
with torch.no_grad():
    valoader = ramloader_light(2, ground_test,batch_last)
    accuracy_step = []
    loss_step = []
    prediction_test = []
    # `batch`, not `data`, so that the tweets DataFrame is not overwritten.
    for i, batch in enumerate(valoader):

        layer_init = batch[0].to(device)
        labels = batch[1].to(device)
       
        outputs = mymodel(layer_init)
        accuracy = get_accuracy(outputs, labels)
        
        # Hard 0/1 predictions at a 0.5 probability threshold.
        prob = torch.sigmoid(outputs)
        pred = torch.squeeze((prob > 0.5).type(torch.FloatTensor))
        prediction_test.append(pred.cpu())
        accuracy_step.append(accuracy)
    mean_accuracy = np.mean(accuracy_step)

In [None]:

# classification_report expects (y_true, y_pred); with the arguments swapped, precision
# and recall are exchanged and the support column counts predictions instead of labels.
# NOTE(review): the "less features" test cell fills prediction_test, while this report
# reads prediction_val (filled by the all-layers test cell) — confirm which run is meant.
test_pred = normalize_pred(prediction_val)
# Only full batches are predicted; .iloc aligns the labels to the predictions by position.
print(classification_report(test.labels.iloc[:len(test_pred)], test_pred))

In [None]:
# f1_score expects (y_true, y_pred). The order matters for average='weighted', whose
# weights are the per-class counts in y_true.
# The labels are cut to the first 1952 rows, matching the number of predictions produced
# from full batches only.
f1_score(test.labels.iloc[:1952], normalize_pred(prediction_val), average='weighted')