In [None]:
from transformers import AutoModel, AutoTokenizer 
import torch
import pickle 
import numpy as np
import pandas as pd 
import re
from tqdm import tqdm
import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from torch import nn
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn.functional as F
import torch.optim as optim
import time
from sklearn.metrics import classification_report
from ark_tweet_pos import CMUTweetTagger
import shlex
# Shell command line that launches the ARK Tweet NLP 0.3.2 jar with a 500 MB max heap
# and 10 parallel GC threads.
# NOTE(review): this variable is not referenced anywhere else in the visible notebook —
# presumably it is meant to be handed to CMUTweetTagger; confirm or remove.
run_tagger_cmd = "java -XX:ParallelGCThreads=10 -Xmx500m -jar ark_tweet_pos/ark-tweet-nlp-0.3.2.jar"
import FeaturesText
import wandb
# Authenticate with Weights & Biases.
# SECURITY: never paste the API key into the notebook (not even as a comment).
# wandb.login() reads it from the WANDB_API_KEY environment variable or from a
# previous `wandb login`, and prompts for it otherwise. The key that used to be
# written here has been exposed in the file history and should be revoked.
wandb.login()

In [None]:
# Load the pretrained BERTweet encoder and its tokenizer.
# output_hidden_states=True makes the model return the per-layer hidden states,
# which the layer-extraction cells below index as features[2].
bertweet = AutoModel.from_pretrained(
    "vinai/bertweet-base",
    output_hidden_states=True,
)
# normalization=True is forwarded to the BERTweet tokenizer.
tokenizer_bert = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    normalization=True,
)

In [None]:
# Raw sarcasm training tweets.
# NOTE(review): absolute, machine-specific path — it will not resolve on another machine.
data = pd.read_csv(
    r'D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/data/final_sarc_trainingset_twitter.csv'
)

## Extract POS tags and embedding layers from BERTweet (training)

In [None]:
# Strip hashtags: a '#' followed by any run of characters other than whitespace or ':'.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave every hashtag in place.
data['text'] = data['text'].str.replace(r'#([^\s:]+)', '', regex=True)
# reset_index() keeps the previous index as a new 'index' column.
data = data.reset_index()

In [None]:
# Configure the project preprocessing step for the training tweets
# (hashtags and mentions removed, text lower-cased, ARK POS tagging enabled).
txt_file = FeaturesText.preprocessing_text(
    data,
    remove_hashtags=True,
    remove_mentions=True,
    lowercase=True,
    arktweet_pos=True,
)

# Run the cleaning pass and print how many seconds it took.
start = time.time()
train_txt = txt_file.get_clean_df()
end = time.time()
print(end - start)

In [None]:
# Drop tweets whose POS column renders as an empty list ('[]').
# Only the 'pos' column is cast to str; casting the whole frame (as before) gives the
# same mask but needlessly stringifies every other column.
train_txt = train_txt[train_txt['pos'].astype(str) != '[]']

In [None]:
# Wrap the cleaned training tweets in the project feature extractor (no SVD transform).
final_train_txt = FeaturesText.ExtractFeatures(
    train_txt,
    'other',
    svd_transform=False,
)

In [None]:
# Compute all hand-crafted training features in one call; the extractor returns six
# objects, unpacked here in the order it returns them.
(
    pos,
    punctuation_features,
    emoji_features,
    onomato_features,
    initialism_features,
    polarity_subj_features,
) = final_train_txt.get_all_features_train(ngram_range=(1, 1), dimensionality=100)

In [None]:
# Bundle every feature block together with the labels (as a NumPy array) under short keys.
train_features = dict(
    pos=pos,
    polarity=polarity_subj_features,
    emoji=emoji_features,
    punc=punctuation_features,
    onom=onomato_features,
    init=initialism_features,
    label=np.asarray(train_txt['label'].tolist()),
)

In [None]:
import pickle

# Serialise the feature dictionary with the newest pickle protocol available.
with open('../data/features_training_sarc_twitter_new_approach.p', 'wb') as out_file:
    pickle.dump(
        train_features,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Already-cleaned version of the sarcasm training set.
# NOTE(review): absolute, machine-specific path — it will not resolve on another machine.
df = pd.read_csv(
    r'D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/data/final_sarc_trainingset_twitter_cleaned.csv'
)

In [None]:
# Split each tweet on single spaces, record how many pieces it has,
# and keep only tweets with more than 4 pieces.
df['list'] = df['text'].map(lambda tweet: tweet.split(' '))
df['len_list'] = df['list'].str.len()
df = df.loc[df['len_list'] > 4]

In [None]:
# Renumber the rows after filtering; the old index is kept as an 'index' column.
df = df.reset_index()

In [None]:
import ast

In [None]:
# Number of POS tags per tweet: the 'pos' column stores list literals as strings,
# so each entry is parsed before its length is taken.
ln = np.array([len(tags) for tags in map(ast.literal_eval, df['pos'])])

In [None]:
import matplotlib.pyplot as plt
# Histogram of the per-tweet POS-sequence lengths computed in `ln`.
plt.hist(ln)

In [None]:
# Fit a Keras tokenizer on the stringified POS sequences of the training tweets and
# convert each sequence into a list of integer ids. The same fitted `tokenizer` is
# reused later for the Riloff and Ghosh test sets.
# NOTE(review): Tokenizer's default `filters` strip punctuation characters; if the
# 'pos' column holds raw ARK tags (which include symbols such as '^', '!', '&', '@'),
# those tags are presumably dropped here — confirm against FeaturesText output.
tokenizer = Tokenizer(num_words=30)
tokenizer.fit_on_texts(train_txt['pos'].astype(str))
sequences_pos = tokenizer.texts_to_sequences(train_txt['pos'].astype(str))

In [None]:
# Pad/truncate every POS id sequence at the end to exactly 30 entries,
# add a singleton axis at position 1, and save the result as a float tensor.
data_pos = pad_sequences(
    sequences_pos,
    maxlen=30,
    padding='post',
    truncating='post',
)
pos_tensor = torch.tensor(data_pos, dtype=torch.float).unsqueeze(1)
torch.save(
    pos_tensor.float().clone(),
    '../data/new_approach/train/sarcasm/pos_tensor.pt',
)

# Data augmentation

In [None]:
from textattack.augmentation import WordNetAugmenter
from textattack.augmentation import EmbeddingAugmenter

In [None]:
# WordNet-based text augmenter configured with pct_words_to_swap=0.6.
# NOTE(review): no random seed is set in this notebook, so the augmented text is
# presumably not reproducible between runs — confirm.
augmenter = WordNetAugmenter(pct_words_to_swap=0.6)

In [None]:
# Augment every tweet from position 7000 onwards (the first 7000 rows are skipped)
# and collect the augmented text together with the label of its source tweet.
label_list = []
text_aug = []
for i in tqdm(range(len(df))):
    if i < 7000:
        continue
    aug = augmenter.augment(df.text.iloc[i])
    label = df.label.iloc[i]
    # Whatever the augmenter returns is joined into one space-separated string.
    text_aug.append(' '.join(str(piece) for piece in aug))
    label_list.append(label)

In [None]:
# One-column frame holding the augmented tweets.
augmented = pd.DataFrame({'text': text_aug}, columns=['text'])

In [None]:
# Attach the labels collected alongside the augmented tweets.
augmented = augmented.assign(label=label_list)

In [None]:
# Keep only the two columns shared with the augmented frame.
df = df.loc[:, ['text', 'label']]

In [None]:
# Original tweets first, augmented tweets after; reset_index() keeps the old
# per-frame index as an 'index' column.
train_txt = pd.concat((df, augmented), axis=0).reset_index()

In [None]:
# Write the combined (original + augmented) training set to disk.
# NOTE(review): this runs before the 'index' column is dropped in the next cell, so
# the CSV contains that column as well as the DataFrame index written by to_csv —
# confirm this is intended.
train_txt.to_csv('../data/new_approach/augmented_sarcasm_training.csv')

In [None]:
# Remove the leftover 'index' column produced by reset_index().
train_txt = train_txt.drop(columns='index')

In [None]:
# Encode every tweet of `df` into a (1, seq_len) tensor of BERTweet token ids.
# truncation/max_length match the Ghosh test-set encoding further down: without them
# a tweet that tokenizes to more positions than the model supports makes the forward
# pass fail.
# NOTE(review): this encodes `df.text` (original tweets only), not the augmented
# `train_txt` — confirm that the augmented rows are meant to be left out.
input_ids = [
    torch.tensor([tokenizer_bert.encode(tweet, truncation=True, max_length=128)])
    for tweet in df.text
]

### Sentence layer

In [None]:
# One 1 x 768 sentence vector per encoded tweet, plus the matching label.
batch_sentence = torch.zeros((len(input_ids), 1, 768))
y_target = []

with torch.no_grad():
    for i, ids in enumerate(tqdm(input_ids)):
        features = bertweet(ids)
        # features[1] is stored as the sentence embedding (presumably the pooled output).
        batch_sentence[i, :] = features[1]
        y_target.append(train_txt.label.iloc[i])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save float copies of the sentence embeddings and their labels.
for _tensor, _path in (
    (batch_sentence, '../data/new_approach/train/sarcasm/sentence_layer.pt'),
    (ground_truth, '../data/new_approach/train/sarcasm/y_train_sentence.pt'),
):
    torch.save(_tensor.float().clone(), _path)

In [None]:
# Per-layer BERTweet features, split by position: the first 7000 tweets fill the
# validation buffers, every later tweet fills the training buffers.
y_target = []
y_val = []

# Training buffers: one (4, 1, 768) stack per tweet for each group of four layers.
batch_initial = torch.zeros((len(input_ids) - 7000, 4, 1, 768))
batch_middle = torch.zeros((len(input_ids) - 7000, 4, 1, 768))
batch_last = torch.zeros((len(input_ids) - 7000, 4, 1, 768))

# Validation buffers, same layout.
batch_initial_val = torch.zeros((7000, 4, 1, 768))
batch_middle_val = torch.zeros((7000, 4, 1, 768))
batch_last_val = torch.zeros((7000, 4, 1, 768))

index = 0  # next free row in the training buffers

with torch.no_grad():
    for i, ids in enumerate(tqdm(input_ids)):
        features = bertweet(ids)

        # Average hidden states 1..12 over dim 1 (presumably the token axis),
        # giving one 1 x 768 vector per layer.
        layer_means = [
            torch.mean(features[2][layer], dim=1).view(1, -1)
            for layer in range(1, 13)
        ]

        # Group the twelve layer vectors as 1-4, 5-8, 9-12 and add a batch dimension.
        sub_layers_initial = torch.stack(layer_means[0:4], dim=1).reshape(1, 4, 1, 768)
        sub_layers_middle = torch.stack(layer_means[4:8], dim=1).reshape(1, 4, 1, 768)
        sub_layers_last = torch.stack(layer_means[8:12], dim=1).reshape(1, 4, 1, 768)

        if i >= 7000:
            batch_initial[index, :] = sub_layers_initial
            batch_middle[index, :] = sub_layers_middle
            batch_last[index, :] = sub_layers_last
            y_target.append(df.label.iloc[i])
            index += 1
        else:
            batch_initial_val[i, :] = sub_layers_initial
            batch_middle_val[i, :] = sub_layers_middle
            batch_last_val[i, :] = sub_layers_last
            y_val.append(train_txt.label.iloc[i])

ground_val = torch.tensor(y_val, dtype=torch.float)
ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Per-layer BERTweet features for every encoded tweet (no train/validation split).
# NOTE(review): this rebinds batch_initial / batch_middle / batch_last / ground_truth,
# replacing the training-only buffers built by the split cell above; the tensors saved
# as the "train" files afterwards therefore also contain the first 7000 (validation)
# tweets — confirm this is intended.
y_target = []
batch_initial = torch.zeros((len(input_ids), 4, 1, 768))
batch_middle = torch.zeros((len(input_ids), 4, 1, 768))
batch_last = torch.zeros((len(input_ids), 4, 1, 768))

with torch.no_grad():
    for i, ids in enumerate(tqdm(input_ids)):
        features = bertweet(ids)

        # Average hidden states 1..12 over dim 1 (presumably the token axis),
        # giving one 1 x 768 vector per layer.
        layer_means = [
            torch.mean(features[2][layer], dim=1).view(1, -1)
            for layer in range(1, 13)
        ]

        # Group the twelve layer vectors as 1-4, 5-8, 9-12 and add a batch dimension.
        sub_layers_initial = torch.stack(layer_means[0:4], dim=1).reshape(1, 4, 1, 768)
        sub_layers_middle = torch.stack(layer_means[4:8], dim=1).reshape(1, 4, 1, 768)
        sub_layers_last = torch.stack(layer_means[8:12], dim=1).reshape(1, 4, 1, 768)

        batch_initial[i, :] = sub_layers_initial
        batch_middle[i, :] = sub_layers_middle
        batch_last[i, :] = sub_layers_last

        y_target.append(df.label.iloc[i])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save float copies of the three layer-group tensors and the labels (training files).
for _tensor, _path in (
    (batch_initial, '../data/new_approach/train/sarcasm/init_layer.pt'),
    (batch_middle, '../data/new_approach/train/sarcasm/middle_layer.pt'),
    (batch_last, '../data/new_approach/train/sarcasm/last_layer.pt'),
    (ground_truth, '../data/new_approach/train/sarcasm/y_train.pt'),
):
    torch.save(_tensor.float().clone(), _path)

In [None]:
# Save float copies of the validation layer-group tensors and their labels.
for _tensor, _path in (
    (batch_initial_val, '../data/new_approach/train/sarcasm_validation/init_layer.pt'),
    (batch_middle_val, '../data/new_approach/train/sarcasm_validation/middle_layer.pt'),
    (batch_last_val, '../data/new_approach/train/sarcasm_validation/last_layer.pt'),
    (ground_val, '../data/new_approach/train/sarcasm_validation/y_train.pt'),
):
    torch.save(_tensor.float().clone(), _path)

## Extract POS tags and embedding layers from BERTweet (test)

In [None]:
# Riloff test set: a regular CSV with a header row.
riloff = pd.read_csv('../data/Riloff_twitter/riloff_sarc_train_test.csv')

# Ghosh test set: tab-separated, no header row (columns are named in the next cell).
ghosh = pd.read_csv(
    '../data/Ghosh_sarc_tweet/Test_v1.txt',
    sep='\t',
    header=None,
)

In [None]:
# Give the three positional Ghosh columns readable names.
ghosh = ghosh.rename(columns={0: 'training', 1: 'label', 2: 'text'})

In [None]:
# Same preprocessing configuration as for the training set, once per test corpus.
test_txt_file_riloff = FeaturesText.preprocessing_text(
    riloff,
    remove_hashtags=True,
    remove_mentions=True,
    lowercase=True,
    arktweet_pos=True,
)
test_txt_file_ghosh = FeaturesText.preprocessing_text(
    ghosh,
    remove_hashtags=True,
    remove_mentions=True,
    lowercase=True,
    arktweet_pos=True,
)

In [None]:
# Clean the Riloff test tweets and print the elapsed wall-clock time in seconds.
start = time.time()
test_txt_ril = test_txt_file_riloff.get_clean_df()
end = time.time()
print(end - start)

In [None]:
# Clean the Ghosh test tweets and print the elapsed wall-clock time in seconds.
start = time.time()
test_txt_ghos = test_txt_file_ghosh.get_clean_df()
end = time.time()
print(end - start)

In [None]:
# Riloff POS sequences -> ids (tokenizer fitted on the training set) -> fixed length 30
# -> float tensor with a singleton axis at position 1 -> saved to disk.
sequences_pos = tokenizer.texts_to_sequences(test_txt_ril['pos'].astype(str))
data_pos = pad_sequences(
    sequences_pos,
    maxlen=30,
    padding='post',
    truncating='post',
)
pos_tensor = torch.tensor(data_pos, dtype=torch.float).unsqueeze(1)
torch.save(
    pos_tensor.float().clone(),
    '../data/new_approach/test/sarcasm/pos_tensor_riloff.pt',
)

In [None]:
# Ghosh POS sequences -> ids (tokenizer fitted on the training set) -> fixed length 30
# -> float tensor with a singleton axis at position 1 -> saved to disk.
sequences_pos = tokenizer.texts_to_sequences(test_txt_ghos['pos'].astype(str))
data_pos = pad_sequences(
    sequences_pos,
    maxlen=30,
    padding='post',
    truncating='post',
)
pos_tensor = torch.tensor(data_pos, dtype=torch.float).unsqueeze(1)
torch.save(
    pos_tensor.float().clone(),
    '../data/new_approach/test/sarcasm/pos_tensor_ghosh.pt',
)

In [None]:
# Encode every cleaned Riloff tweet into a (1, seq_len) tensor of BERTweet token ids.
# truncation/max_length match the Ghosh encoding below, so that an unusually long
# tweet cannot exceed the model's maximum input length and fail the forward pass.
input_ids = [
    torch.tensor([tokenizer_bert.encode(tweet, truncation=True, max_length=128)])
    for tweet in test_txt_ril.text
]

In [None]:
# One 1 x 768 sentence vector per Riloff test tweet, plus the matching label.
batch_sentence = torch.zeros((len(input_ids), 1, 768))
y_target = []
with torch.no_grad():
    for i in tqdm(range(len(input_ids))):

        features = bertweet(input_ids[i])
        # features[1] is stored as the sentence embedding (presumably the pooled output).
        batch_sentence[i, :] = features[1]
        # Fix: labels must come from the Riloff frame (column `labels`, as in the
        # layer-extraction cell below), not from the training frame `train_txt`,
        # whose rows are unrelated to these inputs.
        y_target.append(test_txt_ril.labels.iloc[i])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save a float copy of the Riloff sentence embeddings.
torch.save(
    batch_sentence.float().clone(),
    '../data/new_approach/test/sarcasm/sentence_layer_riloff.pt',
)

In [None]:
# Per-layer BERTweet features and labels for the Riloff test set.
y_target = []
batch_initial = torch.zeros((len(input_ids), 4, 1, 768))
batch_middle = torch.zeros((len(input_ids), 4, 1, 768))
batch_last = torch.zeros((len(input_ids), 4, 1, 768))

with torch.no_grad():
    for i, ids in enumerate(tqdm(input_ids)):
        features = bertweet(ids)

        # Average hidden states 1..12 over dim 1 (presumably the token axis),
        # giving one 1 x 768 vector per layer.
        layer_means = [
            torch.mean(features[2][layer], dim=1).view(1, -1)
            for layer in range(1, 13)
        ]

        # Group the twelve layer vectors as 1-4, 5-8, 9-12 and add a batch dimension.
        sub_layers_initial = torch.stack(layer_means[0:4], dim=1).reshape(1, 4, 1, 768)
        sub_layers_middle = torch.stack(layer_means[4:8], dim=1).reshape(1, 4, 1, 768)
        sub_layers_last = torch.stack(layer_means[8:12], dim=1).reshape(1, 4, 1, 768)

        batch_initial[i, :] = sub_layers_initial
        batch_middle[i, :] = sub_layers_middle
        batch_last[i, :] = sub_layers_last

        y_target.append(test_txt_ril.labels.iloc[i])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save float copies of the Riloff layer-group tensors and labels.
for _tensor, _path in (
    (batch_initial, '../data/new_approach/test/sarcasm/init_layer_riloff.pt'),
    (batch_middle, '../data/new_approach/test/sarcasm/middle_layer_riloff.pt'),
    (batch_last, '../data/new_approach/test/sarcasm/last_layer_riloff.pt'),
    (ground_truth, '../data/new_approach/test/sarcasm/y_riloff.pt'),
):
    torch.save(_tensor.float().clone(), _path)

In [None]:
# Encode every cleaned Ghosh tweet into a (1, seq_len) tensor of BERTweet token ids,
# truncated to at most 128 ids.
input_ids = [
    torch.tensor([tokenizer_bert.encode(tweet, truncation=True, max_length=128)])
    for tweet in test_txt_ghos.text
]

In [None]:
# One 1 x 768 sentence vector per Ghosh test tweet, plus the matching label.
batch_sentence = torch.zeros((len(input_ids), 1, 768))
y_target = []
with torch.no_grad():
    for i in tqdm(range(len(input_ids))):

        features = bertweet(input_ids[i])
        # features[1] is stored as the sentence embedding (presumably the pooled output).
        batch_sentence[i, :] = features[1]
        # Fix: labels must come from the Ghosh frame (column `label`, as in the
        # layer-extraction cell below), not from the training frame `train_txt`,
        # whose rows are unrelated to these inputs.
        y_target.append(test_txt_ghos.label.iloc[i])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save a float copy of the Ghosh sentence embeddings.
torch.save(
    batch_sentence.float().clone(),
    '../data/new_approach/test/sarcasm/sentence_layer_ghosh.pt',
)

In [None]:
# Per-layer BERTweet features and labels for the Ghosh test set.
y_target = []
batch_initial = torch.zeros((len(input_ids), 4, 1, 768))
batch_middle = torch.zeros((len(input_ids), 4, 1, 768))
batch_last = torch.zeros((len(input_ids), 4, 1, 768))

with torch.no_grad():
    for i, ids in enumerate(tqdm(input_ids)):
        features = bertweet(ids)

        # Average hidden states 1..12 over dim 1 (presumably the token axis),
        # giving one 1 x 768 vector per layer.
        layer_means = [
            torch.mean(features[2][layer], dim=1).view(1, -1)
            for layer in range(1, 13)
        ]

        # Group the twelve layer vectors as 1-4, 5-8, 9-12 and add a batch dimension.
        sub_layers_initial = torch.stack(layer_means[0:4], dim=1).reshape(1, 4, 1, 768)
        sub_layers_middle = torch.stack(layer_means[4:8], dim=1).reshape(1, 4, 1, 768)
        sub_layers_last = torch.stack(layer_means[8:12], dim=1).reshape(1, 4, 1, 768)

        batch_initial[i, :] = sub_layers_initial
        batch_middle[i, :] = sub_layers_middle
        batch_last[i, :] = sub_layers_last

        y_target.append(test_txt_ghos.label.iloc[i])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save float copies of the Ghosh layer-group tensors and labels.
for _tensor, _path in (
    (batch_initial, '../data/new_approach/test/sarcasm/init_layer_ghosh.pt'),
    (batch_middle, '../data/new_approach/test/sarcasm/middle_layer_ghosh.pt'),
    (batch_last, '../data/new_approach/test/sarcasm/last_layer_ghosh.pt'),
    (ground_truth, '../data/new_approach/test/sarcasm/y_ghosh.pt'),
):
    torch.save(_tensor.float().clone(), _path)