In [None]:
from transformers import AutoModel, AutoTokenizer 
import torch
import pickle 
import numpy as np
import pandas as pd 
import re
from tqdm import tqdm
import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from torch import nn
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn.functional as F
import torch.optim as optim
import time
from sklearn.metrics import classification_report
from ark_tweet_pos import CMUTweetTagger
import shlex
# Java command line for the bundled ARK tweet NLP jar (500 MB heap, 10 parallel GC threads).
# NOTE(review): not referenced again in the visible cells; presumably consumed by the
# tagger wrapper / FeaturesText -- confirm before removing.
run_tagger_cmd = "java -XX:ParallelGCThreads=10 -Xmx500m -jar ark_tweet_pos/ark-tweet-nlp-0.3.2.jar"
import FeaturesText
import wandb
# NOTE(review): wandb is not used by any other visible cell; this login may prompt for
# credentials when the notebook is run.
wandb.login()

In [None]:
# Load BERTweet with `output_hidden_states=True` so that every forward pass also returns
# the per-layer hidden states used by the layer-extraction cells below.
bertweet = AutoModel.from_pretrained("vinai/bertweet-base", output_hidden_states=True)

# `truncation` and `max_length` are arguments of the encode call, not of `from_pretrained`,
# and the original `max_lenth` keyword was a typo; all of them were silently ignored.
# `model_max_length` is the load-time setting that records the intended 60-token limit.
tokenizer_bert = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    normalization=True,
    model_max_length=60,
)

In [None]:
# Load the training set; later cells read its `text` and `label` columns.
data = pd.read_csv('../data/final_training_semeval.csv')

## Extract POS tags and embedding layers from BERTweet (training)

In [None]:
# Remove hashtags: a '#' followed by any run of characters other than whitespace or ':'.
# `regex=True` must be explicit -- since pandas 2.0 `Series.str.replace` treats the pattern
# as a literal string by default, which would leave every hashtag in place.
data['text'] = data['text'].str.replace(r'#([^\s:]+)', '', regex=True)
# Renumber the rows; the previous index is kept as a new `index` column.
data = data.reset_index()

In [None]:
# Configure the project preprocessor for the training tweets.
txt_file = FeaturesText.preprocessing_text(
    data,
    remove_hashtags=True,
    remove_mentions=True,
    lowercase=True,
    arktweet_pos=True,
)

# Time the cleaning step and report the elapsed seconds.
start = time.time()
train_txt = txt_file.get_clean_df()
end = time.time()
print(end - start)

In [None]:
# Keep only rows whose `pos` value does not stringify to an empty list.
train_txt = train_txt.loc[train_txt['pos'].astype(str).ne('[]')]

In [None]:
# Build the project feature extractor over the cleaned training tweets.
# NOTE(review): the meaning of the 'other' argument is defined in FeaturesText and is not
# visible from this notebook.
final_train_txt = FeaturesText.ExtractFeatures(train_txt, 'other', svd_transform=False)

In [None]:
# Unpack the six feature groups returned by the extractor, in the order it returns them.
(
    pos,
    punctuation_features,
    emoji_features,
    onomato_features,
    initialism_features,
    polarity_subj_features,
) = final_train_txt.get_all_features_train(ngram_range=(1, 1), dimensionality=100)

In [None]:
# Bundle every feature group with the labels under short keys for pickling.
train_features = dict(
    pos=pos,
    polarity=polarity_subj_features,
    emoji=emoji_features,
    punc=punctuation_features,
    onom=onomato_features,
    init=initialism_features,
    label=np.asarray(train_txt['label'].tolist()),
)

In [None]:
import pickle
# Persist the feature dictionary with the highest available pickle protocol.
with open('../data/features_training_sarc_twitter_new_approach_semeval.p', 'wb') as features_file:
    pickle.dump(train_features, features_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Split each tweet on single spaces and keep only tweets with more than four pieces.
data['list'] = data['text'].map(lambda tweet: tweet.split(' '))
data['len_list'] = data['list'].str.len()
data = data.loc[data['len_list'].gt(4)]

In [None]:
import ast

In [None]:
#ln = np.array([len(ast.literal_eval(i)) for i in data.pos])

In [None]:
# import matplotlib.pyplot as plt
# plt.hist(ln)

In [None]:
# Fit a Keras tokenizer on the stringified POS sequences and convert them to integer ids.
pos_strings = train_txt['pos'].astype(str)
tokenizer = Tokenizer(num_words=30)
tokenizer.fit_on_texts(pos_strings)
sequences_pos = tokenizer.texts_to_sequences(pos_strings)

In [None]:
# Pad or cut every POS id sequence at the end to exactly 30 entries.
data_pos = pad_sequences(
    sequences_pos,
    maxlen=30,
    padding='post',
    truncating='post',
)
# Insert a singleton axis at position 1 and store the result as a float tensor.
pos_tensor = torch.tensor(data_pos, dtype=torch.float).unsqueeze(1)
torch.save(
    pos_tensor.float().clone(),
    '../data/new_approach/train/irony/pos_tensor_semeval.pt',
)

# Data augmentation

In [None]:
from textattack.augmentation import WordNetAugmenter
from textattack.augmentation import EmbeddingAugmenter

In [None]:
# textattack WordNet augmenter used to create extra copies of tweets.
# NOTE(review): presumably pct_words_to_swap=0.6 replaces up to 60% of the words with
# WordNet substitutes -- confirm against the textattack documentation.
augmenter = WordNetAugmenter(pct_words_to_swap=0.6)

In [None]:
# Augment only label-1 tweets located at position 5000 or later.
# NOTE(review): the first 5000 rows look reserved for a validation split (see the
# commented-out extraction cell further down) -- confirm.
label_list = []
text_aug = []
for position in tqdm(range(len(data))):
    label = data.label.iloc[position]
    if position < 5000 or label != 1:
        continue
    aug = augmenter.augment(data.text.iloc[position])
    # The augmenter output is joined into one string with single spaces.
    text_aug.append(' '.join(map(str, aug)))
    label_list.append(label)

In [None]:
# Collect the augmented tweets in a single-column frame named `text`.
augmented = pd.DataFrame(text_aug, columns=['text'])

In [None]:
# Attach the labels gathered alongside the augmented texts (same order as `text_aug`).
augmented['label'] = label_list

In [None]:
# Keep only the two columns shared with `augmented` so the frames can be concatenated.
data = data[['text', 'label']]

In [None]:
# Stack original and augmented tweets. reset_index() keeps the old row labels as an
# `index` column; that column is still present when the CSV is written in a later cell.
# NOTE: this rebinds `train_txt`, which previously held the POS-tagged frame.
train_txt = pd.concat([data, augmented]).reset_index()

In [None]:
# Write the combined original + augmented training set to disk (row index included).
train_txt.to_csv('../data/new_approach/augmented_irony_training.csv')

In [None]:
# Remove the leftover `index` column created by reset_index().
# Rebinding with errors='ignore' keeps this cell idempotent: the original in-place drop
# raised KeyError when the cell was executed a second time.
train_txt = train_txt.drop(columns='index', errors='ignore')

In [None]:
# Encode every training tweet (cut at 60 tokens) into a tensor with a leading batch axis of 1.
input_ids = [
    torch.tensor([
        tokenizer_bert.encode(tweet, truncation=True, padding=True, max_length=60)
    ])
    for tweet in train_txt.text
]

### Sentence layer

In [None]:
# One 1 x 768 slot per training tweet.
batch_sentence = torch.zeros((len(input_ids), 1, 768))
y_target = []
with torch.no_grad():
    for row, token_ids in enumerate(tqdm(input_ids)):
        features = bertweet(token_ids)
        # Output element 1 is stored as the 1 x 768 sentence embedding of the tweet.
        batch_sentence[row, :] = features[1]
        y_target.append(train_txt.label.iloc[row])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save the sentence embeddings and their labels as float tensors.
for tensor_to_save, target_path in (
    (batch_sentence, '../data/new_approach/train/irony/sentence_layer.pt'),
    (ground_truth, '../data/new_approach/train/irony/y_train_sentence.pt'),
):
    torch.save(tensor_to_save.float().clone(), target_path)

In [None]:
# y_target = []
# y_val = []
# batch_initial = torch.zeros((len(input_ids)-5000,4,1,768))
# batch_middle = torch.zeros((len(input_ids)-5000,4,1,768))
# batch_last = torch.zeros((len(input_ids)-5000,4,1,768))

# batch_initial_val = torch.zeros((5000,4,1,768))
# batch_middle_val = torch.zeros((5000,4,1,768))
# batch_last_val = torch.zeros((5000,4,1,768))
# index = 0
    
# with torch.no_grad():
    
#     for i in tqdm(range(len(input_ids))):
        
#         features = bertweet(input_ids[i]) #extract sentence embedding 1 x 768 for each document
        
#         sentence_emb_1 = torch.mean(features[2][1], dim=1).view(1, -1) #layer 1 
#         sentence_emb_2 = torch.mean(features[2][2], dim=1).view(1, -1)
#         sentence_emb_3 = torch.mean(features[2][3], dim=1).view(1, -1)
#         sentence_emb_4 = torch.mean(features[2][4], dim=1).view(1, -1)
#         sentence_emb_5 = torch.mean(features[2][5], dim=1).view(1, -1)
#         sentence_emb_6 = torch.mean(features[2][6], dim=1).view(1, -1)
#         sentence_emb_7 = torch.mean(features[2][7], dim=1).view(1, -1)
#         sentence_emb_8 = torch.mean(features[2][8], dim=1).view(1, -1)
#         sentence_emb_9 = torch.mean(features[2][9], dim=1).view(1, -1)
#         sentence_emb_10 = torch.mean(features[2][10], dim=1).view(1, -1)
#         sentence_emb_11 = torch.mean(features[2][11], dim=1).view(1, -1)
#         sentence_emb_12 = torch.mean(features[2][12], dim=1).view(1, -1) #layer 12

#         sub_layers_initial = torch.stack((sentence_emb_1, sentence_emb_2, sentence_emb_3, sentence_emb_4), dim= 1).reshape(1,4,1,768)  #add batch dimension
#         sub_layers_middle = torch.stack((sentence_emb_5, sentence_emb_6, sentence_emb_7, sentence_emb_8), dim= 1).reshape(1,4,1,768)
#         sub_layers_last = torch.stack((sentence_emb_9, sentence_emb_10, sentence_emb_11, sentence_emb_12), dim= 1).reshape(1,4,1,768)
#         if i < 5000:
            
#             batch_initial_val[i,:] = sub_layers_initial
#             batch_middle_val[i,:] = sub_layers_middle
#             batch_last_val[i,:] = sub_layers_last
#             y_val.append(train_txt.label.iloc[i])
#         else:
#             batch_initial[index,:] = sub_layers_initial
#             batch_middle[index,:] = sub_layers_middle
#             batch_last[index,:] = sub_layers_last
        
#             y_target.append(train_txt.label.iloc[i])
#             index += 1
            
# ground_val = torch.tensor(y_val, dtype = torch.float)             
# ground_truth = torch.tensor(y_target, dtype = torch.float)  

In [None]:
# Buffers holding, per tweet, four token-averaged hidden layers for each layer group.
n_docs = len(input_ids)
batch_initial = torch.zeros((n_docs, 4, 1, 768))
batch_middle = torch.zeros((n_docs, 4, 1, 768))
batch_last = torch.zeros((n_docs, 4, 1, 768))
y_target = []

with torch.no_grad():
    for row, token_ids in enumerate(tqdm(input_ids)):
        features = bertweet(token_ids)
        hidden_states = features[2]

        # Average each of layers 1..12 over the token axis, giving one 1 x 768 row per layer.
        pooled = [
            hidden_states[layer].mean(dim=1).view(1, -1)
            for layer in range(1, 13)
        ]

        # Group the layers as 1-4, 5-8 and 9-12; the reshape adds the batch dimension.
        sub_layers_initial = torch.stack(pooled[0:4], dim=1).reshape(1, 4, 1, 768)
        sub_layers_middle = torch.stack(pooled[4:8], dim=1).reshape(1, 4, 1, 768)
        sub_layers_last = torch.stack(pooled[8:12], dim=1).reshape(1, 4, 1, 768)

        batch_initial[row, :] = sub_layers_initial
        batch_middle[row, :] = sub_layers_middle
        batch_last[row, :] = sub_layers_last

        y_target.append(train_txt.label.iloc[row])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save the three layer-group tensors and the labels for the training set.
for tensor_to_save, target_path in (
    (batch_initial, '../data/new_approach/train/irony/init_layer_semtask.pt'),
    (batch_middle, '../data/new_approach/train/irony/middle_layer_semtask.pt'),
    (batch_last, '../data/new_approach/train/irony/last_layer_semtask.pt'),
    (ground_truth, '../data/new_approach/train/irony/y_train_semtask.pt'),
):
    torch.save(tensor_to_save.float().clone(), target_path)

In [None]:
# Disabled: `batch_initial_val`, `batch_middle_val`, `batch_last_val` and `ground_val` are
# only created by the commented-out train/validation extraction cell, so on a fresh kernel
# this cell raised NameError. Re-enable it together with that split.
# torch.save(batch_initial_val.float().clone(), '../data/new_approach/train/irony_validation/init_layer.pt')
# torch.save(batch_middle_val.float().clone(), '../data/new_approach/train/irony_validation/middle_layer.pt')
# torch.save(batch_last_val.float().clone(), '../data/new_approach/train/irony_validation/last_layer.pt')
# torch.save(ground_val.float().clone(), '../data/new_approach/train/irony_validation/y_train.pt')

## Extract POS tags and embedding layers from BERTweet (test)

In [None]:
# Load the tab-separated SemEval-2018 Task 3A gold test file and rename its text and
# label columns to the names used for the training frame.
sem = pd.read_csv(
    '../data/SemEval2018-Task3/datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt',
    sep='\t',
).rename(columns={'Tweet text': 'text', 'Label': 'label'})

In [None]:
# Configure the project preprocessor for the test tweets with the same options as training.
test_txt_file_sem = FeaturesText.preprocessing_text(sem,remove_hashtags=True, remove_mentions=True, lowercase=True, arktweet_pos=True)

In [None]:
# Clean the test tweets and print the elapsed wall-clock seconds.
start = time.time()
test_txt_se = test_txt_file_sem.get_clean_df()
end = time.time()
print(end - start)

In [None]:
# Convert the test POS sequences with the tokenizer fitted on the training POS tags.
# Fix: the original read `test_txt_ril`, which is never defined in this notebook; the
# cleaned test frame is `test_txt_se`.
sequences_pos = tokenizer.texts_to_sequences(test_txt_se['pos'].astype(str))
# Pad or cut at the end to 30 entries, matching the training POS tensor.
data_pos = pad_sequences(sequences_pos, maxlen=30, padding='post', truncating='post')
# Insert a singleton axis at position 1 and store the result as a float tensor.
pos_tensor = torch.unsqueeze(torch.tensor(data_pos, dtype=torch.float), 1)
torch.save(pos_tensor.float().clone(), '../data/new_approach/test/irony/pos_tensor_sem.pt')

In [None]:
# Encode every test tweet into a tensor with a leading batch axis of 1.
# Truncate to 60 tokens exactly as the training encode does: without truncation the test
# inputs were processed differently from training, and a very long tweet could exceed the
# model's maximum sequence length.
input_ids = [
    torch.tensor([tokenizer_bert.encode(tweet, truncation=True, max_length=60)])
    for tweet in test_txt_se.text
]

In [None]:
# One 1 x 768 slot per test tweet.
batch_sentence = torch.zeros((len(input_ids), 1, 768))
y_target = []
with torch.no_grad():
    for i in tqdm(range(len(input_ids))):

        features = bertweet(input_ids[i])
        # Output element 1 is stored as the 1 x 768 sentence embedding of the tweet.
        batch_sentence[i, :] = features[1]
        # Fix: labels must come from the test frame. The original read `train_txt`, which
        # paired test embeddings with training labels.
        y_target.append(test_txt_se.label.iloc[i])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save the test sentence embeddings as a float tensor.
torch.save(batch_sentence.float().clone(), '../data/new_approach/test/irony/sentence_layer_sem.pt')

In [None]:
# Buffers holding, per test tweet, four token-averaged hidden layers for each layer group.
n_docs = len(input_ids)
batch_initial = torch.zeros((n_docs, 4, 1, 768))
batch_middle = torch.zeros((n_docs, 4, 1, 768))
batch_last = torch.zeros((n_docs, 4, 1, 768))
y_target = []

with torch.no_grad():
    for row, token_ids in enumerate(tqdm(input_ids)):
        features = bertweet(token_ids)
        hidden_states = features[2]

        # Average each of layers 1..12 over the token axis, giving one 1 x 768 row per layer.
        pooled = [
            hidden_states[layer].mean(dim=1).view(1, -1)
            for layer in range(1, 13)
        ]

        # Group the layers as 1-4, 5-8 and 9-12; the reshape adds the batch dimension.
        sub_layers_initial = torch.stack(pooled[0:4], dim=1).reshape(1, 4, 1, 768)
        sub_layers_middle = torch.stack(pooled[4:8], dim=1).reshape(1, 4, 1, 768)
        sub_layers_last = torch.stack(pooled[8:12], dim=1).reshape(1, 4, 1, 768)

        batch_initial[row, :] = sub_layers_initial
        batch_middle[row, :] = sub_layers_middle
        batch_last[row, :] = sub_layers_last

        y_target.append(test_txt_se.label.iloc[row])

ground_truth = torch.tensor(y_target, dtype=torch.float)

In [None]:
# Save the three layer-group tensors and the labels for the test set.
for tensor_to_save, target_path in (
    (batch_initial, '../data/new_approach/test/irony/init_layer_sem.pt'),
    (batch_middle, '../data/new_approach/test/irony/middle_layer_sem.pt'),
    (batch_last, '../data/new_approach/test/irony/last_layer_sem.pt'),
    (ground_truth, '../data/new_approach/test/irony/y_sem.pt'),
):
    torch.save(tensor_to_save.float().clone(), target_path)