In [None]:
import FeaturesText
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from transformers import AutoModel, AutoTokenizer 
import torch
import pickle 
import numpy as np
import pandas as pd 

In [None]:
# Load the sarcasm training tweets; the path is relative to the notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer='../data/final_sarc_trainingset_twitter.csv',
)

#### Cleaning Training Set

In [None]:
# Configure the project's tweet pre-processor for the training frame.
txt_file = FeaturesText.preprocessing_text(
    data,
    remove_hashtags=True,
    remove_mentions=True,
    lowercase=True,
    arktweet_pos=True,
)

In [None]:
# Run the cleaning step and print how many seconds it took.
t0 = time.time()
train_txt = txt_file.get_clean_df()
elapsed = time.time() - t0
print(elapsed)

In [None]:
# Keep only the rows whose `pos` value does not stringify to '[]'.
has_pos_tags = train_txt['pos'].astype(str) != '[]'
train_txt = train_txt[has_pos_tags]

In [None]:
# Build the feature extractor on the cleaned training frame.
final_train_txt = FeaturesText.ExtractFeatures(
    train_txt,
    'other',
    svd_transform=False,
)

### Features extraction

In [None]:
# Fit the hand-crafted feature extractors on the training set and unpack
# the six feature groups in the order the extractor returns them.
(
    pos,
    punctuation_features,
    emoji_features,
    onomato_features,
    initialism_features,
    polarity_subj_features,
) = final_train_txt.get_all_features_train(
    ngram_range=(1, 1),
    dimensionality=100,
)

In [None]:
# Collect every training feature group, plus the labels, under short keys.
train_features = dict(
    pos=pos,
    polarity=polarity_subj_features,
    emoji=emoji_features,
    punc=punctuation_features,
    onom=onomato_features,
    init=initialism_features,
    label=np.asarray(train_txt.label.tolist()),
)

### BERTweet embeddings

In [None]:
# Pretrained BERTweet encoder; output_hidden_states=True makes the per-layer
# hidden states available in the model output.
bertweet = AutoModel.from_pretrained(
    "vinai/bertweet-base",
    output_hidden_states=True,
)

# NOTE(review): apart from `normalization`, these keyword arguments look like
# encode-time options; confirm whether from_pretrained actually honours them.
# The tokenizer.encode(...) calls in later cells pass their own
# max_length/truncation values.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    add_special_tokens=True,
    max_length=70,
    pad_to_max_length=True,
    normalization=True,
    truncation=True,
    padding=True,
    return_attention_mask=True,
)

In [None]:
# One tensor of token ids per cleaned training tweet, truncated to 50 tokens.
# The extra list wrapper adds a leading batch axis of size 1.
input_ids = [
    torch.tensor([tokenizer.encode(tweet, max_length=50, truncation=True)])
    for tweet in train_txt.text
]

In [None]:
from tqdm import tqdm

In [None]:
# Run every training tweet through BERTweet with gradient tracking disabled
# and keep element 1 of the model output as a NumPy array
# (the sentence embedding, 1 x 768 per document).
with torch.no_grad():
    features_list = [np.array(bertweet(ids)[1]) for ids in tqdm(input_ids)]

In [None]:
# Stack the per-document 1 x 768 embeddings into one (n_documents, 768) matrix.
# Concatenating on axis 0 always keeps the sample axis; a bare .squeeze() would
# also collapse it when there is exactly one document, yielding a 1-D array
# that the scaler/PCA steps cannot consume.
bert_emb = np.concatenate(features_list, axis=0)

In [None]:
import joblib
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Rescale every embedding dimension with a min-max scaler fitted on the
# training embeddings.
scaler = MinMaxScaler()
data_rescaled = scaler.fit_transform(bert_emb)

# A float n_components asks PCA to keep enough components to explain that
# fraction (95%) of the variance.
pca = PCA(n_components=0.95)
pca.fit(data_rescaled)
reduced = pca.transform(data_rescaled)

# Persist both fitted transformers so the test sets can reuse them unchanged.
joblib.dump(scaler, 'scaler_embed_sarcasm.pkl')
joblib.dump(pca, 'pca_embedding_sarcasm.pkl')

### Save features

In [None]:
# Attach the PCA-reduced BERTweet embeddings to the training feature dict.
train_features.update(bert_embed=reduced)

In [None]:
import pickle

# Serialise the complete training feature dict for the modelling notebooks.
with open('../data/features_training_sarc_twitter.p', 'wb') as out_file:
    pickle.dump(
        train_features,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### Import test set

In [None]:
# Riloff test set: a regular CSV with a header row.
riloff = pd.read_csv(
    '../data/Riloff_twitter/riloff_sarc_train_test.csv',
)
# Ghosh test set: tab-separated with no header row, so columns are numbered 0..n.
ghosh = pd.read_csv(
    '../data/Ghosh_sarc_tweet/Test_v1.txt',
    sep='\t',
    header=None,
)

In [None]:
# Give the header-less Ghosh columns meaningful names.
ghosh = ghosh.rename(columns={0: 'training', 1: 'label', 2: 'text'})

In [None]:
# Same pre-processing options as the training set, applied to each test set.
test_txt_file_riloff = FeaturesText.preprocessing_text(
    riloff,
    remove_hashtags=True,
    remove_mentions=True,
    lowercase=True,
    arktweet_pos=True,
)
test_txt_file_ghosh = FeaturesText.preprocessing_text(
    ghosh,
    remove_hashtags=True,
    remove_mentions=True,
    lowercase=True,
    arktweet_pos=True,
)

In [None]:
# Clean the Riloff test set and print how many seconds it took.
t0 = time.time()
test_txt_ril = test_txt_file_riloff.get_clean_df()
elapsed = time.time() - t0
print(elapsed)

In [None]:
# Clean the Ghosh test set and print how many seconds it took.
t0 = time.time()
test_txt_ghos = test_txt_file_ghosh.get_clean_df()
elapsed = time.time() - t0
print(elapsed)

In [None]:
# Strip newline characters from the cleaned Riloff tweets (literal match, not a regex).
test_txt_ril['text'] = test_txt_ril['text'].str.replace('\n', '', regex=False)

### BERTweet encoder-layer output embeddings

In [None]:
# One tensor of token ids per cleaned Ghosh tweet, truncated to 70 tokens.
# The extra list wrapper adds a leading batch axis of size 1.
input_ids = [
    torch.tensor([tokenizer.encode(tweet, truncation=True, max_length=70)])
    for tweet in test_txt_ghos.text
]

In [None]:
# Build, for every document in `input_ids`, the mean-pooled hidden state of
# encoder layers 1-12, grouped into three stacks of four layers each
# (initial: 1-4, middle: 5-8, last: 9-12).
#
# NOTE(review): the labels here come from test_txt_ghos, yet the cell that
# saves these tensors writes them to *_riloff.pt files - confirm which
# dataset is intended.
n_docs = len(input_ids)
y_test = []
batch_initial = torch.zeros((n_docs, 4, 1, 768))
batch_middle = torch.zeros((n_docs, 4, 1, 768))
batch_last = torch.zeros((n_docs, 4, 1, 768))

with torch.no_grad():
    for i in tqdm(range(n_docs)):
        # Full model output; element 2 holds the per-layer hidden states.
        features = bertweet(input_ids[i])
        hidden_states = features[2]

        # Average over dim 1 and flatten to a single row for each of
        # layers 1..12 (index 0 of the hidden states is not used).
        layer_means = [
            torch.mean(hidden_states[layer], dim=1).view(1, -1)
            for layer in range(1, 13)
        ]

        # B x C x H x W, 1 x 4 x 1 x 768
        sub_layers_initial = torch.stack(layer_means[0:4], dim=1).reshape(1, 4, 1, 768)
        sub_layers_middle = torch.stack(layer_means[4:8], dim=1).reshape(1, 4, 1, 768)
        sub_layers_last = torch.stack(layer_means[8:12], dim=1).reshape(1, 4, 1, 768)

        batch_initial[i, :] = sub_layers_initial
        batch_middle[i, :] = sub_layers_middle
        batch_last[i, :] = sub_layers_last

        y_test.append(test_txt_ghos.label.iloc[i])

ground_test = torch.tensor(y_test, dtype=torch.float)

In [None]:
# Persist the three layer-group tensors and the labels as float tensors.
# NOTE(review): in the visible notebook these tensors are computed from
# test_txt_ghos, but the file names say "riloff" - confirm before relying
# on these files.
tensors_to_save = (
    (batch_initial, '../data/new_approach/test/sarcasm/init_layer_riloff.pt'),
    (batch_middle, '../data/new_approach/test/sarcasm/middle_layer_riloff.pt'),
    (batch_last, '../data/new_approach/test/sarcasm/last_layer_riloff.pt'),
    (ground_test, '../data/new_approach/test/sarcasm/y_riloff.pt'),
)
for tensor, target_path in tensors_to_save:
    torch.save(tensor.float().clone(), target_path)

In [None]:
# Apply the extractors fitted on the training set to the cleaned Riloff test set.
(
    pos_rilof,
    punctuation_features_rilof,
    emoji_features_rilof,
    onomato_features_rilof,
    initialism_features_rilof,
    polarity_subj_features_rilof,
) = FeaturesText.ExtractFeatures.get_all_features_test(
    final_train_txt,
    test_set=test_txt_ril,
)

### Extract other features

In [None]:
# Apply the extractors fitted on the training set to the cleaned Ghosh test set.
(
    pos_ghosh,
    punctuation_features_ghosh,
    emoji_features_ghosh,
    onomato_features_ghosh,
    initialism_features_ghosh,
    polarity_subj_features_ghosh,
) = FeaturesText.ExtractFeatures.get_all_features_test(
    final_train_txt,
    test_set=test_txt_ghos,
)

In [None]:
# Bundle the hand-crafted test features with their labels, using the same
# keys as the training feature dict.
# NOTE(review): labels are read from the raw frames (riloff / ghosh) while the
# features come from the cleaned frames - verify that cleaning never drops or
# reorders rows, otherwise labels and features will not line up.
test_features_ril = dict(
    pos=pos_rilof,
    polarity=polarity_subj_features_rilof,
    emoji=emoji_features_rilof,
    punc=punctuation_features_rilof,
    onom=onomato_features_rilof,
    init=initialism_features_rilof,
    label=np.asarray(riloff.labels.tolist()),
)

test_features_ghosh = dict(
    pos=pos_ghosh,
    polarity=polarity_subj_features_ghosh,
    emoji=emoji_features_ghosh,
    punc=punctuation_features_ghosh,
    onom=onomato_features_ghosh,
    init=initialism_features_ghosh,
    label=np.asarray(ghosh.label.tolist()),
)

In [None]:
# Token ids for the test tweets, truncated to 50 tokens like the training
# embeddings. The extra list wrapper adds a leading batch axis of size 1.
#
# The tweets are read from the CLEANED frames (test_txt_ril / test_txt_ghos),
# not the raw CSV frames: the training embeddings - and therefore the fitted
# scaler and PCA - are built from the cleaned train_txt.text, and the
# hand-crafted test features are extracted from these same cleaned frames.
# Encoding the raw text here would feed the scaler/PCA differently
# pre-processed input than they were fitted on.
input_ids_ril = [
    torch.tensor([tokenizer.encode(tweet, max_length=50, truncation=True)])
    for tweet in test_txt_ril.text
]
input_ids_ghosh = [
    torch.tensor([tokenizer.encode(tweet, max_length=50, truncation=True)])
    for tweet in test_txt_ghos.text
]

In [None]:
# Run both test sets through BERTweet with gradient tracking disabled and keep
# element 1 of the model output as a NumPy array
# (the sentence embedding, 1 x 768 per document).
with torch.no_grad():
    features_list_ril = [np.array(bertweet(ids)[1]) for ids in tqdm(input_ids_ril)]

with torch.no_grad():
    features_list_ghosh = [np.array(bertweet(ids)[1]) for ids in tqdm(input_ids_ghosh)]

In [None]:
# Stack the per-document 1 x 768 embeddings into (n_documents, 768) matrices.
# Concatenating on axis 0 always keeps the sample axis; a bare .squeeze() would
# also collapse it for a single-document test set, and the scaler would then
# reject the resulting 1-D array.
bert_emb_ril = np.concatenate(features_list_ril, axis=0)
bert_emb_ghosh = np.concatenate(features_list_ghosh, axis=0)

In [None]:
# Reload the scaler and PCA that were fitted on the training embeddings.
# joblib files are pickles: only load files this notebook (or a trusted
# source) produced.
import pickle as pk
import joblib

standar_reload = joblib.load("scaler_embed_sarcasm.pkl")
pca_reload = joblib.load("pca_embedding_sarcasm.pkl")

# Usage: pca_reload.transform(X), where X has already been passed through
# standar_reload.transform.

In [None]:
# Rescale the test embeddings with the scaler fitted on the training set.
std_ril, std_ghosh = (
    standar_reload.transform(embeddings)
    for embeddings in (bert_emb_ril, bert_emb_ghosh)
)

In [None]:
# Project the rescaled test embeddings onto the principal components learned
# from the training set.
reduced_ril, reduced_ghosh = (
    pca_reload.transform(scaled)
    for scaled in (std_ril, std_ghosh)
)

In [None]:
# Attach the PCA-reduced BERTweet embeddings to each test feature dict.
test_features_ril.update(bert_embed=reduced_ril)
test_features_ghosh.update(bert_embed=reduced_ghosh)

In [None]:
# Serialise both test feature dicts next to the training features.
test_outputs = (
    ('../data/riloff_test_sarc.p', test_features_ril),
    ('../data/ghosh_test_sarc.p', test_features_ghosh),
)
for target_path, payload in test_outputs:
    with open(target_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)