In [None]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import tensorflow as tf 
import re
import textattack as ta 
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter

In [None]:
# Load the training CSV; later cells read its 'text' and 'label' columns.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data = pd.read_csv('D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/data/final_training_semeval.csv')

In [None]:
# Strip hashtags: '#' followed by a run of characters other than whitespace or ':'.
# regex=True is required: since pandas 2.0, Series.str.replace treats the pattern
# as a literal string by default, which would leave every hashtag in place.
data['text'] = data['text'].str.replace(r'#([^\s:]+)', '', regex=True)

In [None]:
# Collect whatever '#...' tokens remain in each text into a list column.
# Raw string: '\w' inside a plain literal is an invalid escape sequence.
data['Findall(name)'] = data["text"].str.findall(r'(#\w*)')

In [None]:
# Display the rows whose list of found hashtags is empty.
data[data['Findall(name)'].map(len) == 0]

In [None]:
def remove_link(x):
    """Delete matches of the line-leading URL pattern from ``x``.

    With ``re.MULTILINE`` the pattern is tried at the start of every line. It
    matches ``http://`` or ``https://`` followed by exactly one character and
    then a single carriage-return or line-feed character.

    NOTE(review): the pattern looks like a line-removal regex whose quantifiers
    went missing; as written it only matches a one-character URL at the end of
    a line — confirm the intent before changing it.
    """
    return re.sub(r'^https?:\/\/.[\r\n]', '', x, flags=re.MULTILINE)

#removes other link 
def remove_link2(x):
    """Return ``x`` with every ``http`` + non-whitespace run removed.

    Surrounding whitespace is left untouched, so a removed URL can leave two
    adjacent spaces behind.
    """
    return re.sub(r'http\S+', '', x)
# Strip URLs with both helpers, then collapse every whitespace run to one space.
data['text'] = data['text'].apply(remove_link)
data['text'] = data['text'].apply(remove_link2)
# Raw string: '\s' inside a plain literal is an invalid escape sequence.
data['text'] = data['text'].replace(r'\s+', ' ', regex=True)
# Keep only texts that split into more than 4 space-separated pieces.
data['list'] = data.text.apply(lambda x: x.split(' '))
data['len_list'] = data.list.str.len()
data = data[data.len_list > 4]

In [None]:
# Import pre-build vocabulary 
from deepmoji.global_variables import get_vocabulary
from deepmoji.sentence_tokenizer import SentenceTokenizer

# Tokenize the cleaned training texts; `tokens` is what the fine-tuning split consumes.
# NOTE(review): the second argument is 50 here, whereas every other SentenceTokenizer
# call in this notebook passes maxlen (20), the value the models are also built with —
# confirm the mismatch is intended.
st = SentenceTokenizer(get_vocabulary(), 50)
test_sentences = data.text
tokens, infos, stats = st.tokenize_sentences(test_sentences)

# print(tokens)
# print(infos)
# print(stats)

### Extract embeddings

In [None]:
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, \
    get_vocabulary
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.sentence_tokenizer import SentenceTokenizer

# Encode every cleaned training text with the DeepMoji feature-encoding model.
TEST_SENTENCES =  data.text.to_list()

maxlen = 20
# NOTE(review): batch_size is assigned but not passed to any call in this cell.
batch_size = 16

# print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
st = SentenceTokenizer(get_vocabulary(), maxlen, ignore_sentences_with_only_custom=False)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
# NOTE(review): `path` is not used below; the model is built from PRETRAINED_PATH.
path = "D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/Code/DeepMoji/model/weights/deepmoji-checkpoint-cd2cb10d-83c1-438f-aa12-88a2eab3cdc1.hdf5"
# print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH) #PRETRAINED_PATH
model.summary()

# print('Encoding texts..')
# One encoding row per tokenized sentence; saved to disk in the following cells.
encoding = model.predict(tokenized)

# print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
# print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.

In [None]:
# Persist the training-set encodings (np.save appends '.npy' to the file name).
np.save(r'D:\Data_Science_all\MSC_2_anno\Tesi_Irony_Sarcasm\Code\Deep_moji_feature\train\irony\sentence_emoji_train_semeval', encoding)

In [None]:
# Persist the matching training labels as a plain NumPy array.
np.save(r'D:\Data_Science_all\MSC_2_anno\Tesi_Irony_Sarcasm\Code\Deep_moji_feature\train\irony\y_emoji_train_semeval', np.array(data.label))

### Fine tuning

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the tokenized training data as a test split, stratified on the label.
# NOTE(review): assumes `tokens` has one row per row of `data` — confirm the tokenizer
# did not drop any sentence.
X_train, X_test, y_train, y_test = train_test_split(tokens,data.label, test_size=0.05, stratify = data.label,  random_state=1)

# Carve a further 5% of the remaining training rows out as the validation split.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify = y_train, test_size=0.05, random_state=1) 

In [None]:
import tensorflow as tf
# Show the GPU devices TensorFlow can see (empty list when none is available).
tf.config.list_physical_devices('GPU')

In [None]:
  
"""Finetuning example.
Trains the DeepMoji model on the SemEval irony training set, using the 'chain-thaw'
finetuning method and the accuracy metric. See the blog post at
https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0
for more information. Note that results may differ a bit due to slight
changes in preprocessing and train/val/test split.
The 'chain-thaw' method does the following:
0) Load all weights except for the softmax layer. Extend the embedding layer if
   necessary, initialising the new weights with random values.
1) Freeze every layer except the last (softmax) layer and train it.
2) Freeze every layer except the first layer and train it.
3) Freeze every layer except the second etc., until the second last layer.
4) Unfreeze all layers and train entire model.
"""

from deepmoji.finetuning import (
    load_benchmark,
    finetune,sampling_generator)
from deepmoji.global_variables import PRETRAINED_PATH, get_vocabulary
from deepmoji.model_def import deepmoji_transfer

In [None]:
# Model hyper-parameters shared with the finetune call below.
nb_classes = 2
maxlen = 20
# Build the transfer model from PRETRAINED_PATH with the dropout rates given here.
# NOTE(review): the example this cell was adapted from mentions extending the
# embedding layer for added vocabulary; no such argument is passed here.
model = deepmoji_transfer(nb_classes, maxlen, PRETRAINED_PATH, embed_dropout_rate=0.8, final_dropout_rate=0.7)
model.summary()

In [None]:
# Fine-tune with the 'chain-thaw' schedule on the train/val/test splits; one epoch
# covers the whole training split (epoch_size = number of training rows).
model, hist = finetune(model, [X_train, X_val, X_test], [y_train, y_val, y_test], nb_classes,
                          nb_epochs = 3,  method='chain-thaw', epoch_size = X_train.shape[0], batch_size = 24)
# Report the second value returned by finetune. It is bound to `hist`; no name
# `acc` exists in this notebook, so printing `acc` would raise NameError.
print('Finetune result: {}'.format(hist))


### Predictions

In [None]:
# Accuracy curves for the train and validation sets, saved as a 500-dpi PNG.
# NOTE(review): n_epochs, accuracy_train and accuracy_val are not assigned in any
# visible cell — they must be defined before this cell can run on a fresh kernel.
plt.rcParams["font.weight"] = "bold"
plt.rcParams["axes.labelweight"] = "bold"
plt.figure(figsize = (5,5))
plt.grid(linestyle = 'dashed')
plt.plot(n_epochs, accuracy_train, c = 'green')
plt.plot(n_epochs, accuracy_val)

# Two decimals on the y axis, whole numbers on the x axis.
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.2f}'))
plt.gca().xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.title('Accuracy Metric with respect to DeepMoji Model')
# Legend order follows the order of the two plot calls above.
plt.legend(['Train Set', 'Validation Set'])
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.savefig(r'D:\Data_Science_all\MSC_2_anno\Tesi_Irony_Sarcasm\thesis_latex\img\accuracy_deepmoji_irony_semeaval.png', dpi=500)

In [None]:

# Loss curves for the train and validation sets, saved as a 500-dpi PNG.
# NOTE(review): n_epochs, loss_train and loss_val are not assigned in any visible
# cell — they must be defined before this cell can run on a fresh kernel.
plt.rcParams["font.weight"] = "bold"
plt.rcParams["axes.labelweight"] = "bold"
plt.figure(figsize = (5,5))
plt.grid(linestyle = 'dashed')
plt.plot(n_epochs, loss_train, c = 'green')
plt.plot(n_epochs, loss_val)
# Two decimals on the y axis, whole numbers on the x axis.
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.2f}'))
plt.gca().xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.title('Cross Entropy Loss with respect to DeepMoji Model')
# Legend order follows the order of the two plot calls above.
plt.legend(['Train Set', 'Validation Set'])
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.savefig(r'D:\Data_Science_all\MSC_2_anno\Tesi_Irony_Sarcasm\thesis_latex\img\loss_deepmoji_irony_semeval.png', dpi=500)

In [None]:
from deepmoji.model_def import deepmoji_architecture
# Build a fresh architecture for evaluation; the literals 2 and 20 equal the
# nb_classes and maxlen values used when the transfer model was built.
final_model = deepmoji_architecture(2,20)

In [None]:
# Load a saved checkpoint into the evaluation model.
# NOTE(review): presumably the checkpoint written by the fine-tuning run — confirm.
final_model.load_weights("D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/Code/DeepMoji/model/weights/deepmoji-checkpoint-0afd84bb-7905-46c4-9f52-0e13ad6cb91e.hdf5")

In [None]:
# Load the tab-separated gold test file and give its columns the same
# 'text' / 'label' names the training frame uses.
sem = pd.read_csv(
    'D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/data/SemEval2018-Task3/datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt',
    sep='\t',
).rename(columns={'Tweet text': 'text', 'Label': 'label'})

In [None]:
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, \
    get_vocabulary
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.sentence_tokenizer import SentenceTokenizer

# Encode the gold test texts with the DeepMoji feature-encoding model.
TEST_SENTENCES = sem.text.to_list()
maxlen = 20
# NOTE(review): batch_size is assigned but not passed to any call in this cell.
batch_size = 16

# print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
st = SentenceTokenizer(get_vocabulary(), maxlen, ignore_sentences_with_only_custom=False)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

# print('Loading model from {}.'.format(PRETRAINED_PATH))
# Rebinds `model`, replacing the fine-tuned model object held under that name.
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)

# print('Encoding texts..')
encoding = model.predict(tokenized)

In [None]:
# Persist the test-set encodings and their labels (np.save appends '.npy').
np.save(r'D:\Data_Science_all\MSC_2_anno\Tesi_Irony_Sarcasm\Code\Deep_moji_feature\test\sentence_emoji_sem', encoding)
np.save(r'D:\Data_Science_all\MSC_2_anno\Tesi_Irony_Sarcasm\Code\Deep_moji_feature\test\y_emoji_sem', np.array(sem.label))

In [None]:
# Tokenize the gold test texts for the classifier; this rebinds `tokens`,
# which previously held the training tokens.
# NOTE(review): no visible cell applies the hashtag/URL cleaning to sem.text —
# confirm the test file is already preprocessed like the training data.
st = SentenceTokenizer(get_vocabulary(), maxlen, ignore_sentences_with_only_custom=False)
test_sentences = sem.text.to_numpy()

tokens, infos, stats = st.tokenize_sentences(test_sentences)

In [None]:
import numpy as np
# Gold labels of the test file; rebinds y_test, which earlier held the held-out split labels.
y_test = sem.label.values

In [None]:
# Sanity check: dimensions of the tokenized test array.
tokens.shape

In [None]:
import tensorflow as tf
# Per-class scores for every test sentence; argmax over the last axis gives the predicted label.
pred = final_model.predict(tokens)

In [None]:
from sklearn.metrics import classification_report, f1_score
print('DeepMoji')
# scikit-learn metrics take (y_true, y_pred). Ground truth goes first so that
# precision, recall and support are computed against the gold labels.
print(classification_report(y_test, pred.argmax(axis=-1)))

In [None]:
# F1 of the positive class, with ground truth first per the (y_true, y_pred) convention.
f1_score(y_test, pred.argmax(axis=-1))

In [None]:
def normalize_lab(x):
    """Map each label in ``x`` to its output string.

    A label equal to 0 becomes ``'1:1'``; any other value becomes ``'2:0'``.
    Returns a list with one string per input element, in input order.
    """
    return ['1:1' if label == 0 else '2:0' for label in x]

In [None]:
def normalize_lab(x):
    """Translate labels to output strings: 0 -> ``'1:1'``, anything else -> ``'2:0'``.

    Returns a new list with one string per element of ``x``, in order.
    NOTE(review): the preceding cell defines a function with this same name.
    """
    return ['1:1' if label == 0 else '2:0' for label in x]

def mark_error(actual, predicted):
    """Flag disagreements between two label sequences.

    The inputs are paired with ``zip``, so the shorter one sets the length.
    Each position yields ``'+'`` when the two labels differ and ``np.nan``
    when they are equal. Returns the flags as a list.
    """
    return ['+' if truth != guess else np.nan
            for truth, guess in zip(actual, predicted)]

def get_proba_distrib(clf_proba):
    """Render two-column class scores as strings, starring the larger one.

    Reads columns 0 and 1 of ``clf_proba`` and rounds each value to 5 decimals.
    The column-0 string gets a ``'*'`` prefix when its value is strictly larger;
    otherwise (ties included) the column-1 string is starred. Returns a string
    array with one ``[col0, col1]`` row per input row.
    """
    rows = []
    for p0, p1 in zip(clf_proba[:, 0], clf_proba[:, 1]):
        left, right = str(p0.round(5)), str(p1.round(5))
        if p0 > p1:
            left = '*' + left
        else:
            right = '*' + right
        rows.append([left, right])
    return np.array(rows)

In [None]:
def get_proba_distrib(clf_proba):
    """Format columns 0 and 1 of ``clf_proba`` as strings, marking the winner with ``'*'``.

    Values are rounded to 5 decimals. Column 0 is starred only when strictly
    larger than column 1; in every other case column 1 is starred. Returns a
    string array with one two-element row per input row.
    NOTE(review): the preceding cell defines a function with this same name.
    """
    rows = []
    for p0, p1 in zip(clf_proba[:, 0], clf_proba[:, 1]):
        left, right = str(p0.round(5)), str(p1.round(5))
        if p0 > p1:
            left = '*' + left
        else:
            right = '*' + right
        rows.append([left, right])
    return np.array(rows)

def get_outpupt_bma(clf, x, ground_truth):
    """Build the per-instance prediction table for ``clf`` on ``x``.

    Parameters
    ----------
    clf : object whose ``predict(x)`` returns an array with one row of class
        scores per instance; columns 0 and 1 are read.
    x : model input; must support ``len``.
    ground_truth : iterable of true labels, one per instance.

    Returns
    -------
    pandas.DataFrame
        Columns ``inst#`` (cyclic instance number), ``actual`` and
        ``predicted`` (labels mapped by ``normalize_lab``), ``error`` (``'+'``
        on a mismatch, NaN otherwise), ``distribution`` (column-0 score
        string) and ``''`` (column-1 score string).
    """
    # One forward pass serves both the hard labels and the score columns.
    predict_proba = clf.predict(x)
    pred = predict_proba.argmax(axis=-1)

    actual = normalize_lab(ground_truth)
    predicted = normalize_lab(pred)
    error = mark_error(actual, predicted)

    # Instance numbers run 1..block_size and then restart at 1.
    n_rows = len(x)
    block_size = n_rows // 10 + 1
    instanc = [i % block_size + 1 for i in range(n_rows)]

    distribution = get_proba_distrib(predict_proba)

    final_df = pd.DataFrame(instanc, columns=['inst#'])
    final_df['actual'] = actual
    final_df['predicted'] = predicted
    final_df['error'] = error
    final_df['distribution'] = distribution[:, 0]
    # The second score column deliberately keeps an empty header.
    final_df[''] = distribution[:, 1]

    return final_df

In [None]:
# Build the prediction table for the evaluation model on the tokenized gold test set.
df = get_outpupt_bma(final_model, tokens, sem.label)

In [None]:
# Write the prediction table to CSV without the DataFrame index.
df.to_csv('D:/Data_Science_all/MSC_2_anno/Tesi_Irony_Sarcasm/Code/BMA/results_semeval/input/prediction_file/deepmoji_chain.csv', index = False)