# Bibliotecas e leitura dos dados

In [None]:
import preprocessing as pp
import load_files as lf
import nlp_algorithms as nlp

import pandas as pd
import numpy as np
import time

from sentence_transformers import SentenceTransformer

In [None]:
# Never truncate cell text when displaying frames. ``None`` is the supported
# "no limit" value; the legacy ``-1`` was deprecated in pandas 1.0 and is
# rejected by later releases.
pd.set_option('display.max_colwidth', None)

# Training pairs.
train_v1 = pd.read_csv('dados/train v0.1.csv')

# Keep only the pairs where both URLs are flagged as English.
train_v1_enen = train_v1[(train_v1['url1_lang'] == 'en') & (train_v1['url2_lang'] == 'en')]

train_v1_enen.head()

In [None]:
# Evaluation pairs.
test = pd.read_csv('dados/final_evaluation_data.csv')

# Restrict to the rows whose two URLs are both flagged as English.
test_enen = test.loc[
    test['url1_lang'].eq('en') & test['url2_lang'].eq('en')
]

test_enen.head()

In [None]:
# Show the (rows, columns) of the training pairs where both URLs are English.
train_v1_enen.shape

In [None]:
# Show the (rows, columns) of the test pairs where both URLs are English.
test_enen.shape

## Leitura das URLs baixadas para cada dataset (train e test)

In [None]:
# Directory holding the downloaded documents for the training pairs.
data_path = 'dados/train v0.1/'

lista_docs = []   # (pair_id, doc1 text, doc2 text, Overall) for usable pairs
lista_error = []  # pair_ids whose documents could not be loaded
lista_vazio = []  # pair_ids where at least one document text is empty
values = train_v1_enen[['pair_id', 'Overall']]

# The row gets its own name so it no longer shadows the ``values`` frame
# being iterated.
for index, row in values.iterrows():
    pair_id = row['pair_id']

    try:
        json_pair = lf.get_json_document_pair(data_path, pair_id)
        text_doc1 = json_pair[0]['text']
        text_doc2 = json_pair[1]['text']
        both_non_empty = (len(text_doc1) > 0) and (len(text_doc2) > 0)
    except Exception:
        # Best-effort load: any failure while reading the pair is recorded and
        # the pair is skipped. ``Exception`` (rather than a bare ``except``)
        # keeps Ctrl-C / kernel interrupts working during the loop.
        lista_error.append(pair_id)
        continue

    if both_non_empty:
        lista_docs.append((pair_id, text_doc1, text_doc2, row['Overall']))
    else:
        lista_vazio.append(pair_id)

In [None]:
# Tabulate the successfully loaded training pairs.
df_text = pd.DataFrame(
    data=lista_docs,
    columns=['pair_id', 'doc1', 'doc2', 'Overall'],
)

In [None]:
# Show how many training pairs were loaded with non-empty text on both sides.
df_text.shape

In [None]:
# Directory holding the downloaded documents for the test pairs.
data_path = 'dados/output_dir_test_enen/'

lista_docs = []   # (pair_id, doc1 text, doc2 text, Overall) for usable pairs
lista_error = []  # pair_ids whose documents could not be loaded
lista_vazio = []  # pair_ids where at least one document text is empty
values = test_enen[['pair_id', 'Overall']]

# The row gets its own name so it no longer shadows the ``values`` frame
# being iterated.
for index, row in values.iterrows():
    pair_id = row['pair_id']

    try:
        json_pair = lf.get_json_document_pair(data_path, pair_id)
        text_doc1 = json_pair[0]['text']
        text_doc2 = json_pair[1]['text']
        both_non_empty = (len(text_doc1) > 0) and (len(text_doc2) > 0)
    except Exception:
        # Best-effort load: any failure while reading the pair is recorded and
        # the pair is skipped. ``Exception`` (rather than a bare ``except``)
        # keeps Ctrl-C / kernel interrupts working during the loop.
        lista_error.append(pair_id)
        continue

    if both_non_empty:
        lista_docs.append((pair_id, text_doc1, text_doc2, row['Overall']))
    else:
        lista_vazio.append(pair_id)

In [None]:
# Tabulate the successfully loaded test pairs.
df_text_test = pd.DataFrame(
    data=lista_docs,
    columns=['pair_id', 'doc1', 'doc2', 'Overall'],
)

In [None]:
# Show how many test pairs were loaded with non-empty text on both sides.
df_text_test.shape

## Pré-processamento

In [None]:
# Alternative configuration list (disabled):
# pre_processing_list = [
#     {"tokenization": True},
#     {"no_url": True, "basic_processing": True, "tokenization": True}
#     ]

# Active configuration: a single pre-processing setup.
pre_processing_list = [
    {
        "no_url": True,
        "basic_processing": True,
        "tokenization": True,
    },
]

# Run the same pre-processing over the training and the test documents.
df_pp = pp.pre_process_all(df_text, pre_processing_list)
df_pp_test = pp.pre_process_all(df_text_test, pre_processing_list)

# Algoritmos

In [None]:
# Start the results table from the test pairs' "pair_id" and "Overall" columns;
# later cells join each method's output onto it.
df_results = df_pp_test[["pair_id", "Overall"]]
df_results.head(1)

## BERT

In [None]:
# Display the pre-processing configuration currently in effect.
pre_processing_list

In [None]:
# Alternative model list (disabled):
#model_list = ['all-mpnet-base-v2', 'multi-qa-mpnet-base-dot-v1', 'all-distilroberta-v1', 'all-MiniLM-L12-v2', 'multi-qa-distilbert-cos-v1']

# sentence-transformers checkpoints evaluated by this cell (no fine-tuning).
model_list = [
    'all-MiniLM-L6-v2',
    'multi-qa-MiniLM-L6-cos-v1',
    'paraphrase-multilingual-mpnet-base-v2',
    'paraphrase-albert-small-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'paraphrase-MiniLM-L3-v2',
    'distiluse-base-multilingual-cased-v1',
    'distiluse-base-multilingual-cased-v2',
]

for model_name in model_list:
    model = SentenceTransformer(f'sentence-transformers/{model_name}')

    df_bert, time_bert = nlp.apply_bert(
        df_pp,
        df_pp_test,
        len(pre_processing_list),
        model,
        model_name,
        fine_tune=False,
    )

    # Accumulate this model's output in the shared results table.
    df_results = df_results.join(df_bert)

    # Persist the per-model output and timing tables.
    df_bert.to_csv(f'./resultados/{model_name}.csv', index=False)
    time_bert.to_csv(f'./resultados/{model_name}_time.csv', index=False)

# Correlation matrix over everything gathered so far.
df_results.corr()

In [None]:
# Checkpoint to run with fine_tune=True; uncomment exactly one alternative to switch.
#model_name = 'all-mpnet-base-v2'
model_name = 'multi-qa-mpnet-base-dot-v1'
#model_name = 'all-distilroberta-v1'
#model_name = 'all-MiniLM-L12-v2'
#model_name = 'multi-qa-distilbert-cos-v1'
#model_name = 'all-MiniLM-L6-v2'

model = SentenceTransformer(f'sentence-transformers/{model_name}')

df_bert, time_bert = nlp.apply_bert(
    df_pp,
    df_pp_test,
    len(pre_processing_list),
    model,
    model_name,
    fine_tune=True,
)

# Add this model's output to the shared results table.
df_results = df_results.join(df_bert)


In [None]:
# Persist the output and timing tables of the model selected above.
df_bert.to_csv(f'./resultados/{model_name}.csv', index=False)
time_bert.to_csv(f'./resultados/{model_name}_time.csv', index=False)

In [None]:
# Display the pairwise correlation matrix of the results table's columns.
df_results.corr()

## USE (Universal Sentence Encoder)

In [None]:
# Configuration for this section: same options as before, minus "tokenization".
pre_processing_list = [
    {
        "no_url": True,
        "basic_processing": True,
    },
]

# Re-run the pre-processing for both splits with the new configuration.
df_pp = pp.pre_process_all(df_text, pre_processing_list)
df_pp_test = pp.pre_process_all(df_text_test, pre_processing_list)

In [None]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
import re
import math
import scipy

# DAN encoder
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Transformer-based encoder (alternative, disabled)
#module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"

# Load the selected module from TensorFlow Hub. This rebinds the notebook-level
# name ``model``.
model = hub.load(module_url)
print(f"module {module_url} loaded")


def embed(input):
    """Return the result of calling the module-level ``model`` on *input*."""
    return model(input)

In [None]:
# Names of the columns holding the two documents of each pair.
doc1_name = 'doc1'
doc2_name = 'doc2'

# Keep only the "Overall" column and the two document columns of the test set.
sts_data = df_pp_test[['Overall', doc1_name, doc2_name]]#sts_dev

def run_sts_benchmark(batch):
    """Score each document pair in *batch* by embedding cosine similarity.

    Both document columns (selected through the module-level names
    ``doc1_name`` and ``doc2_name``) are passed through ``embed``, the
    embeddings are L2-normalised along axis 1, and the row-wise dot product of
    the two normalised embeddings is taken as the similarity.

    Args:
        batch: Table-like object indexable by ``doc1_name`` and ``doc2_name``;
            each selected column must provide ``.tolist()``.

    Returns:
        A ``(scores, runtime)`` tuple: the per-row similarities clipped to
        [-1.0, 1.0], and the elapsed seconds (``time.time()`` difference)
        covering the embedding and the similarity computation.
    """
    start_time = time.time()

    sts_encode1 = tf.nn.l2_normalize(embed(tf.constant(batch[doc1_name].tolist())), axis=1)
    sts_encode2 = tf.nn.l2_normalize(embed(tf.constant(batch[doc2_name].tolist())), axis=1)

    # Dot product of unit vectors == cosine similarity; the clip guards against
    # values drifting just outside [-1, 1].
    cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
    scores = tf.clip_by_value(cosine_similarities, -1.0, 1.0)

    # Alternative scoring (disabled): angular similarity instead of raw cosine.
    #clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
    #scores = 1.0 - tf.acos(clip_cosine_similarities) / math.pi

    runtime = time.time() - start_time
    return (scores, runtime)

# ``import scipy`` alone does not guarantee that the ``stats`` subpackage is
# loaded; import it explicitly so ``scipy.stats.pearsonr`` below always resolves.
import scipy.stats

# Reference values to correlate against.
dev_scores = sts_data['Overall'].tolist()

# Batched alternative (disabled):
#scores = []
#for batch in np.array_split(sts_data, 10):
#  (df_use, time_use) = run_sts_benchmark(batch)
#  scores.extend(df_use)

# Score the whole test set in a single call.
(df_use, time_use) = run_sts_benchmark(sts_data)
scores = df_use

# scores = predicted cosine similarity, dev_scores = "Overall" column.
pearson_correlation = scipy.stats.pearsonr(scores, dev_scores)
print('Pearson correlation coefficient = {0}\np-value = {1}'.format(
    pearson_correlation[0], pearson_correlation[1]))

In [None]:
# Turn the score tensor into a one-column frame named "USE".
df_use = pd.DataFrame(df_use.numpy(), columns=["USE"])

In [None]:
# Add the "USE" column to the shared results table (DataFrame.join aligns on the index).
df_results = df_results.join(df_use)

In [None]:
# Display the pairwise correlation matrix of the results table's columns.
df_results.corr()

In [None]:
# Wrap the measured runtime in a one-column ("time_USE") table so it can be saved as CSV.
# NOTE(review): the extra 0 row is presumably there to mirror the row layout of the
# other *_time.csv tables — confirm against the nlp helpers that produce them.
# NOTE(review): this rebinds ``time_use``, so re-running the cell wraps the frame again.
time_use = pd.DataFrame({"time_USE": [time_use,0]})
#pd.DataFrame(index={time_use: 0})

In [None]:
# Persist the USE scores and timing tables, without the index column.
df_use.to_csv('./resultados/USE.csv', index = False)
time_use.to_csv('./resultados/USE_time.csv', index = False)

In [None]:
# Display how many similarity scores were produced.
len(scores)

## tf-idf

In [None]:
#df_results = df_pp_test[["pair_id", "Overall"]]
#df_results.head(1)

In [None]:
# Configuration for this section: tokenization is enabled again.
pre_processing_list = [
    {
        "no_url": True,
        "basic_processing": True,
        "tokenization": True,
    },
]

# Re-run the pre-processing for both splits with this configuration.
df_pp = pp.pre_process_all(df_text, pre_processing_list)
df_pp_test = pp.pre_process_all(df_text_test, pre_processing_list)

# Only the pre-processed test documents are handed to the tf-idf helper.
df_tf_idf, time_tf_idf = nlp.apply_tf_idf(
    df_pp_test,
    len(pre_processing_list),
)

In [None]:
# Persist the tf-idf output and timing tables, without the index column.
df_tf_idf.to_csv('./resultados/tf_idf.csv', index = False)
time_tf_idf.to_csv('./resultados/tf_idf_time.csv', index = False)

In [None]:
# Add the tf-idf output to the shared results table (DataFrame.join aligns on the index).
df_results = df_results.join(df_tf_idf)

In [None]:
# Display the pairwise correlation matrix of the results table's columns.
df_results.corr()

## Salvando resultados da correlação e tempo

In [None]:
# Every method whose saved output is gathered into the final results table.
lista_metodos = [
    'all-mpnet-base-v2',
    'multi-qa-mpnet-base-dot-v1',
    'all-distilroberta-v1',
    'all-MiniLM-L12-v2',
    'multi-qa-distilbert-cos-v1',
    'all-MiniLM-L6-v2',
    'multi-qa-MiniLM-L6-cos-v1',
    'paraphrase-multilingual-mpnet-base-v2',
    'paraphrase-albert-small-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'paraphrase-MiniLM-L3-v2',
    'distiluse-base-multilingual-cased-v1',
    'distiluse-base-multilingual-cased-v2',
    'tf_idf',
    'USE',
]

# Rebuild the results table from scratch, then join each method's saved CSV onto it.
df_results = df_pp_test[["pair_id", "Overall"]]

for metodo in lista_metodos:
    df_metodo = pd.read_csv(f'resultados/{metodo}.csv')
    df_results = df_results.join(df_metodo)

df_results.corr()


In [None]:
# Correlation of every column with "Overall", in the table's column order.
resultados_padrao = df_results.corr()['Overall']

# Same values in ascending order. Derived from the series above instead of
# computing the full correlation matrix a second time.
resultados_sort = resultados_padrao.sort_values()

resultados_padrao.to_csv("resultados/corr_padrao.csv")
resultados_sort.to_csv("resultados/corr_sort.csv")

In [None]:
# Every method whose saved timing table is gathered.
lista_metodos = [
    'all-mpnet-base-v2',
    'multi-qa-mpnet-base-dot-v1',
    'all-distilroberta-v1',
    'all-MiniLM-L12-v2',
    'multi-qa-distilbert-cos-v1',
    'all-MiniLM-L6-v2',
    'multi-qa-MiniLM-L6-cos-v1',
    'paraphrase-multilingual-mpnet-base-v2',
    'paraphrase-albert-small-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'paraphrase-MiniLM-L3-v2',
    'distiluse-base-multilingual-cased-v1',
    'distiluse-base-multilingual-cased-v2',
    'tf_idf',
    'USE',
]

# Stack all timing tables in list order. ``DataFrame.append`` was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration; a single
# ``pd.concat`` yields the same stacked table (original row indices kept,
# columns unioned in order of appearance).
df_time = pd.concat(
    [pd.read_csv(f'resultados/{metodo}_time.csv') for metodo in lista_metodos]
)

df_time


In [None]:
# Save the stacked timing table as gathered.
df_time.to_csv('resultados/time_padrao.csv')

# Save a second copy sorted by the column named '1'.
# NOTE(review): this requires the *_time.csv files to contain a column literally
# named '1' — confirm against the helpers that write them.
df_time.sort_values(by=['1']).to_csv('resultados/time_sort.csv')