# Text preprocessing - Pipeline

This code will preprocess a pair of documents in order to analyse their similarity afterwards.

Import libraries

In [None]:
import pandas as pd
import numpy as np
import json
import nltk
#nltk.download()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



Read files

In [None]:
# Show full cell contents. `None` disables truncation; the legacy `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
## dfSourceV2 = pd.read_csv('../Dados/v2_semeval-2022_task8_train-data_batch.csv')
trainv1 = pd.read_csv('dados/train v0.1.csv')

# Keep only the pairs in which both articles are tagged as English.
trainv1_enen = trainv1[(trainv1['url1_lang']=='en') & (trainv1['url2_lang']=='en')]

trainv1_enen.head()

### Carregar noticias a partir dos ids:

Functions that read json documents

In [None]:
#Function reads a json file
def readJsonFile(path):
    """Load and return the parsed content of the JSON file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed even
    when ``json.load`` raises (for example on malformed JSON). It is opened in
    binary mode so that ``json`` detects the UTF-8/16/32 encoding itself
    instead of depending on the platform's default text encoding.
    """
    with open(path, 'rb') as jsonFile:
        return json.load(jsonFile)

    
#Function returns a dataframe with the text of the pairs
def getJsonDocumentPair(dataPath, pairId):
    """Load both JSON documents of a pair.

    ``pairId`` is split on ``'_'``; the first two parts are the document ids.
    Each document is read from
    ``<dataPath><last two characters of the id>/<id>.json``.

    Returns the tuple ``(doc1Json, doc2Json)`` of parsed documents.
    """
    idParts = pairId.split('_')
    firstId, secondId = idParts[0], idParts[1]

    def loadDocument(docId):
        # Files live in a sub-folder named after the id's last two characters.
        return readJsonFile(dataPath + docId[-2:] + '/' + docId + '.json')

    firstJson = loadDocument(firstId)
    secondJson = loadDocument(secondId)
    return (firstJson, secondJson)

Getting text to preprocess

In [None]:
dataPath = 'dados/train v0.1/'
#jsonPair = getJsonDocumentPair(dataPath,'1484084337_1484110209')
#textDoc1 = jsonPair[0]['text']

lista_docs = []   # (pair_id, doc1 text, doc2 text, Overall) for every usable pair
lista_error = []  # pair_ids whose documents could not be loaded or lack 'text'
lista_vazio = []  # pair_ids for which at least one of the two texts is empty
values = trainv1_enen[['pair_id', 'Overall']]

# The loop variable is named `row` so it no longer overwrites `values`
# (the frame being iterated) with the last row.
for index, row in values.iterrows():
    try:
        jsonPair = getJsonDocumentPair(dataPath, row['pair_id'])
        textDoc1 = jsonPair[0]['text']
        textDoc2 = jsonPair[1]['text']
        if len(textDoc1)>0 and len(textDoc2)>0:
            lista_docs.append((row['pair_id'], textDoc1, textDoc2, row['Overall']))
        else:
            lista_vazio.append(row['pair_id'])
    except Exception:
        # Best effort: record the failing pair and carry on. `Exception` (not a
        # bare `except:`) lets KeyboardInterrupt/SystemExit stop the loop.
        lista_error.append(row['pair_id'])

#Creating DF to text
#dfText = pd.DataFrame([[textDoc1]], columns=['original_text'])
dfText = pd.DataFrame(lista_docs,  columns=['pair_id', 'doc1', 'doc2', 'Overall'])
dfText.head(1)

In [None]:
# (rows, columns) of the pair table; one row per pair whose two texts are non-empty.
dfText.shape


In [None]:
# (rows, columns) of the English-English subset, for comparison with dfText.
trainv1_enen.shape

In [None]:
# Number of pairs that raised while their documents were being loaded.
len(lista_error)

In [None]:
# Number of pairs skipped because at least one of the two texts is empty.
len(lista_vazio)

# Inicio do pre-processamento

Removing punctuation

In [None]:
import string
# Bare expression: it is not the cell's last line, so nothing is displayed.
string.punctuation
# Extra typographic characters (em dash, curly double quotes) removed in
# addition to string.punctuation.
other_punctuation = '—“”'  

In [None]:
#Function that removes punctuation 
def removePunctuation(text):
    """Return ``text`` with every punctuation character removed.

    The characters dropped are those of ``string.punctuation`` plus the
    notebook-level ``other_punctuation`` string.
    """
    # Build the deletion table once per call; the previous version
    # re-concatenated the punctuation strings for every character of `text`.
    deleteTable = str.maketrans('', '', string.punctuation + other_punctuation)
    return text.translate(deleteTable)


# Store the punctuation-free text of both documents of each pair
dfText['clean_msg1'] = dfText['doc1'].apply(removePunctuation)
dfText['clean_msg2'] = dfText['doc2'].apply(removePunctuation)
dfText.head(1)

Transforming to lowercase

In [None]:
# Lower-case the cleaned text of both documents
dfText['msg_lower1'] = dfText['clean_msg1'].map(str.lower)
dfText['msg_lower2'] = dfText['clean_msg2'].map(str.lower)
dfText.head(1)

Tokenization

In [None]:
# Split each lower-cased document into a list of word tokens
dfText['msg_tokenized1'] = dfText['msg_lower1'].map(nltk.word_tokenize)
dfText['msg_tokenized2'] = dfText['msg_lower2'].map(nltk.word_tokenize)
dfText.head(1)

TODO: consider also applying a sentence tokenizer.

Stopwords filtering

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, extended with the typographic apostrophe.
stop_words = stopwords.words('english') + ['’']
print(stop_words)

In [None]:
def removeStopwords(listWords):
    """Return the tokens of ``listWords`` that are not stop words, in order.

    Stop words are read from the notebook-level ``stop_words`` collection.
    """
    # A set makes each membership test O(1); testing against the list would
    # scan all stop words once per token.
    stopwordSet = set(stop_words)
    return [word for word in listWords if word not in stopwordSet]

# Filter stop words out of both token lists, then show before/after side by side
dfText['no_stopwords1'] = dfText['msg_tokenized1'].apply(removeStopwords)
dfText['no_stopwords2'] = dfText['msg_tokenized2'].apply(removeStopwords)
dfText[['msg_tokenized1', 'no_stopwords1', 'msg_tokenized2', 'no_stopwords2']].head(1)


Stemming

In [None]:
from nltk.stem.porter import PorterStemmer

# Single stemmer instance shared by every call to `stemming`.
porter = PorterStemmer()


def stemming(listWords):
    """Return a list with the Porter stem of every token in ``listWords``."""
    return list(map(porter.stem, listWords))

# Stem the stop-word-free tokens of both documents
dfText['msg_stemmed1'] = dfText['no_stopwords1'].apply(stemming)
dfText['msg_stemmed2'] = dfText['no_stopwords2'].apply(stemming)
dfText[['no_stopwords1', 'msg_stemmed1', 'no_stopwords2', 'msg_stemmed2']].head(1)

Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance shared by every call to `lemmatizer`.
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(listWords):
    """Return a list with the WordNet lemma of every token in ``listWords``."""
    return list(map(wordnet_lemmatizer.lemmatize, listWords))

# Lemmatize the stop-word-free tokens of both documents
dfText['msg_lemmatized1'] = dfText['no_stopwords1'].apply(lemmatizer)
dfText['msg_lemmatized2'] = dfText['no_stopwords2'].apply(lemmatizer)
dfText[['no_stopwords1', 'msg_lemmatized1']].head(1)

# Implementação dos algoritmos

In [None]:
def join_unique_docs(dfText, column1, column2):
    """Return every distinct document of the two token-list columns as a string.

    Each cell of ``column1``/``column2`` is a list of tokens; it is joined with
    single spaces. Duplicates are dropped and the first-seen order is kept, so
    the result is the same on every run (a plain ``set`` gives no stable order).
    """
    data = []
    # Iterate by position rather than by label so the function also works when
    # the frame's index is not 0..n-1 (e.g. after filtering rows).
    for tokens1, tokens2 in zip(dfText[column1], dfText[column2]):
        data.append(' '.join(tokens1))
        data.append(' '.join(tokens2))

    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(data))

In [None]:
def join_docs(dfText, column1, column2):
    """Return all documents of the two token-list columns as strings.

    The result interleaves the columns row by row: positions ``2*i`` and
    ``2*i + 1`` hold the ``column1`` and ``column2`` documents of the i-th row,
    each joined with single spaces.
    """
    data = []
    # Iterate by position rather than by label so the function also works when
    # the frame's index is not 0..n-1 (e.g. after filtering rows).
    for tokens1, tokens2 in zip(dfText[column1], dfText[column2]):
        data.append(' '.join(tokens1))
        data.append(' '.join(tokens2))

    return data

### Jaccard

In [None]:
def calculate_jaccard(word_tokens1, word_tokens2):
    """Return the Jaccard similarity of two token collections.

    The score is ``|A ∩ B| / |A ∪ B|`` over the distinct tokens of each
    argument, and ``0`` when both are empty. Any iterable of hashable tokens
    is accepted (lists, tuples, sets).
    """
    tokens1 = set(word_tokens1)
    tokens2 = set(word_tokens2)

    union = tokens1 | tokens2
    if not union:
        # Both documents are empty: define the similarity as 0 instead of
        # dividing by zero.
        return 0

    # Set intersection replaces the per-token scan of the second list.
    return len(tokens1 & tokens2) / len(union)

In [None]:
# Jaccard similarity of the lemmatized token lists, one score per pair
dfText['jaccard'] = dfText.apply(
    lambda pair: calculate_jaccard(pair['msg_lemmatized1'], pair['msg_lemmatized2']),
    axis=1,
)
#calculate_jaccard(dfText['msg_lemmatized1'][0], dfText['msg_lemmatized2'][0])

In [None]:
# Jaccard score next to the Overall column for every pair.
dfText[['jaccard', 'Overall']]

In [None]:
# Correlation matrix between the Jaccard score and Overall (pandas default method).
dfText[['jaccard', 'Overall']].corr()

### BoW (CountVectorizer)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def get_bow(doc1, doc2):
    """Return the cosine similarity of the bag-of-words vectors of two documents.

    ``doc1`` and ``doc2`` are token lists. They are joined with spaces and
    vectorised together by a fresh ``CountVectorizer`` fitted on just these
    two texts. Returns ``0.0`` when the vectoriser finds no vocabulary at all.
    """
    text_list = [' '.join(doc1), ' '.join(doc2)]

    try:
        vector = CountVectorizer().fit_transform(text_list)
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when neither
        # text yields a token; report no similarity instead of aborting the
        # row-wise apply over the whole table.
        return 0.0

    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    cosine_similarities = cosine_similarity(vector[0], vector[1])
    return cosine_similarities[0][0]


In [None]:
# Bag-of-words cosine similarity of the stop-word-free token lists
dfText['bow'] = dfText.apply(
    lambda pair: get_bow(pair['no_stopwords1'], pair['no_stopwords2']),
    axis=1,
)

In [None]:
# Bag-of-words score next to the Overall column for every pair.
dfText[['bow', 'Overall']]

In [None]:
# Correlation matrix between the bag-of-words score and Overall.
dfText[['bow', 'Overall']].corr()

### TF-IDF

In [None]:
def calculate_tfidf(dfText, column1, column2):
    """Fit TF-IDF on every document of both columns and return the matrix.

    Documents are collected with ``join_docs`` (duplicates kept), so the rows
    of the returned matrix follow that function's ordering.
    """
    corpus = join_docs(dfText, column1, column2)
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(corpus)

In [None]:
def get_tfidf(tfidf, index):
    """Return the cosine similarity of the two documents of pair ``index``.

    Rows ``2*index`` and ``2*index + 1`` of ``tfidf`` are compared.
    """
    firstRow = 2 * index
    secondRow = firstRow + 1
    # cosine_similarity yields a 1x1 matrix for two single-row inputs.
    return cosine_similarity(tfidf[firstRow], tfidf[secondRow])[0][0]

In [None]:
# Token-list columns used as the TF-IDF corpus (stop words already removed).
column1 = 'no_stopwords1'
column2 = 'no_stopwords2'

# TF-IDF matrix fitted on all documents of both columns.
tfidf = calculate_tfidf(dfText, column1, column2)

In [None]:
# One TF-IDF cosine score per pair, addressed by the pair's position in dfText
# (a row-wise apply was tried before and left disabled).
tfidf_list = [get_tfidf(tfidf, pairPosition) for pairPosition in range(len(dfText))]

dfText['tfidf'] = tfidf_list

In [None]:
# TF-IDF score next to the Overall column for every pair.
dfText[['tfidf', 'Overall']]

In [None]:
# Correlation matrix between the TF-IDF score and Overall.
dfText[['tfidf', 'Overall']].corr()

### Doc2Vec (https://medium.com/red-buffer/doc2vec-computing-similarity-between-the-documents-47daf6c828cd)

In [None]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec
from scipy import spatial

In [None]:
# data = []
# for i in range(0,len(dfText)):
#     data.append(' '.join(dfText['msg_lemmatized1'][i]))
#     data.append(' '.join(dfText['msg_lemmatized2'][i]))

# data = list(set(data))

In [None]:
# Doc2Vec training corpus: every distinct lemmatized document as one string.
# `data` used to come from a cell that is now commented out, so on a fresh
# kernel this line raised NameError; it is rebuilt here explicitly.
data = join_unique_docs(dfText, 'msg_lemmatized1', 'msg_lemmatized2')

# One TaggedDocument per document, tagged with its position in `data`.
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [None]:
# Doc2Vec with 30-dimensional vectors and 80 epochs; min_count=0 keeps every
# word regardless of frequency. No seed is passed, so runs are not repeatable.
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=0, epochs=80)

In [None]:
# Build the model vocabulary from the tagged training documents.
model.build_vocab(tagged_data)

In [None]:
# Train with the epoch count configured on the model, so the value is stated
# in one place only (the constructor) and cannot drift out of sync.
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Persist the trained model to the working directory.
model.save("d2v.model")

In [None]:
# Reload the model from the file written by the previous cell.
model = Doc2Vec.load("d2v.model")

In [None]:
def doc2vec_cos(doc1, doc2):
    """Return the cosine similarity of the Doc2Vec vectors of two token lists.

    Vectors are inferred with the notebook-level ``model``, first for ``doc1``
    and then for ``doc2``.
    """
    inferred = [model.infer_vector(tokens) for tokens in (doc1, doc2)]
    # scipy returns the cosine *distance* (0 to 2); 1 - distance is the cosine
    # similarity, which ranges from -1 to 1.
    return 1 - spatial.distance.cosine(inferred[0], inferred[1])


In [None]:
# Doc2Vec cosine similarity, inferred from the full token lists of each pair
dfText['doc2vec'] = dfText.apply(
    lambda pair: doc2vec_cos(pair['msg_tokenized1'], pair['msg_tokenized2']),
    axis=1,
)

In [None]:
# First 20 pairs: Doc2Vec score next to Overall.
dfText[['doc2vec', 'Overall']].head(20)

In [None]:
# Summary statistics of dfText (pandas describes the numeric columns by default).
dfText.describe()

In [None]:
### Proximos passos: definir pipeline, aplicar o metodo para no_stopwords, msg_lemmatized e msg_stemmed.
### Para verificar qual é o melhor método podemos colocar o Overall na mesma escala (0 a 1?) e comparar utilizando
### Alguma métrica
## obs: ver tbm se esta utilizando CBOW ou...

In [None]:
# dfText[dfText['doc2vec']<0][['Overall', 'jaccard', 'doc2vec', 'pair_id']]
#dfText[['Overall', 'jaccard', 'doc2vec', 'pair_id']]

In [None]:
# Histogram of the Overall column.
dfText.hist('Overall')

Plotting scatterplot overall vs doc2vec

In [None]:
import seaborn as sns
# One point per pair: Overall on the x axis, Doc2Vec similarity on the y axis.
sns.scatterplot(data=dfText, x="Overall", y="doc2vec")

### BERT

In [None]:
# Example sentences for trying out the sentence-embedding model.
# NOTE(review): `sentences` is not referenced by any other visible cell.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

In [None]:
from sentence_transformers import SentenceTransformer

# Initialise the sentence-embedding model.
# NOTE(review): this rebinds `model`, which until here held the Doc2Vec model
# that doc2vec_cos reads; re-running the Doc2Vec cells after this one will
# use the wrong object.
# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
#Codificando as sentencas --> Transformando para espaço vetorial
# sentence_embeddings = model.encode(data)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def bert_cos(doc1, doc2):
    """Return the cosine similarity of the sentence embeddings of two texts.

    Both texts are encoded in a single call to the notebook-level ``model``.
    """
    embeddings = model.encode([doc1, doc2])
    firstEmbedding, secondEmbedding = embeddings[0], embeddings[1]
    # scipy returns the cosine *distance* (0 to 2); 1 - distance is the cosine
    # similarity, which ranges from -1 to 1.
    return 1 - spatial.distance.cosine(firstEmbedding, secondEmbedding)


In [None]:
# Sentence-embedding similarity of the stop-word-free text of each pair
dfText['bert'] = dfText.apply(
    lambda pair: bert_cos(" ".join(pair['no_stopwords1']), " ".join(pair['no_stopwords2'])),
    axis=1,
)

# dfText['bert2'] = dfText.iloc[:200].apply(lambda row: bert_cos(" ".join(row['no_stopwords1']), " ".join(row['no_stopwords2'])), axis=1)

In [None]:
# First 30 pairs: sentence-embedding score next to Overall.
dfText[['bert', 'Overall']].head(30)

In [None]:
# First 20 pairs among those whose Overall value is below 1.5.
dfText[dfText['Overall']<1.5][['bert', 'Overall']].head(20)

In [None]:
# Correlation matrix between Overall and the two embedding-based scores.
dfText[["Overall", "bert","doc2vec"]].corr()

In [None]:
# One point per pair: Overall on the x axis, sentence-embedding similarity on the y axis.
sns.scatterplot(data=dfText, x="Overall", y="bert")

### Word2vec (embeddings)

In [None]:
# import gensim.downloader as api

# print(api.load("word2vec-google-news-300", return_path=True))

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# `import gensim` alone does not load the `downloader` submodule, so it is
# imported explicitly here; otherwise `gensim.downloader` is an AttributeError.
import gensim.downloader

#glove_vectors = gensim.downloader.load('glove-twitter-25')
#model = KeyedVectors.load_word2vec_format('data/wiki.en.vec', binary=False)
# model = KeyedVectors.load_word2vec_format('model/word2vec-google-news-300.gz', binary=True)
# NOTE(review): this rebinds `model` (previously the sentence-embedding model).
model = gensim.downloader.load('word2vec-google-news-300')

In [None]:
def get_mean_vector(word2vec_model, words):
    """Return the mean embedding of the in-vocabulary tokens of a document.

    Parameters
    ----------
    word2vec_model : keyed-vector model exposing ``key_to_index`` and
        indexable by a list of words.
    words : iterable of tokens (a whole document).

    Returns
    -------
    The element-wise mean of the vectors of the known tokens, or an empty
    list when none of the tokens is in the vocabulary.
    """
    # Use the model passed in (not the notebook-level `model`), and test
    # membership against the key->index mapping, which is a hash lookup,
    # instead of scanning the `index_to_key` list for every token.
    vocabulary = word2vec_model.key_to_index
    known_words = [word for word in words if word in vocabulary]
    if len(known_words) >= 1:
        return np.mean(word2vec_model[known_words], axis=0)
    return []


In [None]:
# Sanity check: mean word vector of the doc1 tokens of the row labelled 0.
get_mean_vector(model, dfText['no_stopwords1'][0])

In [None]:
def bert_cos(doc1, doc2, model):
    """Return the cosine similarity of the mean word vectors of two documents.

    ``doc1`` and ``doc2`` are passed to ``get_mean_vector`` together with
    ``model``. Returns ``nan`` when either document has no in-vocabulary word,
    because no vector exists to compare.

    NOTE(review): despite its name this is the word-vector variant; defining
    it replaces the two-argument sentence-embedding ``bert_cos`` defined
    earlier in this notebook.
    """
    mean_doc1 = get_mean_vector(model, doc1)
    mean_doc2 = get_mean_vector(model, doc2)

    # get_mean_vector returns an empty sequence when nothing is in vocabulary;
    # handing that to scipy would fail instead of producing a score.
    if len(mean_doc1) == 0 or len(mean_doc2) == 0:
        return float('nan')

    # scipy returns the cosine *distance* (0 to 2); 1 - distance is the cosine
    # similarity, which ranges from -1 to 1.
    return 1 - spatial.distance.cosine(mean_doc1, mean_doc2)

In [None]:
# Pass the token lists themselves: get_mean_vector iterates over its argument,
# so a space-joined string would be averaged character by character.
dfText['word2vec_mean'] = dfText.apply(
    lambda row: bert_cos(row['no_stopwords1'], row['no_stopwords2'], model),
    axis=1,
)

In [None]:
# NOTE(review): `corpus` is not defined in any visible cell of this notebook;
# the loop expects items exposing a `.words` token list. TODO: define it or
# drop this cell.
for doc in corpus:
    vec = get_mean_vector(model, doc.words)
    if len(vec) > 0:
        # do something with the vector `vec`
        # (`pass` keeps the cell syntactically valid until that is written)
        pass

In [None]:
import re

from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess


class MyCorpus():
    """Iterable that yields one cleaned ``TaggedDocument`` per training row.

    ``train_data`` must provide a ``'text'`` and a ``'documentId'`` column.
    Every iteration re-reads ``train_data`` and re-applies the cleaning steps,
    so the object can be iterated more than once.
    """

    def __init__(self, train_data):
        self.train_data = train_data

    def __iter__(self):
        p = PorterStemmer()
        # Walk the two columns by position so a non 0..n-1 index (e.g. after
        # filtering rows) does not break the lookup.
        texts = self.train_data['text']
        document_ids = self.train_data['documentId']
        for doc, document_id in zip(texts, document_ids):
            doc = re.sub(r'\S*@\S*\s?', '', doc, flags=re.MULTILINE) # remove email
            doc = re.sub(r'http\S+', '', doc, flags=re.MULTILINE) # remove web addresses
            doc = re.sub("\'", "", doc) # remove single quotes
            doc = remove_stopwords(doc)
            doc = p.stem_sentence(doc)
            # Lower-case, strip accents (deacc=True) and tokenize.
            words = simple_preprocess(doc, deacc=True)
            yield TaggedDocument(words=words, tags=[document_id])

## Mapear resultado

In [None]:
'''def transformarResultado(resultado, OldMax=-1, OldMin=1, NewMax=4, NewMin=1):
    OldRange = (OldMax - OldMin)  
    NewRange = (NewMax - NewMin)  
    NewValue = (((resultado - OldMin) * NewRange) / OldRange) + NewMin
    return NewValue

    
dfText['bert_norm'] = dfText['bert'].apply(lambda x: transformarResultado(x))
dfText['bert_norm1'] = dfText['bert'].apply(lambda x: transformarResultado(x, OldMax=min(dfText['bert']), OldMin = max(dfText['bert'])))'''

In [None]:
#dfText[['bert', 'bert_norm', 'bert_norm1', 'Overall']].head(20)