In [2]:
import pandas as pd
import os
# Load every translation file, keeping New- and Old-Testament frames apart.
dfsNew = []
dfsOld = []

data_dir = "./data/ENG/"
# sorted(): os.listdir returns names in arbitrary order; a fixed order keeps the
# later pairwise comparison (which file is "x" and which is "y") reproducible.
for arq in sorted(os.listdir(data_dir)):
    # Substring test instead of `find(...) > 0`, which skipped any file whose
    # name *starts* with the marker (find returns 0 in that case).
    if "New" in arq:
        dfsNew.append(pd.read_csv(os.path.join(data_dir, arq), sep="\t"))
    elif "Old" in arq:
        dfsOld.append(pd.read_csv(os.path.join(data_dir, arq), sep="\t"))


In [4]:
from nltk import ngrams
from sacremoses import MosesTokenizer

# MosesTokenizer instances keyed by language code, so a tokenizer is built once
# per language instead of once per call (getStats calls tokenize for every row).
_MOSES_TOKENIZERS = {}


def tokenize(text, lang, nGram=3):
    """Tokenize ``text`` and return its word n-grams as space-joined strings.

    Args:
        text: Sentence to tokenize.
        lang: Language code handed to ``MosesTokenizer`` (e.g. ``"en"``).
        nGram: Exclusive upper bound on the n-gram order; orders
            ``1 .. nGram - 1`` are produced.

    Returns:
        A list with all 1-grams first, then all 2-grams, and so on, each
        n-gram rendered as its tokens joined by a single space.
    """
    tokenizer = _MOSES_TOKENIZERS.get(lang)
    if tokenizer is None:
        tokenizer = _MOSES_TOKENIZERS[lang] = MosesTokenizer(lang=lang)
    tokens = tokenizer.tokenize(text, escape=False)

    grams = []
    # NOTE(review): range(1, nGram) stops at nGram - 1, so the default of 3
    # yields unigrams and bigrams only. Kept as-is because downstream overlap
    # thresholds were computed with this behaviour -- confirm whether
    # trigrams were intended.
    for size in range(1, nGram):
        grams.extend(" ".join(gram) for gram in ngrams(tokens, size))

    return grams

def getNgramOverlap(hypothesys, references, nGram, lang):
    """Return the n-gram overlap ratio between two sentences.

    The ratio is the number of distinct n-grams shared by both sentences
    divided by the length of the longer n-gram list (duplicates included in
    that length).

    Args:
        hypothesys: First sentence.
        references: Second sentence.
        nGram: Forwarded to ``tokenize`` as its n-gram bound.
        lang: Language code forwarded to ``tokenize``.

    Returns:
        A float. ``1.0`` when either sentence is the empty string, or when
        neither sentence produces any n-gram.
    """
    if hypothesys == "" or references == "":
        return 1.0

    hyp_grams = tokenize(hypothesys, lang, nGram)
    ref_grams = tokenize(references, lang, nGram)

    longest = max(len(hyp_grams), len(ref_grams))
    if longest == 0:
        # Both sides produced no n-grams (e.g. whitespace-only text); the
        # original divided by zero here. Treated like the empty-string case.
        return 1.0

    return len(set(hyp_grams) & set(ref_grams)) / longest

import numpy as np

def getStats(dfA, dfB, lang):
    """Pair up the rows of two frames and compute length/overlap statistics.

    The frames are inner-merged on ``livro``, ``capitulo`` and ``versiculo``;
    the code reads the merged text columns as ``texto_x`` (from ``dfA``) and
    ``texto_y`` (from ``dfB``).

    Args:
        dfA: Frame supplying the ``texto_x`` side.
        dfB: Frame supplying the ``texto_y`` side.
        lang: Language code forwarded to ``getNgramOverlap``.

    Returns:
        The merged frame with three added columns: ``sourceLen`` and
        ``targetLen`` (``getSizeSentece`` of each text) and ``overlap``
        (``getNgramOverlap`` of the pair, called with ``nGram=3``).
    """
    df = dfA.merge(dfB, on=["livro", "capitulo", "versiculo"])

    df["sourceLen"] = df["texto_x"].apply(getSizeSentece)
    df["targetLen"] = df["texto_y"].apply(getSizeSentece)

    # Plain zip instead of df.apply(axis=1): on an empty merge, apply returns a
    # DataFrame rather than a Series and the column assignment fails.
    overlaps = [
        getNgramOverlap(source, target, 3, lang)
        for source, target in zip(df["texto_x"], df["texto_y"])
    ]
    df["overlap"] = pd.Series(overlaps, index=df.index, dtype="float64")

    return df
#Matriz do novo testamento

def getSizeSentece(text):
    """Return how many pieces ``text`` splits into on single spaces.

    Consecutive spaces produce empty pieces that are counted, and the empty
    string counts as one piece. Values that cannot be split this way (for
    example ``None`` or a float NaN) yield ``0``.
    """
    try:
        return len(text.split(" "))
    except (AttributeError, TypeError):
        # Not a string-like value. Narrowed from a bare `except:` so that
        # unrelated errors (KeyboardInterrupt, MemoryError, ...) still surface.
        return 0
    


In [5]:
print("Novo testamento: ")
newConcat = []
# Compare every unordered pair of frames exactly once, in the same order as
# before. Slicing by position (instead of rebinding dfsNew inside the loop)
# leaves dfsNew intact, so re-running this cell no longer produces an empty
# newConcat.
for i, dfA in enumerate(dfsNew):
    for dfB in dfsNew[i + 1:]:
        newConcat.append(getStats(dfA, dfB, "en"))

Novo testamento: 


In [6]:
# Stack all pairwise frames (original row labels kept) and report the raw size.
dfConcat = pd.concat(newConcat, ignore_index=False)
print("tamanho sem cortes: ", dfConcat.shape)

# Apply all row filters in a single pass.
dfConcat = dfConcat[
    (dfConcat["versiculo"] <= 180)    # drop unknown verse numbers
    & (dfConcat["overlap"] > 0.02)    # drop pairs whose texts differ too much
    & (dfConcat["sourceLen"] > 5)     # drop short source sentences
    & (dfConcat["targetLen"] > 5)     # drop short target sentences
]

print("tamanho com cortes: ", dfConcat.shape)

tamanho sem cortes:  (3127932, 10)
tamanho com cortes:  (3040910, 10)


In [7]:
# Write the filtered pairs as a tab-separated file, without the index column.
dfConcat.to_csv(
    "./data/ENG/NovoTestamentoCompleto-ENG.tsv",
    sep="\t",
    index=False,
)

In [8]:
print("Velho testamento: ")
oldConcat = []
# Compare every unordered pair of frames exactly once, in the same order as
# before. Slicing by position (instead of rebinding dfsOld inside the loop)
# leaves dfsOld intact, so re-running this cell no longer produces an empty
# oldConcat.
for i, dfA in enumerate(dfsOld):
    for dfB in dfsOld[i + 1:]:
        oldConcat.append(getStats(dfA, dfB, "en"))

Velho testamento: 


In [9]:
# Stack all pairwise frames (original row labels kept) and report the raw size.
dfConcat = pd.concat(oldConcat, ignore_index=False)
print("tamanho sem cortes: ", dfConcat.shape)

# Apply all row filters in a single pass.
dfConcat = dfConcat[
    (dfConcat["versiculo"] <= 180)    # drop unknown verse numbers
    & (dfConcat["overlap"] > 0.02)    # drop pairs whose texts differ too much
    & (dfConcat["sourceLen"] > 5)     # drop short source sentences
    & (dfConcat["targetLen"] > 5)     # drop short target sentences
]

print("tamanho com cortes: ", dfConcat.shape)

tamanho sem cortes:  (10246620, 10)
tamanho com cortes:  (8841448, 10)


In [10]:
# Write the filtered pairs as a tab-separated file, without the index column.
dfConcat.to_csv(
    "./data/ENG/VelhoTestamentoCompleto-ENG.tsv",
    sep="\t",
    index=False,
)