In [1]:
import pandas as pd
import os
#cerrega todo os arquivos
dfsNew = []
dfsOld = []

for arq in os.listdir("./data/PT/"):
    if arq.find("New") > 0:
        dfsNew.append(pd.read_csv("./data/PT/"+arq, sep="\t", index_col=0))
    elif arq.find("Old") > 0:
        dfsOld.append(pd.read_csv("./data/PT/"+arq, sep="\t", index_col=0))



In [2]:
from nltk import ngrams
from sacremoses import MosesTokenizer

def tokenize(text, lang, nGram=3):
    entok = MosesTokenizer(lang=lang)
    text = entok.tokenize(text, escape=False)
    grams = []
    for i in range(1, nGram):
        i_grams = [
            " ".join(gram)
            for gram in ngrams(text, i)
        ]
        grams.extend(i_grams)
        
    return grams

def getNgramOverlap(hypothesys, references, nGram, lang):

  overlaps = []
  for h, r in zip([hypothesys], [references]):
    if (h == "") or (r == ""):
      overlaps.append(1.0)
      continue
    a = tokenize(h, lang, nGram)
    b = tokenize(r, lang, nGram)

    if len(a) >= len(b):
      overlaps.append(len(set(a) & set(b))/len(a))
    elif len(b) >= len(a):
      overlaps.append(len(set(a) & set(b))/len(b))

  return overlaps[0]

import numpy as np

def getStats(dfA, dfB, lang):
    df = dfA.merge(dfB, on=["livro", "capitulo", "versiculo"])

    df["sourceLen"] = df["texto_x"].apply(lambda x: getSizeSentece(x))
    df["targetLen"] = df["texto_y"].apply(lambda x: getSizeSentece(x))
    
    df["overlap"] = df.apply(lambda x: getNgramOverlap(x["texto_x"], x["texto_y"], 3, lang), axis=1)

    return df
#Matriz do novo testamento

def getSizeSentece(text):
    try:
        return len(text.split(" "))
    except:
        return 0
    


In [3]:
len(dfsOld)

5

In [4]:
print("Novo testamento: ")
import copy
concat_df_new = []
new = copy.deepcopy(dfsNew)
for dfA in new:
    new = new[1:]
    for dfB in new:
             
        print(set(dfA["livro"]) - set(dfB["livro"]))
        concat_df_new.append(getStats(dfA, dfB, "pt"))


Novo testamento: 
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()


In [5]:
dfConcat_novo = pd.concat(concat_df_new, ignore_index=False)
print("tamanho sem cortes: ", dfConcat_novo.shape)

dfConcat_novo["VERSAO"] = "NOVO"

tamanho sem cortes:  (115364, 10)


In [6]:
print("Velho testamento: ")
import copy
concat_df_old = []
old = copy.deepcopy(dfsOld)
for dfA in old:
    old = old[1:]
    for dfB in old:
             
        print(set(dfA["livro"]) - set(dfB["livro"]))
        concat_df_old.append(getStats(dfA, dfB, "pt"))

Velho testamento: 
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()


In [7]:
dfConcat_old = pd.concat(concat_df_old, ignore_index=False)
print("tamanho sem cortes: ", dfConcat_old.shape)
dfConcat_old["VERSAO"] = "VELHO"

tamanho sem cortes:  (223546, 10)


In [8]:
df_geral = pd.concat([dfConcat_old,dfConcat_novo])

print(df_geral["sourceLen"].mean())
print(df_geral["targetLen"].mean())
print(df_geral["overlap"].mean())

27.371939452952113
27.29075860848013
0.2873294570261173


In [9]:
df_geral.dropna(inplace=True)
df_geral = df_geral[(df_geral.sourceLen >= 5) & (df_geral.targetLen >= 5) ]

In [10]:
selRows = df_geral[(df_geral["livro"] == "Mateus") & (df_geral["capitulo"] == 1)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 16)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Lucas") & (df_geral["capitulo"] == 3)  & (df_geral["versiculo"] >= 23) & (df_geral["versiculo"] <= 38)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Gênesis") & (df_geral["capitulo"] == 5)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 32)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Gênesis") & (df_geral["capitulo"] == 10)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 32)].index
df_geral.drop(selRows, axis=0,inplace=True)


filter = df_geral['livro'].str.contains("1%20Crônicas")
df_geral = df_geral[~filter]
filter = df_geral['livro'].str.contains("2%20Crônicas")
df_geral = df_geral[~filter]


filter = df_geral['texto_x'].str.contains("Copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_x'].str.contains("copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_x'].str.contains("®")
df_geral = df_geral[~filter]

filter = df_geral['texto_y'].str.contains("Copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_y'].str.contains("copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_y'].str.contains("®")
df_geral = df_geral[~filter]



print(df_geral["sourceLen"].mean())
print(df_geral["targetLen"].mean())
print(df_geral["overlap"].mean())

27.368023053743702
27.1239104980243
0.2879330615451806


In [16]:
df_geral.value_counts(["estilo_x"])

estilo_x
ARC         112612
NTLH         84201
NVI-PT       59471
NVT          33027
OL            7040
Name: count, dtype: int64

In [15]:
df_geral.drop_duplicates(["texto_x"]).value_counts(["estilo_x"])

estilo_x
ARC         26921
NVI-PT      26483
NTLH        26032
NVT         25519
OL           6995
Name: count, dtype: int64

In [52]:
df_geral.to_csv("./data/FILTERED/portuguese.tsv", sep="\t")