In [1]:
import pandas as pd
import os
#cerrega todo os arquivos
dfsNew = []
dfsOld = []

for arq in os.listdir("./data/DE/"):
    if arq.find("New") > 0:
        dfsNew.append(pd.read_csv("./data/DE/"+arq, sep="\t"))
    elif arq.find("Old") > 0:
        dfsOld.append(pd.read_csv("./data/DE/"+arq, sep="\t"))


In [2]:
from nltk import ngrams
from sacremoses import MosesTokenizer

def tokenize(text, lang, nGram=3):
    entok = MosesTokenizer(lang=lang)
    text = entok.tokenize(text, escape=False)
    grams = []
    for i in range(1, nGram):
        i_grams = [
            " ".join(gram)
            for gram in ngrams(text, i)
        ]
        grams.extend(i_grams)
        
    return grams

def getNgramOverlap(hypothesys, references, nGram, lang):

  overlaps = []
  for h, r in zip([hypothesys], [references]):
    if (h == "") or (r == ""):
      overlaps.append(1.0)
      continue
    a = tokenize(h, lang, nGram)
    b = tokenize(r, lang, nGram)

    if len(a) >= len(b) and len(a) > 0:
      overlaps.append(len(set(a) & set(b))/len(a))
    elif len(b) >= len(a) and len(b) > 0:
      overlaps.append(len(set(a) & set(b))/len(b))
    elif len(a) == 0 or len(b) == 0:
      overlaps.append(0)

  return overlaps[0]

import numpy as np

def getStats(dfA, dfB, lang):
    df = dfA.merge(dfB, on=["livro", "capitulo", "versiculo"])

    df["sourceLen"] = df["texto_x"].apply(lambda x: getSizeSentece(x))
    df["targetLen"] = df["texto_y"].apply(lambda x: getSizeSentece(x))
    
    df["overlap"] = df.apply(lambda x: getNgramOverlap(x["texto_x"], x["texto_y"], 3, lang), axis=1)

    return df
#Matriz do novo testamento

def getSizeSentece(text):
    try:
        return len(text.split(" "))
    except:
        return 0
    


In [3]:
print("Novo testamento: ")
import copy
concat_df_new = []
new = copy.deepcopy(dfsNew)
for dfA in new:
    new = new[1:]
    for dfB in new:
             
        print(set(dfA["livro"]) - set(dfB["livro"]))
        concat_df_new.append(getStats(dfA, dfB, "de"))

Novo testamento: 
set()
set()
set()


In [4]:
len(dfsNew)

3

In [5]:
dfConcat_novo = pd.concat(concat_df_new, ignore_index=False)
print("tamanho sem cortes: ", dfConcat_novo.shape)
dfConcat_novo["VERSAO"] = "NOVO"

tamanho sem cortes:  (23177, 10)


In [6]:
print("Velho testamento: ")
import copy
concat_df_old = []
old = copy.deepcopy(dfsOld)
for dfA in old:
    old = old[1:]
    for dfB in old:
             
        print(set(dfA["livro"]) - set(dfB["livro"]))
        concat_df_old.append(getStats(dfA, dfB, "de"))

Velho testamento: 
set()
set()
set()


In [7]:
len(dfsOld)

3

In [8]:
dfConcat_old = pd.concat(concat_df_old, ignore_index=False)
print("tamanho sem cortes: ", dfConcat_old.shape)
dfConcat_old["VERSAO"] = "VELHO"

tamanho sem cortes:  (66388, 10)


In [9]:
df_geral = pd.concat([dfConcat_old,dfConcat_novo])

print(df_geral["sourceLen"].mean())
print(df_geral["targetLen"].mean())
print(df_geral["overlap"].mean())

28.787160163010103
12.948651817116062
0.09618886383430006


In [10]:
df_geral

Unnamed: 0,estilo_x,livro,capitulo,versiculo,texto_x,estilo_y,texto_y,sourceLen,targetLen,overlap,VERSAO
0,HOF,1%20Mose,1,1,Am Anfang schuf Gott Himmel und Erde .,LUTH1545,Am Anfang schuf Gott Himmel und Erde .,8,8,1.000000,VELHO
1,HOF,1%20Mose,1,11,Und Gott sprach : » Auf der Erde soll es grüne...,LUTH1545,Und Gott sprach : Es lasse die Erde aufgehen G...,34,45,0.146067,VELHO
2,HOF,1%20Mose,1,2,"Noch war die Erde leer und ungestaltet , von t...",LUTH1545,"Und die Erde war wüst und leer , und es war fi...",25,25,0.346939,VELHO
3,HOF,1%20Mose,1,3,"Da sprach Gott : » Licht soll entstehen ! « , ...",LUTH1545,Und Gott sprach : Es werde Licht ! und es ward...,17,13,0.212121,VELHO
4,HOF,1%20Mose,1,4,"Gott sah , dass es gut war . Er trennte das Li...",LUTH1545,"Und Gott sah , daß das Licht gut war . Da schi...",15,18,0.485714,VELHO
...,...,...,...,...,...,...,...,...,...,...,...
7733,LUTH1545,Offenbarung,22,17,Und der Geist und die Braut sprechen : Komm ! ...,NGU-DE,Od Und wer dieser Einladung folgt . Nach diese...,40,33,0.088608,NOVO
7734,LUTH1545,Offenbarung,22,18,"Ich bezeuge allen , die da hören die Worte der...",NGU-DE,W bezeuge . - Offenbarung,35,5,0.028986,NOVO
7735,LUTH1545,Offenbarung,22,19,Und so jemand davontut von den Worten des Buch...,NGU-DE,W dem wird Gott seinen Teil am Baum des Lebens...,35,27,0.391304,NOVO
7736,LUTH1545,Offenbarung,22,20,"Es spricht , der solches bezeugt : Ja , ich ko...",NGU-DE,"Od Der , der uns als Gottes Zeuge alle diese D...",21,25,0.102041,NOVO


In [11]:
df_geral.dropna(inplace=True)
df_geral = df_geral[(df_geral.sourceLen >= 5) & (df_geral.targetLen >= 5) ]

In [None]:
selRows = df_geral[(df_geral["livro"] == "Matthäus") & (df_geral["capitulo"] == 1)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 16)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Lukas") & (df_geral["capitulo"] == 3)  & (df_geral["versiculo"] >= 23) & (df_geral["versiculo"] <= 38)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Genèse") & (df_geral["capitulo"] == 5)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 32)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Genèse") & (df_geral["capitulo"] == 10)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 32)].index
df_geral.drop(selRows, axis=0,inplace=True)


filter = df_geral['texto_x'].str.contains("2%20Chroniquese")
df_geral = df_geral[~filter]
filter = df_geral['texto_x'].str.contains("1%20Chroniques")
df_geral = df_geral[~filter]


filter = df_geral['texto_x'].str.contains("Copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_x'].str.contains("®")
df_geral = df_geral[~filter]


print(df_geral["sourceLen"].mean())
print(df_geral["targetLen"].mean())
print(df_geral["overlap"].mean())

In [None]:
df_geral

In [None]:
df_geral.to_csv("./data/FILTERED/german.tsv", sep="\t")