In [1]:
import pandas as pd
import os
#cerrega todo os arquivos
dfsNew = []
dfsOld = []

for arq in os.listdir("./data/FR/"):
    if arq.find("New") > 0:
        dfsNew.append(pd.read_csv("./data/FR/"+arq, sep="\t"))
    elif arq.find("Old") > 0:
        dfsOld.append(pd.read_csv("./data/FR/"+arq, sep="\t"))


In [2]:
from nltk import ngrams
from sacremoses import MosesTokenizer

def tokenize(text, lang, nGram=3):
    entok = MosesTokenizer(lang=lang)
    text = entok.tokenize(text, escape=False)
    grams = []
    for i in range(1, nGram):
        i_grams = [
            " ".join(gram)
            for gram in ngrams(text, i)
        ]
        grams.extend(i_grams)
        
    return grams

def getNgramOverlap(hypothesys, references, nGram, lang):

  overlaps = []
  for h, r in zip([hypothesys], [references]):
    if (h == "") or (r == ""):
      overlaps.append(1.0)
      continue
    a = tokenize(h, lang, nGram)
    b = tokenize(r, lang, nGram)

    if len(a) >= len(b) and len(a) > 0:
      overlaps.append(len(set(a) & set(b))/len(a))
    elif len(b) >= len(a) and len(b) > 0:
      overlaps.append(len(set(a) & set(b))/len(b))
    elif len(a) == 0 or len(b) == 0:
      overlaps.append(0)

  return overlaps[0]

import numpy as np

def getStats(dfA, dfB, lang):
    df = dfA.merge(dfB, on=["livro", "capitulo", "versiculo"])

    df["sourceLen"] = df["texto_x"].apply(lambda x: getSizeSentece(x))
    df["targetLen"] = df["texto_y"].apply(lambda x: getSizeSentece(x))
    
    df["overlap"] = df.apply(lambda x: getNgramOverlap(x["texto_x"], x["texto_y"], 3, lang), axis=1)

    return df
#Matriz do novo testamento

def getSizeSentece(text):
    try:
        return len(text.split(" "))
    except:
        return 0
    

In [3]:
print("Novo testamento: ")
import copy
concat_df_new = []
new = copy.deepcopy(dfsNew)
for dfA in new:
    new = new[1:]
    for dfB in new:
             
        print(set(dfA["livro"]) - set(dfB["livro"]))
        concat_df_new.append(getStats(dfA, dfB, "fr"))

Novo testamento: 
set()
set()
set()
set()
set()
set()


In [4]:
len(dfsNew)

4

In [5]:
dfConcat_novo = pd.concat(concat_df_new, ignore_index=False)
print("tamanho sem cortes: ", dfConcat_novo.shape)
dfConcat_novo["VERSAO"] = "NOVO"

tamanho sem cortes:  (46542, 10)


In [6]:
print("Velho testamento: ")
import copy
concat_df_old = []
old = copy.deepcopy(dfsOld)
for dfA in old:
    old = old[1:]
    for dfB in old:
             
        print(set(dfA["livro"]) - set(dfB["livro"]))
        concat_df_old.append(getStats(dfA, dfB, "fr"))

Velho testamento: 
set()
set()
set()
set()
set()
set()


In [7]:
len(dfsOld)

4

In [8]:
dfConcat_old = pd.concat(concat_df_old, ignore_index=False)
print("tamanho sem cortes: ", dfConcat_old.shape)
dfConcat_old["VERSAO"] = "VELHO"

tamanho sem cortes:  (136061, 10)


In [9]:
df_geral = pd.concat([dfConcat_old,dfConcat_novo])

print(df_geral["sourceLen"].mean())
print(df_geral["targetLen"].mean())
print(df_geral["overlap"].mean())

31.501065152270225
27.288500188934464
0.39438657302337643


In [10]:
df_geral

Unnamed: 0,estilo_x,livro,capitulo,versiculo,texto_x,estilo_y,texto_y,sourceLen,targetLen,overlap,VERSAO
0,BDS,Genèse,1,1,"Au commencement , Dieu créa le ciel et la terre .",LSG,"Au commencement , Dieu créa les cieux et la te...",11,11,0.761905,VELHO
1,BDS,Genèse,1,2,"Or , la terre était chaotique et vide . Les té...",LSG,La terre était informe et vide : il y avait de...,27,30,0.338028,VELHO
2,BDS,Genèse,1,3,Et Dieu dit alors : Que la lumière soit ! Et l...,LSG,Dieu dit : Que la lumière soit ! Et la lumière...,18,13,0.425532,VELHO
3,BDS,Genèse,1,4,"Dieu vit que la lumière était bonne , et il sé...",LSG,Dieu vit que la lumière était bonne ; et Dieu ...,16,18,0.463415,VELHO
4,BDS,Genèse,1,5,Il appela la lumière : « jour » et les ténèbre...,LSG,"Dieu appela la lumière jour , et il appela les...",34,34,0.507463,VELHO
...,...,...,...,...,...,...,...,...,...,...,...
7757,NEG1979,Apocalypse,22,17,Et l ’ Esprit et l ’ épouse disent : Viens . E...,SG21,L&apos; Esprit et l&apos; épouse disent : « Vi...,44,42,0.376238,NOVO
7758,NEG1979,Apocalypse,22,18,Je le déclare à quiconque entend les paroles d...,SG21,Je le déclare à toute personne qui écoute les ...,34,34,0.575342,NOVO
7759,NEG1979,Apocalypse,22,19,et si quelqu ’ un retranche quelque chose des ...,SG21,et si quelqu&apos; un enlève quelque chose aux...,38,35,0.617284,NOVO
7760,NEG1979,Apocalypse,22,20,"Celui qui atteste ces choses dit : Oui , je vi...",SG21,"Celui qui atteste ces choses dit : « Oui , je ...",20,22,0.813953,NOVO


In [11]:
df_geral.dropna(inplace=True)
df_geral = df_geral[(df_geral.sourceLen >= 5) & (df_geral.targetLen >= 5) ]

In [12]:

selRows = df_geral[(df_geral["livro"] == "Matthieu") & (df_geral["capitulo"] == 1)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 16)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Luc") & (df_geral["capitulo"] == 3)  & (df_geral["versiculo"] >= 23) & (df_geral["versiculo"] <= 38)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Genèse") & (df_geral["capitulo"] == 5)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 32)].index
df_geral.drop(selRows, axis=0,inplace=True)

selRows = df_geral[(df_geral["livro"] == "Genèse") & (df_geral["capitulo"] == 10)  & (df_geral["versiculo"] >= 1) & (df_geral["versiculo"] <= 32)].index
df_geral.drop(selRows, axis=0,inplace=True)


filter = df_geral['livro'].str.contains("2%20Chroniquese")
df_geral = df_geral[~filter]
filter = df_geral['livro'].str.contains("1%20Chroniques")
df_geral = df_geral[~filter]


filter = df_geral['texto_y'].str.contains("copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_y'].str.contains("Copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_y'].str.contains("®")
df_geral = df_geral[~filter]

filter = df_geral['texto_x'].str.contains("copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_x'].str.contains("Copyright")
df_geral = df_geral[~filter]
filter = df_geral['texto_x'].str.contains("®")
df_geral = df_geral[~filter]


print(df_geral["sourceLen"].mean())
print(df_geral["targetLen"].mean())
print(df_geral["overlap"].mean())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_geral.drop(selRows, axis=0,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_geral.drop(selRows, axis=0,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_geral.drop(selRows, axis=0,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_geral.drop(selRows, axis=0,inplac

32.20061936790714
29.506284571636325
0.43793781985630387


In [13]:
df_geral

Unnamed: 0,estilo_x,livro,capitulo,versiculo,texto_x,estilo_y,texto_y,sourceLen,targetLen,overlap,VERSAO
16,BDS,Genèse,1,17,Il les plaça dans l ’ étendue du ciel pour ill...,LSG,"Dieu les plaça dans l&apos; étendue du ciel , ...",14,14,0.545455,VELHO
17,BDS,Genèse,1,18,"pour présider au jour ainsi qu ’ à la nuit , e...",LSG,"pour présider au jour et à la nuit , et pour s...",26,26,0.578947,VELHO
18,BDS,Genèse,1,19,"Il y eut un soir , il y eut un matin : ce fut ...",LSG,"Ainsi , il y eut un soir , et il y eut un mati...",18,21,0.682927,VELHO
19,BDS,Genèse,1,20,Puis Dieu dit : Que les eaux foisonnent d ’ un...,LSG,Dieu dit : Que les eaux produisent en abondanc...,33,27,0.507692,VELHO
20,BDS,Genèse,1,21,Alors Dieu créa chaque espèce de grands animau...,LSG,Dieu créa les grands poissons et tous les anim...,41,42,0.349398,VELHO
...,...,...,...,...,...,...,...,...,...,...,...
7756,NEG1979,Apocalypse,22,16,"Moi Jésus , j ’ ai envoyé mon ange pour vous a...",SG21,"Moi Jésus , j&apos; ai envoyé mon ange pour vo...",35,37,0.647059,NOVO
7757,NEG1979,Apocalypse,22,17,Et l ’ Esprit et l ’ épouse disent : Viens . E...,SG21,L&apos; Esprit et l&apos; épouse disent : « Vi...,44,42,0.376238,NOVO
7758,NEG1979,Apocalypse,22,18,Je le déclare à quiconque entend les paroles d...,SG21,Je le déclare à toute personne qui écoute les ...,34,34,0.575342,NOVO
7759,NEG1979,Apocalypse,22,19,et si quelqu ’ un retranche quelque chose des ...,SG21,et si quelqu&apos; un enlève quelque chose aux...,38,35,0.617284,NOVO


In [14]:
df_geral.to_csv("./data/FILTERED/french.tsv", sep="\t")