In [1]:
import json
import torch
from transformers import MT5ForConditionalGeneration, AutoTokenizer
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from config import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

with open(bk_en_dir, 'r') as f:
    en = json.load(f)
    
with open(bk_he_dir, 'r') as f:
    he = json.load(f)

In [3]:
# print text
print(he['text'][4][4])
# remove <b> tags
print(en['text'][4][4].replace('<b>', '').replace('</b>', ''))

והשתא דאוקימנא ארגל שן דלא מכליא קרנא מנלן
The Gemara asks: And now that we have interpreted that the phrase “and he sends forth” is referring to Trampling, from where do we derive that one is liable with regard to acts categorized as Eating in a case where the object damaged is not completely destroyed? The primary category of Eating is derived from the phrase “and it consumed.” The connotation of that phrase is damage in which the object is completely destroyed.


In [10]:
import re

def remove_html_tags(text):
    clean_text = re.sub(r'<.*?>', '', text)
    return clean_text


In [29]:
import os

def get_folders(path):
    folders = []
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            folders.append(item_path)
    return folders

folders = get_folders('/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim')
folders.sort()
folders

['/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Arakhin',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Bekhorot',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Chullin',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Keritot',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Meilah',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Menachot',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Tamid',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Temurah',
 '/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Kodashim/Zevachim']

In [36]:
['Beitzah','Chagigah','Eruvin','Moed Katan','Beitzah','Chagigah','Eruvin','Moed Katan','Pesachim','Rosh Hashanah','Shabbat','Yoma','Gittin','Kiddushin','Nazir','Nedarim','Sotah','Yevamot','Bava Batra','Bava Kamma','Bava Metzia']
complete = ['Megillah','Sukkah','Ketubot','Makkot']
total_he = []
total_en = []
for seder in ['Seder Kodashim','Seder Moed','Seder Nashim','Seder Nezikin','Seder Tahorot']:#,'Seder Zeraim'
    mesechtas = get_folders('/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/{seder}'.format(seder=seder))
    mesechtas.sort()
    for mesechta in mesechtas:
        if mesechta.split('/')[-1] not in complete:
            continue
        print(mesechta)
        with open(os.path.join(mesechta,'English','merged.json'), 'r') as f:
            en = json.load(f)
            
        with open(os.path.join(mesechta,'Hebrew','merged.json'), 'r') as f:
            he = json.load(f)

        en_text = [remove_html_tags(text) for chapter in en['text'] for text in chapter] #if text]
        he_text = [remove_html_tags(text) for chapter in he['text'] for text in chapter] #if text]
        total_en += en_text
        total_he += he_text
        print(len(en_text),len(he_text))
        print(en_text.count(""),he_text.count(""))
        print(len(total_en),len(total_he))
        print('-----------------------')
        print(total_en[-1])
        print(total_he[-1])

# # Combine and preprocess the data
# he_texts = [remove_html_tags(text) for chapter in he['text'] for text in chapter if text]
# # remove all tags from english text <b> </b> and <i> </i>
# en_texts = [remove_html_tags(text) for chapter in en['text'] for text in chapter if text]



data = list(zip(total_he, total_en))

# Load the model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

# Send the model to GPU if available
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)

def preprocess_function(example):
    source_text, target_text = example
    inputs = tokenizer("translate Hebrew to English: " + source_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(target_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
    targets = {k: v.squeeze(0) for k, v in targets.items()}

    return {**inputs, "labels": targets["input_ids"]}



#pip install torch==2.1.0.dev20230328+cu117 -f https://download.pytorch.org/whl/nightly/cu117/torch_nightly.html transformers datasets pandas protobuf==3.20 sentencepiece




/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Moed/Megillah
1213 1213
0 0
1213 1213
-----------------------
May we return to you chapter “The people of the city” and this is the completion of Tractate Megillah.
הדרן עלך בני העיר וסליקא לה מסכת מגילה
/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Moed/Sukkah
1361 1361
0 0
2574 2574
-----------------------
We have now completed the chapter “the flute” and all of Masekhet Sukkah.
הדרן עלך החליל וסליקא לה מסכת סוכה
/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Nashim/Ketubot
3038 3038
0 0
5612 5612
-----------------------
May we return to you, chapter “the judges of decrees” and this is the completion

הדרן עלך שני דייני גזירות וסליקא לה מסכת כתובות
/scratch/zceemsi/DLNLP_assignment_23/Datasets/Sefaria-Export/json/Talmud/Bavli/Seder Nezikin/Makkot
709 709
0 0
6321 6321
-----------------------
We will return to you and we have



In [38]:
# Create dataset
preprocessed_data = [preprocess_function(example) for example in data]

In [39]:
dataset = Dataset.from_dict({k: [d[k] for d in preprocessed_data] for k in preprocessed_data[0]})

# Split dataset into train and validation
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",

)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()


# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_mt5_he_en")
tokenizer.save_pretrained("./finetuned_mt5_he_en")



Epoch,Training Loss,Validation Loss




('./finetuned_mt5_he_en/tokenizer_config.json',
 './finetuned_mt5_he_en/special_tokens_map.json',
 './finetuned_mt5_he_en/spiece.model',
 './finetuned_mt5_he_en/added_tokens.json',
 './finetuned_mt5_he_en/tokenizer.json')

In [40]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_mt5_he_en_new")
tokenizer.save_pretrained("./finetuned_mt5_he_en_new")

('./finetuned_mt5_he_en_new/tokenizer_config.json',
 './finetuned_mt5_he_en_new/special_tokens_map.json',
 './finetuned_mt5_he_en_new/spiece.model',
 './finetuned_mt5_he_en_new/added_tokens.json',
 './finetuned_mt5_he_en_new/tokenizer.json')

In [47]:

# Load the fine-tuned model and tokenizer
finetuned_model = MT5ForConditionalGeneration.from_pretrained("./finetuned_mt5_he_en_whole")
finetuned_tokenizer = AutoTokenizer.from_pretrained("./finetuned_mt5_he_en_whole")

# Generate translation
hebrew_text = "וכי תימא כח דהתירא עדיף ונפלוג בתרוייהו"
inputs = finetuned_tokenizer("translate Hebrew to English: " + hebrew_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
translated_tokens = finetuned_model.generate(**inputs, num_beams=5, max_length=128, early_stopping=True)
translation = finetuned_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
print(translation)

The Gemara asks: But isn’t it taught in a baraita that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi Yoḥanan says that Rabbi


In [24]:
len(total_en)

78778

In [25]:
len(total_he)

79044

In [37]:
data[-1]

('הדרן עלך אלו הן הלוקין וסליקא לה מסכת מכות',
 'We will return to you and we have acquired Tractate Makkos. ')

In [13]:
preprocessed_data

[{'input_ids': tensor([ 37194,    259,  87206,    288,   5413,    267,   6094,   7668,   7771,
             259,  34725,    580,    882,  24713,    259, 219919,  92992,    465,
           27714,   4511,  29862,  44172,  12069,    580,    259,  34445,   7884,
               1,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
   