# Creating the dataset

In [4]:
import pandas as pd
import random
# !pip install datasets
import datasets as ds
from datasets import Dataset

def random_replace(string, default_prob):
    """Return a copy of ``string`` with random Hebrew letter substitutions.

    The text is scanned left to right. At every position the replacement
    table is consulted, longer keys first, so the three-character entries
    ('לא ' / 'לו ') are considered before the single-letter ones. For a
    matching key the candidates are tried in order and the first one whose
    random draw succeeds is used; the scan then continues after the replaced
    span, so replaced text is never corrupted a second time.

    Args:
        string: Text to corrupt.
        default_prob: Base probability (0..1) of applying a substitution.
            Some letter pairs use a fraction of it (see the table below).

    Returns:
        The (possibly) modified text. An empty input yields an empty string.
    """
    replacements = {
        'א': [('ע', default_prob), ('ה', default_prob)],
        'ע': [('א', default_prob), ('ה', default_prob)],
        'ה': [('א', default_prob), ('ע', default_prob)],

        'ט': [('ת', default_prob)],
        'ת': [('ט', default_prob)],

        'ח': [('כ', default_prob)],
        'כ': [('ח', default_prob), ('ק', default_prob)],
        'ק': [('כ', default_prob)],

        # These pairs are swapped less often than the groups above.
        'ש': [('ס', default_prob / 2)],
        'ס': [('ש', default_prob / 2)],

        'ב': [('ו', default_prob / 4)],
        'ו': [('ב', default_prob / 4)],

        # Multi-character entries; the trailing space is part of the key.
        'לא ': [('לו ', default_prob)],
        'לו ': [('לא ', default_prob)],
    }

    # Longer keys first so that a multi-character entry gets its chance
    # before the single letters it contains.
    keys_by_length = sorted(replacements, key=len, reverse=True)

    pieces = []
    pos = 0
    while pos < len(string):
        piece = string[pos]
        step = 1
        for key in keys_by_length:
            if not string.startswith(key, pos):
                continue
            chosen = None
            for replacement, prob in replacements[key]:
                if random.random() < prob:  # Independent draw per candidate
                    chosen = replacement
                    break  # Stop after the first successful candidate
            if chosen is not None:
                piece = chosen
                step = len(key)
                break
            # No candidate fired for this key: fall through to shorter keys.
        pieces.append(piece)
        pos += step
    return ''.join(pieces)


def create_augmentations(percentage=30, verbose=True):
    """Create an Excel file of (original, corrupted) text pairs.

    Reads ``datasets/hebrew_text.txt``, strips every line, corrupts it with
    ``random_replace`` and writes the pairs to
    ``datasets/hebrew_text_aug_<percentage>.xlsx`` with the columns
    ``original`` and ``errors``.

    Args:
        percentage: Base substitution probability, in percent, passed to
            ``random_replace`` as ``percentage / 100``.
        verbose: When true, print the second processed line as an example
            (skipped if the input has fewer than two lines).

    Returns:
        Path of the Excel file that was written.
    """
    default_prob = float(percentage) / 100
    input_txt_path = 'datasets/hebrew_text.txt'
    output_path = 'datasets/hebrew_text_aug_' + str(percentage)

    # Read the input TXT file
    with open(input_txt_path, 'r', encoding='utf-8') as infile:
        originals = [line.strip() for line in infile]

    # Keep the pairs as tuples rather than tab-joined strings, so a tab or an
    # empty line in the input cannot change the number of columns later on.
    pairs = [
        (original, random_replace(original, default_prob))
        for original in originals
    ]

    if verbose and len(pairs) > 1:
        example_original, example_modified = pairs[1]
        print('-----------> Example:\n\n')
        print(f"{example_original}\t{example_modified}")
        print('<-----------= Example:\n\n')

    # Save data in txt format - uncomment to activate
    # output_txt_path = output_path + '.txt'
    # with open(output_txt_path, 'w', encoding='utf-8') as outfile:
    #     outfile.write('\n'.join(f"{o}\t{m}" for o, m in pairs))
    #
    # print(f"Modified data saved to {output_txt_path}")

    print('\nExporting the data to Excel file')

    # The first line of the input is not exported.
    # NOTE(review): presumably it is a header/title line - confirm.
    df = pd.DataFrame(pairs[1:], columns=['original', 'errors'])
    excel_output_path = output_path + '.xlsx'
    df.to_excel(excel_output_path, index=False, engine='openpyxl')

    print(f"Conversion complete. Check {excel_output_path}")
    return excel_output_path


def export_dataset(excel_path):
    """Load the augmentation spreadsheet into a ``datasets.Dataset``.

    Rows with a missing value in either the ``errors`` or the ``original``
    column are discarded.

    Args:
        excel_path: Path of an Excel file with ``errors`` and ``original``
            columns.

    Returns:
        A ``Dataset`` with two columns: ``errors`` and ``original``.
    """
    frame = pd.read_excel(excel_path).dropna(subset=['errors', 'original'])

    return Dataset.from_dict({
        'errors': frame['errors'].tolist(),
        'original': frame['original'].tolist(),
    })


def full_run(percentage=30, verbose=True):
    """Generate the augmented Excel file, then load it as a dataset.

    Args:
        percentage: Forwarded to ``create_augmentations``.
        verbose: Forwarded to ``create_augmentations``.

    Returns:
        Whatever ``export_dataset`` returns for the generated file.
    """
    excel_path = create_augmentations(percentage, verbose)
    return export_dataset(excel_path)




In [2]:
import pandas as pd
from datasets import Dataset
import tensorflow as tf
# from create_augmentations import *
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
# from datasets import load_from_disk
import os

import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Preparing the dataset
# -------------------------------------->
# Load the already-generated spreadsheet; switch to the commented-out
# full_run() call below to regenerate the augmentations first.
# dataset = full_run()
dataset = export_dataset('datasets/hebrew_text_aug_30.xlsx')

# Tokenizer of the multilingual cased BERT checkpoint; it is used by
# tokenize_function for both the 'errors' and the 'original' column.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


def tokenize_function(examples):
    """Tokenize a batch: ``errors`` become inputs, ``original`` the labels.

    Both columns are padded to the tokenizer's maximum length and truncated.

    Args:
        examples: Batch mapping with ``errors`` and ``original`` entries.

    Returns:
        The tokenizer output for ``errors`` with an added ``labels`` entry
        holding the ``input_ids`` of the tokenized ``original`` text.
    """
    encoded = tokenizer(examples['errors'], padding="max_length", truncation=True)
    target_ids = tokenizer(
        examples['original'], padding="max_length", truncation=True
    )['input_ids']
    encoded['labels'] = target_ids
    return encoded


# `load_from_disk` is needed for the cached branch below; its import in the
# imports cell is commented out, so bring it into scope here.
from datasets import load_from_disk

# Tokenizing is slow, so the result is cached on disk and reused when present.
tokenized_dataset_path = 'datasets/tokenized_dataset'
if os.path.exists(tokenized_dataset_path):
    tokenized_datasets = load_from_disk(tokenized_dataset_path)
else:
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    # Saving the value, so we won't need to create it each time
    tokenized_datasets.save_to_disk(tokenized_dataset_path)

# The raw text columns are dropped here (after caching), so the saved copy
# still contains them; only the tokenizer outputs and labels remain.
tokenized_datasets = tokenized_datasets.remove_columns(['errors', 'original'])
tokenized_datasets.set_format('tensorflow')

# Convert to TensorFlow dataset
def dataset_to_tf(dataset):
    """Turn a tokenized dataset into a ``tf.data.Dataset`` of (features, labels).

    Every column of ``dataset`` is converted to an int64 tensor. The feature
    dictionary contains all columns (including ``labels``), and the ``labels``
    tensor is also used as the second element of each pair.

    Args:
        dataset: Object exposing ``column_names`` and column access by name;
            it must have a ``labels`` column.

    Returns:
        ``tf.data.Dataset`` built with ``from_tensor_slices``.
    """
    tensors = {}
    for column in dataset.column_names:
        tensors[column] = tf.convert_to_tensor(dataset[column], dtype=tf.int64)
    labels = tensors['labels']
    return tf.data.Dataset.from_tensor_slices((tensors, labels))


# Build the tf.data.Dataset of (features, labels) pairs from the tokenized data.
tf_train_dataset = dataset_to_tf(tokenized_datasets)

# We will now convert our dataset to torch format:
# x =



In [None]:

# Model
# -------------------------------------->
# model = TFBertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
#
# # Prepare dataset for training
# BATCH_SIZE = 16
# SHUFFLE_BUFFER_SIZE = 1000
#
# tf_train_dataset = tf_train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
#
# # Compile the model
# optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
#
# # Train the model
# model.fit(tf_train_dataset, epochs=3)
#
# # Assume you have a separate test dataset loaded and processed similarly
# tf_test_dataset = dataset_to_tf(tokenized_test_datasets)
# tf_test_dataset = tf_test_dataset.batch(BATCH_SIZE)
#
# # Evaluate the model
# results = model.evaluate(tf_test_dataset)
# print(results)
#
# def correct_spelling(text):
#     inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
#     outputs = model(inputs)
#     predictions = tf.argmax(outputs.logits, axis=-1)
#     return tokenizer.decode(predictions[0], skip_special_tokens=True)
#
# corrected_text = correct_spelling("text_with_spelling_mistakes")
# print(corrected_text)
#
