In [1]:
!pip install sacremoses



In [2]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the homophone-error corpus into a DataFrame.
# Point csv_path at your local copy of the CSV before running this cell.
csv_path = '../data/gutenberg-homophone-errors.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)


In [3]:
# Peek at the first five rows to confirm the expected columns are present.
data.head(n=5)

Unnamed: 0,sentences,has_homophone,is_error,error_idx,error,correct_word,correct_sentence
0,﻿the project gutenberg ebook of frankenstein; ...,True,True,33.0,know,no,﻿the project gutenberg ebook of frankenstein; ...
1,"you may copy it, give it away or re-use it und...",True,False,,,,"you may copy it, give it away or re-use it und..."
2,"if you are not located in the united states,yo...",True,True,18.0,wear,where,"if you are not located in the united states,yo..."
3,"petersburgh, dec. 11th, 17—.you will rejoice t...",True,True,7.0,here,hear,"petersburgh, dec. 11th, 17—.you will rejoice t..."
4,"i arrived here yesterday, and my first task is...",True,False,,,,"i arrived here yesterday, and my first task is..."


In [4]:
# Function to mask homophones
def mask_homophones(row, mask_token='<mask>'):
    """Replace the erroneous homophone in a row's sentence with ``mask_token``.

    The sentence is split on whitespace and ``row['error_idx']`` is treated as
    a zero-based index into the resulting word list.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``'sentences'``, ``'is_error'`` and ``'error_idx'``.
        mask_token: Text inserted in place of the erroneous word. The default
            is the literal string ``'<mask>'``; pass ``tokenizer.mask_token``
            when the result must be recognised as a mask by a specific
            tokenizer.

    Returns:
        The sentence with the word at ``error_idx`` replaced (words re-joined
        with single spaces), or the original ``row['sentences']`` value
        unchanged when the row is not an error, the sentence is not a string,
        or the index is missing, negative, or past the last word.
    """
    sentence = row['sentences']

    # Missing sentences are read from CSV as NaN (a float); there is nothing to split.
    if not isinstance(sentence, str):
        return sentence
    if not row['is_error']:
        return sentence

    error_idx = row['error_idx']
    words = sentence.split()

    # A NaN index fails both comparisons, so it is rejected here as well.
    # The lower bound stops a negative index from masking a word counted from the end.
    if error_idx is None or not (0 <= error_idx < len(words)):
        return sentence

    words[int(error_idx)] = mask_token
    return ' '.join(words)

# Load the tokenizer first so the masking step can use the model's own mask token.
tokenizer = AutoTokenizer.from_pretrained("xlm-mlm-en-2048")

# Mask the erroneous homophone in every error row. The tokenizer's mask token is
# passed explicitly: for XLM checkpoints it is `<special1>`, so the literal
# '<mask>' default would be tokenized as ordinary text instead of as a mask.
data['masked_sentences'] = data.apply(
    mask_homophones, axis=1, mask_token=tokenizer.mask_token
)

# Split 80/10/10 into train/validation/test: hold out 20%, then halve the hold-out.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

# Tokenize the sentences
train_encodings = tokenizer(train_data['masked_sentences'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_data['masked_sentences'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_data['masked_sentences'].tolist(), truncation=True, padding=True)

# Data collator for dynamic masking
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)


In [7]:
import torch

class HomophoneDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized sentences.

    Args:
        encodings: Mapping from field name (it must include ``'input_ids'``)
            to a per-example sequence of values. Both a tokenizer's batch
            output and a plain ``dict`` of lists are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per encoding field."""
        return {
            field: torch.tensor(values[idx])
            for field, values in self.encodings.items()
        }

    def __len__(self):
        """Return the number of examples, taken from the ``'input_ids'`` field."""
        # Key lookup (not attribute access) so a plain dict works as well as
        # the tokenizer's mapping-like batch output.
        return len(self.encodings['input_ids'])

# Wrap the train and validation encodings in HomophoneDataset instances.
train_dataset, val_dataset = (
    HomophoneDataset(encodings)
    for encodings in (train_encodings, val_encodings)
)


In [8]:
from transformers import AutoModelForMaskedLM, Trainer, TrainingArguments

# Pretrained XLM checkpoint with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained("xlm-mlm-en-2048")

# Hyperparameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # Output and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Schedule and regularisation
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# Tie the model, arguments, datasets, collator and tokenizer together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)


loading configuration file config.json from cache at C:\Users\miran/.cache\huggingface\hub\models--xlm-mlm-en-2048\snapshots\6eb6401a142611ae90f3d6bc606b97384f1c9961\config.json
Model config XLMConfig {
  "_name_or_path": "xlm-mlm-en-2048",
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "bos_token_id": 0,
  "causal": false,
  "dropout": 0.1,
  "emb_dim": 2048,
  "embed_init_std": 0.02209708691207961,
  "end_n_top": 5,
  "eos_index": 1,
  "gelu_activation": true,
  "init_std": 0.02,
  "is_encoder": true,
  "lang_id": 0,
  "layer_norm_eps": 1e-12,
  "mask_index": 5,
  "mask_token_id": 0,
  "max_position_embeddings": 512,
  "model_type": "xlm",
  "n_heads": 16,
  "n_langs": 1,
  "n_layers": 12,
  "pad_index": 2,
  "pad_token_id": 2,
  "sinusoidal_embeddings": false,
  "start_n_top": 5,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "first",
  "summary_use_

In [None]:
# Train the model
# Runs fine-tuning with the `trainer` object built in an earlier cell; the
# call's return value is the last expression, so it is shown as the cell output.
trainer.train()


***** Running training *****
  Num examples = 36476
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6840
  Number of trainable parameters = 667119041


In [None]:
import torch

def evaluate_model(model, tokenizer, data):
    """Measure how often the model restores the correct word at a masked homophone.

    For each row, the erroneous word is replaced by the tokenizer's mask token
    (via ``mask_homophones``), the sentence is run through the model, and the
    highest-scoring vocabulary entry at the mask position is compared with
    ``row['correct_word']``. Rows that contain no mask token after
    tokenization are skipped: these are non-error rows, rows with a missing or
    out-of-range index, and rows whose mask was cut off by truncation.

    Args:
        model: Masked-LM model whose forward output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``; it must define ``mask_token``
            and ``mask_token_id``.
        data: DataFrame with the columns read by ``mask_homophones`` plus
            ``'correct_word'``.

    Returns:
        The fraction of scored rows whose prediction equals ``correct_word``,
        or ``0.0`` when no row could be scored.
    """
    model.eval()  # Set the model to evaluation mode
    mask_token = tokenizer.mask_token
    mask_token_id = tokenizer.mask_token_id
    # Inputs must sit on the same device as the weights (e.g. the GPU after training).
    device = next(model.parameters()).device
    total, correct = 0, 0

    # No gradients are needed for scoring; skipping them saves memory and time.
    with torch.no_grad():
        for _, row in data.iterrows():
            masked_sentence = mask_homophones(row, mask_token=mask_token)
            if not isinstance(masked_sentence, str):
                continue

            inputs = tokenizer(masked_sentence, return_tensors='pt', truncation=True)

            # Locate the mask by token id. Whitespace positions in the decoded
            # output do not line up with the original words once the tokenizer
            # adds special tokens and splits words into subword pieces.
            mask_positions = (inputs['input_ids'][0] == mask_token_id).nonzero(as_tuple=True)[0]
            if mask_positions.numel() == 0:
                continue
            mask_position = int(mask_positions[0])

            logits = model(**inputs.to(device)).logits
            predicted_id = int(logits[0, mask_position].argmax(dim=-1))
            predicted_token = tokenizer.decode([predicted_id]).strip()

            if predicted_token == row['correct_word']:
                correct += 1
            total += 1

    # Avoid dividing by zero when there was nothing to score.
    return correct / total if total else 0.0

# Score the in-memory model on the held-out test split and report the result.
accuracy = evaluate_model(model=model, tokenizer=tokenizer, data=test_data)
print(f"Accuracy: {accuracy}")


In [None]:
# Write the trained model to ./trained_model so a later cell can reload it.
model.save_pretrained(save_directory='./trained_model')

In [None]:
# Rebind `model` to the model reloaded from ./trained_model.
model = AutoModelForMaskedLM.from_pretrained('./trained_model')

# Run the same test-split evaluation on the reloaded model.
accuracy = evaluate_model(model=model, tokenizer=tokenizer, data=test_data)
print(f"Accuracy: {accuracy}")
