In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import pickle

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Training pairs: the "wrong" column holds misspellings, "correct" the targets.
file_path = "./drive/MyDrive/Colab Notebooks/base_turkish_misspellings.csv"
data = pd.read_csv(file_path)
wrong_words, correct_words = data["wrong"].values, data["correct"].values

# Vocabulary: every character seen in either column plus the end-of-sequence
# marker. Indices start at 1 because 0 is used as the padding value by
# encode_word and as padding_idx by the embedding layer.
EOS_TOKEN = "<EOS>"
all_chars = set("".join(wrong_words)).union("".join(correct_words), [EOS_TOKEN])
char_to_idx = {char: idx for idx, char in enumerate(sorted(all_chars), start=1)}
idx_to_char = dict(zip(char_to_idx.values(), char_to_idx.keys()))

# Model / training hyper-parameters. The "+ 1" accounts for the padding index 0,
# which is not an entry of char_to_idx.
hidden_dim = 100
batch_size = 32
epochs = 100
input_dim = output_dim = len(char_to_idx) + 1

# Longest word in either column, plus one slot for the EOS marker.
max_length = 1 + max(max(map(len, wrong_words)), max(map(len, correct_words)))

def encode_word(word, char_to_idx, max_length, eos_token=None):
    """Turn a word into a fixed-width list of character indices.

    The result is the index of every character of ``word``, followed by the
    index of the end-of-sequence token, right-padded with 0 up to
    ``max_length`` entries.

    Args:
        word: String to encode.
        char_to_idx: Mapping from character (and the EOS token) to its index.
        max_length: Minimum length of the returned list.
        eos_token: End-of-sequence token to append. Defaults to the
            module-level ``EOS_TOKEN`` when omitted, which keeps existing
            three-argument calls working unchanged.

    Returns:
        A list of ints. When ``len(word) + 1`` exceeds ``max_length`` the word
        is NOT truncated; the list is simply longer than ``max_length`` and
        carries no padding.

    Raises:
        KeyError: If a character of ``word`` or ``eos_token`` is missing from
            ``char_to_idx``.
    """
    if eos_token is None:
        # Resolved at call time so the default follows the notebook's global.
        eos_token = EOS_TOKEN

    encoded = [char_to_idx[char] for char in word]
    encoded.append(char_to_idx[eos_token])

    # A negative difference means the word already overflows max_length.
    pad_count = max(max_length - len(encoded), 0)
    return encoded + [0] * pad_count

def decode_sequence(sequence, idx_to_char, eos_token=None):
    """Convert a sequence of character indices back into a string.

    Index 0 (padding) is skipped wherever it appears, and decoding stops at
    the first index that maps to the end-of-sequence token.

    Args:
        sequence: Iterable of integer-like indices (Python ints, numpy
            integers, or the elements of an integer torch tensor).
        idx_to_char: Mapping from int index to character / EOS token.
        eos_token: Token that terminates decoding. Defaults to the
            module-level ``EOS_TOKEN`` when omitted, which keeps existing
            two-argument calls working unchanged.

    Returns:
        The decoded string, without padding and without the EOS token.

    Raises:
        KeyError: If a non-zero index is missing from ``idx_to_char``.
    """
    if eos_token is None:
        # Resolved at call time so the default follows the notebook's global.
        eos_token = EOS_TOKEN

    result = []
    for raw_idx in sequence:
        # Normalise to a plain int: elements of a torch tensor hash by
        # identity, so using them directly as dict keys would never match.
        idx = int(raw_idx)
        if idx == 0:
            continue
        char = idx_to_char[idx]
        if char == eos_token:
            break
        result.append(char)
    return "".join(result)

class SpellCorrectorDataset(Dataset):
    """Paired (misspelled, correct) words, pre-encoded as index tensors.

    Both word collections are encoded once at construction time with
    ``encode_word`` using the module-level ``char_to_idx`` and ``max_length``.
    """

    def __init__(self, wrong_words, correct_words):
        def as_index_tensor(words):
            # One encoded row per word, stacked into a single LongTensor.
            rows = [encode_word(word, char_to_idx, max_length) for word in words]
            return torch.LongTensor(rows)

        self.wrong_words = as_index_tensor(wrong_words)
        self.correct_words = as_index_tensor(correct_words)

    def __len__(self):
        """Number of examples, taken from the misspelled-word tensor."""
        return self.wrong_words.shape[0]

    def __getitem__(self, idx):
        """Return the ``(wrong, correct)`` tensors stored at position ``idx``."""
        source = self.wrong_words[idx]
        target = self.correct_words[idx]
        return source, target

class AttentionLayer(nn.Module):
    """Additive attention: scores are ``V(tanh(W1(encoder) + W2(decoder)))``.

    The scores are normalised with a softmax over dimension 1 and used to
    form a weighted sum of ``encoder_output`` along that same dimension.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Two projections of size hidden_dim -> hidden_dim, then a scalar score.
        self.W1 = nn.Linear(hidden_dim, hidden_dim)
        self.W2 = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, encoder_output, decoder_output):
        """Compute the context vector and the attention weights.

        Args:
            encoder_output: Tensor whose last dimension is ``hidden_dim``;
                SpellCorrectorModel passes a (batch, seq, hidden_dim) tensor.
            decoder_output: Tensor that can be added to the projected
                ``encoder_output`` after its own projection.

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            ``encoder_output`` over dimension 1, and the softmax weights
            (last dimension of size 1).
        """
        combined = self.W1(encoder_output) + self.W2(decoder_output)
        score = self.V(torch.tanh(combined))
        attention_weights = torch.softmax(score, dim=1)
        # Broadcast the per-position weight over the feature axis, then reduce.
        context_vector = (attention_weights * encoder_output).sum(dim=1)
        return context_vector, attention_weights

class SpellCorrectorModel(nn.Module):
    """Character tagger: embedding -> two stacked BiLSTMs -> attention -> linear.

    Every input position gets a log-probability distribution over
    ``output_dim`` classes. Index 0 is the embedding's padding index.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Each bidirectional LSTM emits forward + backward states concatenated.
        bi_width = hidden_dim * 2
        self.embedding = nn.Embedding(input_dim, hidden_dim, padding_idx=0)
        self.bilstm1 = nn.LSTM(hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.bilstm2 = nn.LSTM(bi_width, hidden_dim, bidirectional=True, batch_first=True)
        self.attention = AttentionLayer(bi_width)
        self.dropout = nn.Dropout(0.3)
        # The classifier sees the encoder state concatenated with the context.
        self.fc = nn.Linear(bi_width * 2, output_dim)

    def forward(self, x):
        """Map a (batch, seq) tensor of character indices to log-probabilities.

        Returns:
            Tensor of shape (batch, seq, output_dim) holding the
            ``log_softmax`` over the last dimension.
        """
        embedded = self.embedding(x)
        first_pass, _ = self.bilstm1(embedded)
        second_pass, _ = self.bilstm2(self.dropout(first_pass))
        encoder_output = self.dropout(second_pass)

        # Self-attention: the encoder states act as both keys and queries.
        context_vector, attention_weights = self.attention(encoder_output, encoder_output)
        # Copy the single context vector to every time step.
        seq_len = encoder_output.size(1)
        tiled_context = context_vector.unsqueeze(1).repeat(1, seq_len, 1)

        features = torch.cat([encoder_output, tiled_context], dim=-1)
        return torch.log_softmax(self.fc(features), dim=-1)

model = SpellCorrectorModel(input_dim, hidden_dim, output_dim).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
# SpellCorrectorModel.forward already returns log-probabilities (log_softmax),
# so the matching criterion is NLLLoss. CrossEntropyLoss expects raw logits and
# would apply log-softmax a second time.
# Note: padding targets (index 0) are still included in the loss.
criterion = nn.NLLLoss()

dataset = SpellCorrectorDataset(wrong_words, correct_words)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

def train():
    """Train the module-level ``model`` for ``epochs`` passes over ``train_loader``.

    Uses the module-level ``optimizer``, ``criterion``, ``device`` and
    ``output_dim``. After every epoch the mean per-batch loss is printed.
    The model is switched to training mode once, before the first epoch.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for wrong, correct in train_loader:
            wrong = wrong.to(device)
            correct = correct.to(device)

            optimizer.zero_grad()
            predictions = model(wrong)

            # Flatten (batch, seq, classes) -> (batch*seq, classes) and
            # (batch, seq) -> (batch*seq,) so each position is one sample.
            flat_predictions = predictions.view(-1, output_dim)
            flat_targets = correct.view(-1)
            loss = criterion(flat_predictions, flat_targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        mean_loss = running_loss / len(train_loader)
        print(f'Epoch: {epoch_number}, Loss: {mean_loss:.4f}')

def correct_word(word):
    """Run the module-level ``model`` on one word and decode its prediction.

    The word is encoded with ``encode_word`` (module-level ``char_to_idx`` and
    ``max_length``), passed through the model as a batch of one, and the
    arg-max index at each position is decoded with ``decode_sequence``.

    Side effect: the model is put into evaluation mode and is not switched
    back to training mode afterwards.

    Args:
        word: The (possibly misspelled) input string.

    Returns:
        The decoded string predicted by the model.

    Raises:
        KeyError: If ``word`` contains a character missing from ``char_to_idx``.
    """
    model.eval()
    with torch.no_grad():
        encoded = torch.LongTensor([encode_word(word, char_to_idx, max_length)]).to(device)
        output = model(encoded)
        # Select the single batch row explicitly. A bare .squeeze() drops every
        # size-1 axis, so a length-1 sequence would collapse to a 0-d array
        # that cannot be iterated. tolist() also yields plain Python ints,
        # which are safe dictionary keys for idx_to_char.
        predicted_indices = torch.argmax(output, dim=-1)[0].cpu().tolist()
        return decode_sequence(predicted_indices, idx_to_char)

In [None]:
import json

# Everything needed to rebuild the encoder/decoder and size the model later.
# JSON object keys are always strings, so the integer keys of idx_to_char are
# written as strings; convert them back with int() after loading the file.
parameters = dict(
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
    max_length=max_length,
    eos_token=EOS_TOKEN,
    hidden_dim=hidden_dim,
)

# Written to the current working directory, pretty-printed with 4-space indent.
with open("params.json", "w") as f:
    json.dump(parameters, f, indent=4)
