In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from collections import defaultdict, Counter
import torch.optim as optim
import ast

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
special_char_map = {
    'g': ['g', 'ğ'],
    'u': ['u', 'ü'],
    's': ['s', 'ş'],
    'i': ['i', 'ı'],
    'o': ['o', 'ö'],
    'c': ['c', 'ç'],
}

In [5]:
# Function to convert stringified lists back to actual lists
def string_to_list(string):
    return ast.literal_eval(string)

# Creating a vocabulary
def create_vocab(data):
    vocab = {}
    vocab['<pad>'] = 0  # Reserved word for padding
    vocab['<unk>'] = 1  # Reserved word for unknown words
    idx = 2  # Starting index for other tokens

    for tokens in data:
        for token in tokens:
            if token not in vocab:
                vocab[token] = idx
                idx += 1  # Increment the index for each new token

    return vocab



# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

In [6]:
def append_eos(tokens, vocab):
    tokens.append('<eos>')  # Append <eos> token to each sequence
    return tokens


def prepend_sos(tokens, vocab):
    tokens.insert(0, '<sos>')
    return tokens

In [7]:
# Load data
source_data = pd.read_csv('/content/drive/MyDrive/NLP/source.csv').iloc[:100, :]
cleaned_data = pd.read_csv('/content/drive/MyDrive/NLP/cleaned.csv').iloc[:100, :]

In [8]:
# Assume these columns contain the tokenized data as string representations
source_data['token_list'] = source_data['char_tokens'].apply(string_to_list)
cleaned_data['token_list'] = cleaned_data['char_tokens'].apply(string_to_list)

In [9]:
# Combine all tokens from both datasets
all_tokens = pd.concat([source_data['token_list'], cleaned_data['token_list']])
vocab = create_vocab(all_tokens)

In [10]:
print(f"Vocabulary Size: {len(vocab)}")

Vocabulary Size: 41


In [11]:
# Create inverse vocabulary
inv_vocab = {idx: char for char, idx in vocab.items()}

In [12]:
# Function to convert tokens to indices based on the vocabulary
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

# Convert tokens to indices using the shared vocabulary
source_data['token_indices'] = source_data['token_list'].apply(lambda x: tokens_to_indices(x, vocab))
cleaned_data['token_indices'] = cleaned_data['token_list'].apply(lambda x: tokens_to_indices(x, vocab))

# Convert token indices to PyTorch tensors and pad them
source_sequences = pad_sequence([torch.tensor(seq) for seq in source_data['token_indices']], batch_first=True, padding_value=vocab['<pad>'])
target_sequences = pad_sequence([torch.tensor(seq) for seq in cleaned_data['token_indices']], batch_first=True, padding_value=vocab['<pad>'])

# Output the shape to confirm the padding
print(source_sequences.shape, target_sequences.shape)

torch.Size([100, 330]) torch.Size([100, 330])


In [13]:
# Create TensorDatasets
dataset = TensorDataset(source_sequences, target_sequences)

# Split data into training, validation, and test sets
train_ratio, val_ratio, test_ratio = 0.9, 0.1, 0.0
total_size = len(dataset)
train_size = int(total_size * train_ratio)
val_size = int(total_size * val_ratio)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DiacriticModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(DiacriticModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim * 2, num_layers, batch_first=True)  # *2 for bidirectional
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)  # *2 for bidirectional

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        encoder_output, encoder_hidden = self.encoder(embedded, hidden)
        decoder_hidden = self.transform_hidden(encoder_hidden)
        output, hidden = self.decoder(embedded, decoder_hidden)
        output = self.fc(output)
        return output, hidden

    def transform_hidden(self, encoder_hidden):
        # encoder_hidden contains both the hidden and cell states
        # Each is a tuple (h_n, c_n) of shape [num_layers * num_directions, batch, hidden_size]
        h_n, c_n = encoder_hidden
        # Concatenate the hidden states for the forward and backward layers
        h_n = torch.cat([h_n[0:h_n.size(0):2], h_n[1:h_n.size(0):2]], dim=2)
        c_n = torch.cat([c_n[0:c_n.size(0):2], c_n[1:c_n.size(0):2]], dim=2)
        return (h_n, c_n)

    def init_hidden(self, batch_size):
        # Initialize hidden state with zeros
        # Note: We multiply layer_dim by 2 because of the bidirectional LSTM
        return (torch.zeros(self.num_layers * 2, batch_size, self.hidden_dim).to(device),
                torch.zeros(self.num_layers * 2, batch_size, self.hidden_dim).to(device))


In [15]:
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, total_size - train_size - val_size])

# Create DataLoader for batch processing
batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [16]:
# Example usage
model = DiacriticModel(len(vocab), 256, 512, batch_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
import time
import torch

def train_model(model, train_loader, val_loader, epochs, criterion, optimizer, save_path):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        start_time = time.time()

        for i, (data, targets) in enumerate(train_loader):
            data, targets = data.to(device), targets.to(device)  # Move data to the same device as the model

            optimizer.zero_grad()
            hidden = model.init_hidden(data.size(0))
            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, len(vocab)), targets.view(-1))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if (i + 1) % 100 == 0:
                print(f'Epoch {epoch + 1}, Batch {i + 1}, Average Loss: {running_loss / 100:.4f}')
                running_loss = 0.0

        # Validation step
        model.eval()
        with torch.no_grad():
            total_loss = 0
            total_batches = 0
            for data, targets in val_loader:
                data, targets = data.to(device), targets.to(device)
                hidden = model.init_hidden(data.size(0))
                output, hidden = model(data, hidden)
                loss = criterion(output.view(-1, len(vocab)), targets.view(-1))
                total_loss += loss.item()
                total_batches += 1
            avg_val_loss = total_loss / total_batches
            print(f'Epoch {epoch + 1}, Validation Loss: {avg_val_loss:.4f}')

        end_time = time.time()
        print(f'Epoch {epoch + 1} completed in {(end_time - start_time):.2f} seconds.')

        # Save the model
        torch.save(model.state_dict(), f'{save_path}/model_epoch_{epoch+1}.pth')
        print(f'Model saved to {save_path}/model_epoch_{epoch+1}.pth')

# Example usage
save_path = '/content/drive/MyDrive/NLP'  # Set this to your desired saving path
train_model(model, train_loader, val_loader, 20, criterion, optimizer, save_path)

KeyboardInterrupt: 

In [None]:
def predict_and_compare(model, data_loader, inv_vocab, num_examples=10):
    model.eval()
    data_iter = iter(data_loader)
    data, targets = next(data_iter)
    data, targets = data.to(device), targets.to(device)

    # We only use the first 'num_examples' from the batch for demonstration
    data = data[:num_examples]
    targets = targets[:num_examples]

    with torch.no_grad():
        hidden = model.init_hidden(data.size(0))
        output, _ = model(data, hidden)
        predictions = output.argmax(dim=2).cpu().numpy()

    # Convert indices to characters using the inverse vocabulary
    predictions = [''.join([inv_vocab[idx] for idx in pred]) for pred in predictions]
    targets = [''.join([inv_vocab[idx] for idx in target]) for target in targets.cpu().numpy()]
    input_sequences = [''.join([inv_vocab[idx.item()] for idx in sequence]) for sequence in data]

    for i in range(num_examples):
        print(f"Input: {input_sequences[i]}")
        print(f"Predicted: {predictions[i]}")
        print(f"Actual: {targets[i]}")
        print("----" * 10)

predict_and_compare(model, train_loader, inv_vocab, num_examples=5)

Input:  peki erken rezervasyon yapan tatilcinin cuzdaninda ne kadar kalacak <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

IndexError: list index out of range