In [172]:
import numpy as np
import pandas as pd
import os
import string
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AdamW, BertModel, AutoConfig, AutoTokenizer
from torch.optim import Adam
import torch.nn as nn
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
import time;
import datetime
from sklearn.model_selection import train_test_split
from torchcrf import CRF

In [173]:
# !pip install pytorch-crf
# !pip install unidecode
# !pip install --upgrade transformers
# !pip install --upgrade torch

In [174]:
if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [175]:
# nlp_train=pd.read_csv("/kaggle/input/nlp-project-train/train.csv", index_col=[0])
# nlp_test=pd.read_csv("/kaggle/input/nlp-project-train/test.csv",index_col=[0],encoding="windows-1252") 

nlp_train=pd.read_csv("train.csv", index_col=[0])
nlp_test=pd.read_csv("test.csv",index_col=[0],encoding="windows-1252") 
nlp_train = nlp_train[:100]

## Functions

In [176]:
def convert_to_ascii(sentence):
    text = unidecode(sentence)
    return text

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
        text = text.replace('  ', ' ')
    return text.strip()

def split_sentences(sentences, max_length=100):
    punctuations = {'.', ',', ';', ':'}
    results = []

    for sentence in sentences:
        while len(sentence) > max_length:
            # find last punctuation before max_length
            split_pos = -1
            for p in punctuations:
                pos = sentence.rfind(p, 0, max_length + 1)
                if pos > split_pos:
                    split_pos = pos
            
            # If no punctuation found, split at the last space before max_length
            if split_pos == -1:
                split_pos = sentence.rfind(' ', 0, max_length + 1)
            
            # If no space found, just split at max_length
            if split_pos == -1:
                split_pos = max_length
            
            # Append the split segment to results
            results.append(sentence[:split_pos + 1].strip())
            # Move the rest of the sentence forward
            sentence = sentence[split_pos + 1:].strip()
        
        # Append the remainder of the sentence if it's not empty
        if sentence:
            results.append(sentence)
    
    return np.array(results)


def encode_word(sentence):
    words = sentence.split()
    encoded_sentence = []
    for word in words:
        encoded_word = []
        for char in word:
            # print(char)
            if char in "ıöüğşç":
                encoded_word.append(2)
            else:
                encoded_word.append(1)
            # print(encoded_word)
        encoded_sentence.append(encoded_word)
    return encoded_sentence



def character_tokenizer(text):
    return [c if c != ' ' else '[SPACE]' for c in text]

def padding(text, filling_char, max_length):
    if type(text) is str:
        text = text + filling_char * (max_length - len(text))
    elif type(text) is list:
        text = text + [filling_char] * (max_length - len(text))
    return text

***Preprocess***

In [177]:
# Raw sentences
raw_sentences = nlp_train["Sentence"].values
# Split sentences
sentences = split_sentences(raw_sentences, max_length=200)
# Remove punctuations
sentences = [remove_punctuations(s).lower() for s in sentences]
nlp_train["Label"] = nlp_train["Sentence"]
# Apply convert to ascii to y_train
asci_sentences = [convert_to_ascii(s) for s in sentences]

sentences_diacritics = sentences.copy()
for sentence_index in range(len(sentences_diacritics)):
    sentence = sentences_diacritics[sentence_index]
    new_sentence = encode_word(sentence)
    sentences_diacritics[sentence_index] = new_sentence

***Split sentences to lists of characters***

In [178]:
# sentences_splitted = [character_tokenizer(s) for s in sentences]

# asci_sentences_splitted = [character_tokenizer(s) for s in asci_sentences]

In [179]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

In [180]:
indices=tokenizer.batch_encode_plus(sentences,max_length=200,add_special_tokens=True, return_attention_mask=True,padding=True,truncation=True)
input_ids=indices["input_ids"]
attention_masks=indices["attention_mask"]
print(input_ids[0])
print(raw_sentences[0])
print(attention_masks[0])

[2, 3825, 8725, 1992, 2416, 4456, 24513, 2525, 5292, 5953, 4165, 1996, 5538, 27202, 2293, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sınıf , havuz ve açık deniz çalışmalarıyla , tüm dünyada geçerli , başarılı bir standart oluşturmuştur . 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Paddings

In [181]:
for i in range(len(sentences_diacritics)):
    for j in range(len(sentences_diacritics[i])):
        sentences_diacritics[i][j] = padding(sentences_diacritics[i][j], 0, 30)

## Create dictionary to store inputs and labels

In [182]:
# Create dictionary to store inputs and labels
inputs = {
    "input_ids": input_ids,
    "attention_mask": attention_masks,
    "labels": sentences_diacritics
}

In [183]:
print(inputs["labels"][0])

[[1, 2, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 2, 1, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [184]:
def flatten_and_pad_labels(labels, pad_value=0, max_length=None):
    # If max_length is not specified, find the maximum length of any label sequence in the dataset
    if max_length is None:
        max_length = max(len(label) for sentence_labels in labels for label in sentence_labels)
    
    padded_labels = []
    for sentence_labels in labels:
        flat_labels = [label for word_labels in sentence_labels for label in word_labels]
        # Pad the flattened label sequence if it's shorter than max_length
        if len(flat_labels) < max_length:
            flat_labels += [pad_value] * (max_length - len(flat_labels))
        padded_labels.append(flat_labels[:max_length])  # Ensure the length matches max_length exactly
    return padded_labels

# Calculate the maximum sequence length from your input_ids to ensure label alignment
max_seq_length = max(len(seq) for seq in input_ids)

# Flatten and pad the labels
flat_padded_labels = flatten_and_pad_labels(inputs["labels"], pad_value=0, max_length=max_seq_length)

# Convert labels to tensor
labels_tensor = torch.tensor(flat_padded_labels, dtype=torch.long)


In [185]:
# Convert the input data and labels into torch tensors
input_ids = torch.tensor(inputs["input_ids"])
attention_masks = torch.tensor(inputs["attention_mask"])
labels = labels_tensor

# Create TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split dataset into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Define DataLoaders
batch_size = 32  # you can adjust this size depending on your GPU capacity

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)


In [186]:
class DiacritizationModel(nn.Module):
    def __init__(self, num_labels):
        super(DiacritizationModel, self).__init__()
        self.bert = BertModel.from_pretrained("dbmdz/bert-base-turkish-cased")
        self.lstm = nn.LSTM(input_size=768, hidden_size=256, num_layers=2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(512, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        lstm_output, _ = self.lstm(sequence_output)
        logits = self.fc(lstm_output)
        
        if labels is not None:
            # Only return the loss if labels are provided
            loss = -self.crf(logits, labels, mask=attention_mask.byte())
            return loss
        else:
            # Return logits for decoding
            return logits


# Instantiate the model
model = DiacritizationModel(num_labels=3)
model.to(device)

DiacritizationModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [187]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 10

# Total number of training steps is the number of batches * number of epochs.
total_steps = len(train_dataloader) * num_epochs

# Set up the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)




In [188]:
def train(model, train_dataloader, validation_dataloader, epochs, optimizer, scheduler, device):
    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0

        for batch in train_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Clear previously calculated gradients
            model.zero_grad()

            # Forward pass (calculate current loss)
            loss = model(b_input_ids, b_input_mask, b_labels)

            # Accumulate the training loss over all batches
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient
            optimizer.step()

            # Update the learning rate
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)
        loss_values.append(avg_train_loss)

        print(f'Epoch {epoch + 1}/{epochs} | Loss: {avg_train_loss}')

        # Validation
        model.eval()
        eval_loss = 0
        eval_accuracy = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                # Forward pass, calculate logit predictions
                loss = model(b_input_ids, b_input_mask, b_labels)

            # Accumulate validation loss
            eval_loss += loss.item()

            nb_eval_steps += 1

        print(f'Validation Loss: {eval_loss / nb_eval_steps}')

    return loss_values



# Call the training function
loss_values = train(model, train_dataloader, validation_dataloader, num_epochs, optimizer, scheduler, device)

Epoch 1/10 | Loss: 508.00115966796875
Validation Loss: 194.9593963623047
Epoch 2/10 | Loss: 433.62608846028644
Validation Loss: 170.79214477539062
Epoch 3/10 | Loss: 385.9591471354167
Validation Loss: 154.65484619140625
Epoch 4/10 | Loss: 353.9159240722656
Validation Loss: 146.4486846923828
Epoch 5/10 | Loss: 335.6189676920573
Validation Loss: 141.2999725341797
Epoch 6/10 | Loss: 321.75197347005206
Validation Loss: 136.6573028564453
Epoch 7/10 | Loss: 309.0523376464844
Validation Loss: 132.35865783691406
Epoch 8/10 | Loss: 298.2560221354167
Validation Loss: 128.31820678710938
Epoch 9/10 | Loss: 289.38800048828125
Validation Loss: 125.27009582519531
Epoch 10/10 | Loss: 284.2415059407552
Validation Loss: 124.18313598632812


In [189]:
def preprocess_sentence(sentence):
    # Remove punctuations and convert to lowercase
    sentence = remove_punctuations(sentence).lower()
    # Convert to ASCII if needed
    sentence = convert_to_ascii(sentence)
    return sentence

def prepare_input(tokenizer, sentence, device):
    # Tokenize the sentence
    inputs = tokenizer.encode_plus(
        sentence,
        None,
        add_special_tokens=True,
        max_length=200,
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        truncation=True
    )
    # Convert to tensors
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0).to(device)  # Add batch dimension
    attention_mask = torch.tensor(inputs['attention_mask']).unsqueeze(0).to(device)  # Add batch dimension
    return input_ids, attention_mask

def predict(model, tokenizer, sentence, device):
    model.eval()
    preprocessed_sentence = preprocess_sentence(sentence)
    input_ids, attention_mask = prepare_input(tokenizer, preprocessed_sentence, device)
    with torch.no_grad():
        # Get model logits
        logits = model(input_ids, attention_mask)
        # Decode the predictions using CRF layer
        predictions = model.crf.decode(logits)
    return predictions[0]  # Remove the batch dimension


In [190]:
def apply_diacritics(text, predictions):
    diacritics_map = {
        'u': {2: 'ü'},
        'o': {2: 'ö'},
        'i': {2: 'ı'},
        'g': {2: 'ğ'},
        's': {2: 'ş'},
        'c': {2: 'ç'}
    }
    result = []
    for i, char in enumerate(text):
        if char in diacritics_map and predictions[i] in diacritics_map[char]:
            print(char)
            # Apply the diacritic transformation
            result.append(diacritics_map[char][predictions[i]])
        else:
            # If no transformation needed, or character not in map, add original character
            result.append(char)
    return ''.join(result)

In [191]:
original_text = "Nasilsiniz acaba, her sey yolunda mi?"

# Get the prediction
predictions = predict(model, tokenizer, original_text, device)

decoded_prediction = apply_diacritics(original_text, predictions)

In [192]:
# Print the original and the predicted output
print("Original:", original_text)
print("Predicted:", decoded_prediction)  # You may need to map these predictions back to actual diacritics

Original: Nasilsiniz acaba, her sey yolunda mi?
Predicted: Nasilsiniz acaba, her sey yolunda mi?


In [193]:
# I -> İ durumu eklenecek