In [577]:
import numpy as np
import pandas as pd
import os
import string
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AdamW, BertModel, AutoConfig, AutoTokenizer
from torch.optim import Adam
import torch.nn as nn
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
import time;
import datetime
from sklearn.model_selection import train_test_split
from torchcrf import CRF

In [578]:
# !pip install pytorch-crf
# !pip install unidecode
# !pip install --upgrade transformers
# !pip install --upgrade torch

In [579]:
if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [599]:
# nlp_train=pd.read_csv("/kaggle/input/nlp-project-train/train.csv", index_col=[0])
# nlp_test=pd.read_csv("/kaggle/input/nlp-project-train/test.csv",index_col=[0],encoding="windows-1252") 

nlp_train=pd.read_csv("train.csv", index_col=[0])
nlp_test=pd.read_csv("test.csv",index_col=[0],encoding="windows-1252") 
# nlp_train = nlp_train[:1000]

In [581]:
def convert_to_ascii(sentence):
    text = unidecode(sentence)
    return text

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
        text = text.replace('  ', ' ')
    return text.strip()

In [582]:
def split_sentences(sentences, max_length=100):
    punctuations = {'.', ',', ';', ':'}
    results = []

    for sentence in sentences:
        while len(sentence) > max_length:
            # find last punctuation before max_length
            split_pos = -1
            for p in punctuations:
                pos = sentence.rfind(p, 0, max_length + 1)
                if pos > split_pos:
                    split_pos = pos
            
            # If no punctuation found, split at the last space before max_length
            if split_pos == -1:
                split_pos = sentence.rfind(' ', 0, max_length + 1)
            
            # If no space found, just split at max_length
            if split_pos == -1:
                split_pos = max_length
            
            # Append the split segment to results
            results.append(sentence[:split_pos + 1].strip())
            # Move the rest of the sentence forward
            sentence = sentence[split_pos + 1:].strip()
        
        # Append the remainder of the sentence if it's not empty
        if sentence:
            results.append(sentence)
    
    return np.array(results)

In [583]:
def encode_word(sentence):
    words = sentence.split()
    encoded_sentence = []
    for word in words:
        encoded_word = []
        for char in word:
            # print(char)
            if char in "ıöüğşç":
                encoded_word.append(2)
            else:
                encoded_word.append(1)
            # print(encoded_word)
        encoded_sentence.append(encoded_word)
    return encoded_sentence

def padding(text, filling_char, max_length):
    if type(text) is str:
        text = text + filling_char * (max_length - len(text))
    elif type(text) is list:
        text = text + [filling_char] * (max_length - len(text))
    return text

In [584]:
import re

def remove_puncutations_and_numbers(text):
    # Iterate over the string and remove char if it is not a character
    characters = "abcçdefgğhıijklmnoöprsştuüvyzABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZxXwWqQ "
    for char in text:
        if char not in characters:
            text = text.replace(char, "")
    # Remove multiple spaces
    text = re.sub(' +', ' ', text)
    # Lowercase the text
    text = text.lower()
    return text

In [585]:
# Raw sentences
raw_sentences = nlp_train["Sentence"].values
# Split sentences
sentences = split_sentences(raw_sentences, max_length=200)
# Remove punctuations
# sentences = [remove_punctuations(s).lower() for s in sentences]
sentences = [remove_puncutations_and_numbers(i) for i in sentences]
nlp_train["Label"] = nlp_train["Sentence"]
# Apply convert to ascii to y_train
asci_sentences = [convert_to_ascii(s) for s in sentences]

In [586]:
# Convert sentences to list of characters
char_sentences_original = [list(s) for s in sentences]

# Apply map_diacritics to the numeric sentences
labels = [map_diacritics(sentence.copy()) for sentence in char_sentences_original]

char_sentences = [list(s) for s in asci_sentences]


In [587]:
characters_dictionary = {
    "a":1,
    "b":2,
    "c":3,
    "ç":4,
    "d":5,
    "e":6,
    "f":7,
    "g":8,
    "ğ":9,
    "h":10,
    "ı":11,
    "i":12,
    "j":13,
    "k":14,
    "l":15,
    "m":16,
    "n":17,
    "o":18,
    "ö":19,
    "p":20,
    "r":21,
    "s":22,
    "ş":23,
    "t":24,
    "u":25,
    "ü":26,
    "v":27,
    "y":28,
    "z":29,
    "q": 30,
    "x":31,
    "w":32,
    " ":33    
}

In [588]:
# Copy char_sentences to numeric_sentences
numeric_sentences = [sentence.copy() for sentence in char_sentences]
# Map each character to an integer in the sentences array to its value in the characters dictionary
for sentence_index in range(len(char_sentences)):
    for char_index in range(len(char_sentences[sentence_index])):
        # print(characters_dictionary[char_sentences[sentence_index][char_index]])
        numeric_sentences[sentence_index][char_index] = characters_dictionary[char_sentences[sentence_index][char_index]]

In [589]:
# Apply padding to the numeric sentences
max_length = 200
for sentence_index in range(len(numeric_sentences)):
    numeric_sentences[sentence_index] = padding(numeric_sentences[sentence_index], 0, max_length)

In [590]:
def map_diacritics(text):
    for char_index in range(len(text)):
        # print(text[char_index])
        if text[char_index] in "ıöüğşç":
            text[char_index] = 1
        else:
            text[char_index] = 0
    return text

In [591]:
# Apply padding to the labels
max_length = 200
for sentence_index in range(len(labels)):
    labels[sentence_index] = padding(labels[sentence_index], 0, max_length)

In [592]:
import torch
import torch.nn as nn
from torchcrf import CRF

class DiacritizationBiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
        super(DiacritizationBiLSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, sentences, labels=None):
        # Embedding layer
        x = self.embedding(sentences)
        
        # BiLSTM layer
        x, _ = self.lstm(x)
        x = self.dropout(x)
        
        # Fully connected layer
        emissions = self.fc(x)

        # CRF layer
        if labels is not None:
            # If labels are provided, calculate the loss
            loss = -self.crf(emissions, labels)
            return loss
        else:
            # Otherwise, return the best path
            prediction = self.crf.decode(emissions)
            return prediction

# Example of initializing the model
vocab_size = len(characters_dictionary) + 1  # Number of unique characters in your character dictionary + 1 for padding
embed_dim = 128
hidden_dim = 256
num_labels = 2  # Diacritic or not
model = DiacritizationBiLSTMCRF(vocab_size, embed_dim, hidden_dim, num_labels)
model.to(device)


DiacritizationBiLSTMCRF(
  (embedding): Embedding(34, 128)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (crf): CRF(num_tags=2)
)

In [593]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Assuming 'numeric_sentences' and 'labels' are your preprocessed datasets available as lists of lists
# Convert them into tensors
numeric_sentences_tensor = torch.tensor(numeric_sentences, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.long)

# Create TensorDataset and DataLoader
train_dataset = TensorDataset(numeric_sentences_tensor, labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the loss function and the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training the model
def train_model(model, data_loader, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for sentences, label_seqs in data_loader:
            sentences, label_seqs = sentences.to(device), label_seqs.to(device)
            
            # Forward pass: Compute predicted y by passing x to the model
            loss = model(sentences, labels=label_seqs)
            
            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}")

# Start training
train_model(model, train_loader, optimizer, num_epochs=10)


Epoch 1, Loss: 1021.8725925990514
Epoch 2, Loss: 472.85624215262277
Epoch 3, Loss: 374.29310738699775
Epoch 4, Loss: 307.85654994419644
Epoch 5, Loss: 260.21475655691967
Epoch 6, Loss: 224.7394304547991
Epoch 7, Loss: 201.57762799944197
Epoch 8, Loss: 184.77154889787946
Epoch 9, Loss: 171.2187970842634
Epoch 10, Loss: 158.77383510044643


In [597]:
def preprocess_sentence(sentence, characters_dictionary, max_length=200):
    # Remove punctuation and numbers
    sentence = ''.join([i for i in sentence if not i.isdigit() and i not in string.punctuation])
    sentence = sentence.lower().strip()
    
    # Convert to numeric form using the character dictionary
    numeric_sentence = [characters_dictionary.get(char, 0) for char in sentence]  # default to 0 if char not found
    
    # Padding
    if len(numeric_sentence) > max_length:
        numeric_sentence = numeric_sentence[:max_length]
    else:
        numeric_sentence += [0] * (max_length - len(numeric_sentence))
    
    return torch.tensor([numeric_sentence], dtype=torch.long).to(device)

def predict(model, sentence, characters_dictionary):
    model.eval()
    with torch.no_grad():
        numeric_sentence = preprocess_sentence(sentence, characters_dictionary)
        prediction = model(numeric_sentence)
        return prediction[0]  # since we have a batch size of 1

# Example sentence
test_sentence = "Bu cumlede bazi Turkce karakterler var."

# Predict using the model
predicted_labels = predict(model, test_sentence, characters_dictionary)
print(predicted_labels)

# Define labels to diacritics mapping
def labels_to_diacritics(text_sentence, labels):
    diacritics_map_dict = {"i": "ı", "o": "ö", "u": "ü", "g": "ğ", "s": "ş", "c": "ç"}
    output_sentence = ""
    for i in range(len(text_sentence)):
        if labels[i] == 1:
            if text_sentence[i] in diacritics_map_dict:
                output_sentence += diacritics_map_dict[text_sentence[i]]
        else:
            output_sentence += text_sentence[i]
    return output_sentence

# Show the results
output_sentence = labels_to_diacritics(test_sentence, predicted_labels)
print("Original:", test_sentence)
print("Processed:", output_sentence)

[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Original: Bu cumlede bazi Turkce karakterler var.
Processed: Bu çümlede bazı Türkce karakterler var.
