In [1]:
from unidecode import unidecode
import numpy as np
import pandas as pd
import re

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from torchcrf import CRF
from tqdm import tqdm

from zemberek.morphology import TurkishMorphology
from zemberek.normalization import TurkishSpellChecker

from vnlp import Normalizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip install pytorch-crf
# !pip install unidecode
# !pip install --upgrade transformers
# !pip install --upgrade torch
# !pip install zemberek-python

In [3]:
# Select the compute device once; later cells move the model and batches onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [4]:
# Load the train and test splits; the first CSV column becomes the DataFrame index.
# Later cells read the "Sentence" column of both frames and reuse the test index as "ID".
nlp_train=pd.read_csv("train.csv", index_col=[0])
nlp_test=pd.read_csv("test.csv",index_col=[0],encoding="utf-8") 

#### Create the dictionaries used during data processing

In [5]:
# Upper-case -> lower-case table that follows Turkish casing ("I" -> "ı", "İ" -> "i").
# The two strings are paired position by position; insertion order matches the
# order in which callers apply the replacements.
lower_chars_dict = dict(zip(
    "ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZQWXÂÊÎÛÔ",
    "abcçdefgğhıijklmnoöprsştuüvyzqwxâêîûô",
))
# Turkish-specific letters -> their plain ASCII counterparts (both cases).
# Keys and values are paired position by position.
words_dict = dict(zip(
    "ığüşöçİĞÜŞÖÇ",
    "igusocIGUSOC",
))

# Character -> integer id used as model input. Ids start at 1 and follow the
# order of the string below (the 33rd symbol is the space character).
characters_dictionary = {
    symbol: index
    for index, symbol in enumerate("abcçdefgğhıijklmnoöprsştuüvyzqxw âêîûô", start=1)
}

# Maps a letter to its diacritic / non-diacritic counterpart; used by the zemberek
# post-processing step to decide whether a suggestion differs from a word only by diacritics.
# NOTE(review): several keys appear more than once in this literal ("ı", "o", "u", "I", "U", "O").
# A Python dict keeps only the LAST value for a repeated key, so the effective entries are
# "ı"->"î", "o"->"ô", "u"->"û", "I"->"Î", "U"->"Û", "O"->"Ô"; the earlier "ı"->"i", "o"->"ö",
# "u"->"ü", "I"->"İ", "U"->"Ü", "O"->"Ö" entries are overwritten. The reverse entries
# ("i"->"ı", "ö"->"o", "ü"->"u", "İ"->"I", "Ü"->"U", "Ö"->"O") are still present.
diacritic_versions = {"i":"ı", "ı":"i", "o":"ö", "ö":"o", "u":"ü", "ü":"u", "g":"ğ", "ğ":"g", "s":"ş", "ş":"s", "c":"ç", "ç":"c", "I":"İ", "İ":"I", 
                      "O":"Ö", "Ö":"O", "U":"Ü", "Ü":"U", "G":"Ğ", "Ğ":"G", "S":"Ş", "Ş":"S", "C":"Ç", "Ç":"C", "â":"a", "a":"â", "ê":"e", "e":"ê",
                      "î":"ı", "ı":"î", "û":"u", "u":"û", "ô":"o", "o":"ô", "I":"Î", "Î":"I", "U":"Û", "Û":"U", "O":"Ô", "Ô":"O", "Â":"A", "A":"Â",
                      "Ê":"E", "E":"Ê"}



## PREPROCESSING OF THE TRAINING DATA

In [6]:
def convert_to_ascii(sentence):
    """Return `sentence` with every key of `words_dict` replaced by its ASCII value.

    Replacements are applied one after another, in the dictionary's iteration order.
    """
    ascii_text = sentence
    for turkish_char in words_dict:
        ascii_text = ascii_text.replace(turkish_char, words_dict[turkish_char])
    return ascii_text

def remove_puncutations_and_numbers(text):
    """Keep only the whitelisted letters and spaces, then collapse runs of spaces.

    Every character that is not in the whitelist below (digits, punctuation,
    newlines, ...) is dropped. Leading/trailing single spaces are kept.
    """
    allowed = "abcçdefgğhıijklmnoöprsştuüvyzABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZxXwWqQâêîôûÂÊÎÔÛ "
    kept = "".join(symbol for symbol in text if symbol in allowed)
    # squeeze consecutive spaces into a single one
    return re.sub(' +', ' ', kept)

# split the sentences into smaller sentences
def split_sentences(sentences, max_length=99):
    """Break every sentence longer than `max_length` into shorter pieces.

    For an over-long sentence the split point is searched inside the first
    `max_length + 1` characters, in this order of preference:
      1. the last punctuation mark (. ? ! ; : ,) -- the mark stays with the left piece,
      2. the last space,
      3. a hard cut at index `max_length`.
    A piece can therefore be up to `max_length + 1` characters long. Pieces are
    whitespace-stripped.

    Empty pieces are never emitted. Previously a sentence whose only split
    candidate was a leading space produced an empty string in the result.

    Args:
        sentences: iterable of strings.
        max_length: sentences of at most this length are passed through unchanged.

    Returns:
        A flat list of non-empty strings, in input order.
    """
    punctuations = ('.', '?', '!', ';', ':', ',')
    results = []

    for sentence in sentences:
        while len(sentence) > max_length:
            window_end = max_length + 1

            # last punctuation mark inside the window (-1 when there is none)
            split_pos = max(sentence.rfind(mark, 0, window_end) for mark in punctuations)

            # no punctuation: fall back to the last space inside the window
            if split_pos == -1:
                split_pos = sentence.rfind(' ', 0, window_end)

            # no space either: hard cut
            if split_pos == -1:
                split_pos = max_length

            segment = sentence[:split_pos + 1].strip()
            # a split on a leading space yields an empty segment; skip it
            if segment:
                results.append(segment)
            # continue with whatever is left after the split point
            sentence = sentence[split_pos + 1:].strip()

        # remainder (or a sentence that was short enough to begin with)
        if sentence:
            results.append(sentence)

    return results

# padding function
def padding(text, filling_char, max_length):
    """Right-pad a str or list with `filling_char` up to `max_length` items.

    Inputs that are already long enough, and inputs that are neither exactly
    `str` nor exactly `list`, are returned as they are. A new object is returned
    for padded lists; the argument is not modified.
    """
    kind = type(text)
    if kind is str:
        return text + filling_char * (max_length - len(text))
    if kind is list:
        return text + [filling_char] * (max_length - len(text))
    return text

# map the diacritics to numbers
def map_diacritics(text):
    """Replace, in place, every character of the list `text` by its class label.

    Labels:
        2 -- one of "ıöüğşç"
        1 -- one of "aeiougsc"
        0 -- a space
        4 -- one of "âêîôû"
        3 -- anything else

    Returns the same list object that was passed in.
    """
    for position, symbol in enumerate(text):
        if symbol in "ıöüğşç":
            label = 2
        elif symbol in "aeiougsc":
            label = 1
        elif symbol == " ":
            label = 0
        elif symbol in "âêîôû":
            label = 4
        else:
            label = 3
        text[position] = label

    return text

def prepare_train_dataset(sentences):
    """Turn raw sentences into aligned model inputs and labels.

    Each raw sentence is cleaned with `remove_puncutations_and_numbers`, cut
    into chunks by `split_sentences`, and lower-cased with `lower_chars_dict`.

    Returns a 4-tuple of equally long lists, one entry per chunk:
        processed_sentences -- the lower-cased chunks (diacritics kept),
        diactrize_labels    -- per-character labels from `map_diacritics`, padded with 0 to 100,
        asci_sentences      -- the chunks with accent marks and Turkish letters folded to ASCII,
        numeric_sentences   -- `characters_dictionary` ids of the ASCII chunks, padded with 0 to 100.
    """
    processed_sentences = []
    for raw_sentence in sentences:
        cleaned = remove_puncutations_and_numbers(raw_sentence)
        for chunk in split_sentences([cleaned]):
            # lower-case with Turkish casing rules
            for upper, lower in lower_chars_dict.items():
                chunk = chunk.replace(upper, lower)
            processed_sentences.append(chunk)

    # per-character targets, computed on the diacritized text
    diactrize_labels = [
        padding(map_diacritics(list(chunk)), 0, 100)
        for chunk in processed_sentences
    ]

    # model-side text: accent marks removed first, then Turkish letters folded to ASCII
    asci_sentences = [
        convert_to_ascii(Normalizer.remove_accent_marks(chunk))
        for chunk in processed_sentences
    ]

    # integer-encode the ASCII text
    numeric_sentences = [
        padding([characters_dictionary[symbol] for symbol in ascii_chunk], 0, 100)
        for ascii_chunk in asci_sentences
    ]

    return processed_sentences, diactrize_labels, asci_sentences, numeric_sentences

In [7]:
# Small hand-picked samples (upper-case dotted İ, circumflex vowels, digits and punctuation).
turkish_sentences = [
    "İmrali",
    "merhâbâ",
    "31 - giyim çeşidine göre standart ölçülere ilâve edilecek ölçü tablosu",
    "prof. dr. şükrü halûk akalın ve prof. dr. ali duymaz yönettikleri bölümlerin sonuç bildirilerini okudular.",
]

In [8]:
# Raw training sentences as a NumPy array of strings
raw_sentences = nlp_train["Sentence"].values

# Build chunked text, labels, ASCII text and integer ids from a copy of the raw array
(
    processed_turkish_sentences,
    labels,
    asci_sentences,
    numeric_sentences,
) = prepare_train_dataset(raw_sentences.copy())

In [9]:
# Sanity check: show the label sequence, the ASCII text and the integer ids of one chunk.
# The label and id lists are padded with trailing zeros up to length 100.
n=3
print(labels[n])
print(asci_sentences[n])
print(numeric_sentences[n])

[3, 1, 3, 2, 3, 1, 3, 0, 1, 3, 1, 2, 3, 2, 3, 3, 1, 3, 1, 3, 0, 2, 2, 3, 1, 3, 1, 1, 3, 1, 3, 1, 3, 0, 3, 1, 3, 1, 1, 3, 0, 3, 1, 3, 2, 2, 0, 3, 1, 3, 1, 3, 1, 3, 2, 3, 2, 0, 3, 1, 3, 1, 3, 3, 1, 3, 2, 3, 3, 1, 3, 0, 1, 1, 3, 3, 1, 0, 3, 1, 3, 1, 0, 3, 1, 3, 3, 1, 3, 1, 3, 1, 3, 1, 0, 0, 0, 0, 0, 0]
yapilan arastirmalar ogrencilerin mevcut dalis kurslarini tamamladiktan sonra bile kendilerini
[28, 1, 20, 12, 15, 1, 17, 33, 1, 21, 1, 22, 24, 12, 21, 16, 1, 15, 1, 21, 33, 18, 8, 21, 6, 17, 3, 12, 15, 6, 21, 12, 17, 33, 16, 6, 27, 3, 25, 24, 33, 5, 1, 15, 12, 22, 33, 14, 25, 21, 22, 15, 1, 21, 12, 17, 12, 33, 24, 1, 16, 1, 16, 15, 1, 5, 12, 14, 24, 1, 17, 33, 22, 18, 17, 21, 1, 33, 2, 12, 15, 6, 33, 14, 6, 17, 5, 12, 15, 6, 21, 12, 17, 12, 0, 0, 0, 0, 0, 0]


## MODEL

In [10]:
class DiacritizationBiLSTMCRF(nn.Module):
    """Character tagger: embedding -> Transformer encoder -> BiLSTM -> LSTM -> linear -> CRF.

    The linear layer receives the BiLSTM output concatenated with the second
    LSTM's output, and its result is used as CRF emissions.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size, also the Transformer model width.
        hidden_dim_enc: nominal width of the BiLSTM output; each direction gets
            `hidden_dim_enc // 2` units, so the real width is `2 * (hidden_dim_enc // 2)`.
        hidden_dim_dec: width of the second (unidirectional) LSTM.
        num_labels: number of CRF tags.
        num_heads: attention heads per Transformer layer.
        num_layers: number of Transformer encoder layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim_enc, hidden_dim_dec, num_labels, num_heads, num_layers):
        super(DiacritizationBiLSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # NOTE(review): this template layer is registered as a sub-module in addition to the
        # layers inside `transformer_encoder`, and `forward` never calls it directly. It is
        # kept so the module's parameter names stay the same as before -- presumably needed
        # for checkpoints saved from the earlier definition; confirm before removing.
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_layer, num_layers=num_layers)

        # Actual feature width produced by the bidirectional encoder (two directions).
        enc_out_dim = 2 * (hidden_dim_enc // 2)
        self.lstm_enc = nn.LSTM(embed_dim, hidden_dim_enc // 2, num_layers=1, bidirectional=True, batch_first=True)
        # Fed with the encoder output, so its input size must be the encoder's real width.
        self.lstm_dec = nn.LSTM(enc_out_dim, hidden_dim_dec, num_layers=1, batch_first=True)
        # NOTE: defined but not applied anywhere in `forward`.
        self.dropout = nn.Dropout(0.1)
        # Input is concat(encoder output, decoder output). The previous `hidden_dim_dec * 2`
        # only matched that width when both hidden sizes were equal.
        self.fc = nn.Linear(enc_out_dim + hidden_dim_dec, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, sentences, labels=None):
        """Return the CRF loss when `labels` is given, otherwise the decoded tag paths.

        Args:
            sentences: integer id tensor; the embedding output is treated as
                (batch, seq, embed_dim).
            labels: optional tag tensor aligned with `sentences`.

        Returns:
            With labels: the negated value returned by the CRF layer (used as the loss).
            Without labels: the result of `CRF.decode` (callers index it per sentence).

        No mask is passed to the Transformer or the CRF, so every position of the
        input, including padding, takes part in the computation.
        """
        x = self.embedding(sentences)

        # Transformer layers are built with the default layout, so move the
        # sequence axis first for the encoder and back afterwards.
        x = x.permute(1, 0, 2)
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)

        # BiLSTM encoder layer
        enc_output, _ = self.lstm_enc(x)

        # Decoder LSTM
        dec_outputs, _ = self.lstm_dec(enc_output)

        # Concatenate encoder and decoder features along the feature axis
        combined_outputs = torch.cat((enc_output, dec_outputs), dim=2)

        emissions = self.fc(combined_outputs)

        # CRF layer
        if labels is not None:
            # if labels are provided, calculate the loss
            loss = -self.crf(emissions, labels)
            return loss
        else:
            # otherwise, return the best path
            prediction = self.crf.decode(emissions)
            return prediction

# Hyperparameters.
# One extra embedding row: character ids start at 1 and 0 is the padding value.
vocab_size = len(characters_dictionary) + 1 
embed_dim = 128
hidden_dim_enc = 256
hidden_dim_dec = 256
num_heads = 8
# Tags produced by map_diacritics are 0..4.
num_labels = 5
num_layers = 5

# Build the model and move it to the selected device (the cell displays the module tree).
model = DiacritizationBiLSTMCRF(vocab_size, embed_dim, hidden_dim_enc, hidden_dim_dec, num_labels, num_heads, num_layers)
model.to(device)



DiacritizationBiLSTMCRF(
  (embedding): Embedding(39, 128)
  (transformer_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
    (linear1): Linear(in_features=128, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=128, bias=True)
    (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-4): 5 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
    

## TRAINING

In [11]:
# Convert the padded id sequences and label sequences to integer tensors.
numeric_sentences_tensor = torch.tensor(numeric_sentences, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.long)

# Create the TensorDataset and a shuffled DataLoader for training
# (no validation split is created in this cell).
train_dataset = TensorDataset(numeric_sentences_tensor, labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001)

In [12]:
# training the model
def train_model(model, train_loader, optimizer, num_epochs=100,
                save_dir='/home/oeren/Documents/YZV-NLP/weights_final2'):
    """Train `model` and save a checkpoint of its state dict after every epoch.

    Args:
        model: module whose call `model(sentences, labels=...)` returns a scalar loss.
        train_loader: iterable of `(sentences, label_seqs)` tensor batches.
        optimizer: optimizer already bound to the model's parameters.
        num_epochs: number of passes over `train_loader`.
        save_dir: directory receiving `model_epoch_<n>.pth`. The default keeps the
            location that was previously hard-coded; pass a project-relative
            directory when running on another machine. It is created if missing.

    Batches are moved to the notebook-level `device`.
    """
    import os

    # Create the target directory up front so a missing folder does not
    # surface only after a full epoch of training.
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        pbar = tqdm(train_loader, total=len(train_loader), leave=False)
        for step, (sentences, label_seqs) in enumerate(pbar, start=1):
            sentences, label_seqs = sentences.to(device), label_seqs.to(device)

            # Forward pass: the model returns the loss when labels are supplied
            loss = model(sentences, labels=label_seqs)

            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # Running mean over the batches seen so far (dividing by the total number
            # of batches under-reports the loss until the epoch is finished).
            pbar.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / step:.4f}")

        # max(..., 1) keeps the summary line from dividing by zero on an empty loader
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {total_loss / max(len(train_loader), 1):.4f}")

        torch.save(model.state_dict(), os.path.join(save_dir, f'model_epoch_{epoch+1}.pth'))

In [13]:
# Start training
#train_model(model, train_loader, optimizer, num_epochs=50)

## POST-PROCESSING

In [14]:
# to improve the results with ZEMBEREK
def _differs_only_by_diacritics(candidate, original):
    """Return True when both words have the same length and every position holds
    either the same character or a pair related through `diacritic_versions`.

    The mapping is consulted in BOTH directions. `diacritic_versions` is a plain
    dict built from a literal that repeats some keys, so only one value per key
    survives and a one-directional lookup misses pairs such as candidate "ı" vs.
    original "i"; the reverse entry ("i" -> "ı") is still present and is found
    by the second lookup.
    """
    if len(candidate) != len(original):
        return False
    for cand_char, orig_char in zip(candidate, original):
        if cand_char == orig_char:
            continue
        if diacritic_versions.get(cand_char) == orig_char:
            continue
        if diacritic_versions.get(orig_char) == cand_char:
            continue
        return False
    return True


def check_morphology(sentences):
    """Post-correct words that zemberek cannot analyse.

    For every whitespace-separated word (skipping one-character words and words
    containing the degree sign) the word is analysed; when there are no analysis
    results, the spell checker's suggestions are scanned in order and the first
    one that differs from the word only by diacritics replaces it.

    Args:
        sentences: sequence of strings; it is copied, not modified.

    Returns:
        A copy of `sentences` with corrected words. Words are re-joined with
        single spaces.
    """
    morphology = TurkishMorphology.create_with_defaults()
    morphology.ignoreDiacriticsInAnalysis = False
    spellChecker = TurkishSpellChecker(morphology)

    corrected_sentences = sentences.copy()

    # Iterate through each sentence
    for i in range(len(corrected_sentences)):
        # split the sentence into words and iterate through each word
        words = corrected_sentences[i].split()
        for m in range(len(words)):
            if "\u00B0" in words[m] or len(words[m]) == 1:
                continue
            # analyze the word
            analysis = morphology.analyze(words[m])

            # words with at least one analysis are left untouched
            if len(analysis.analysis_results) != 0:
                continue
            print("No analysis results for:", words[m])

            # take the first suggestion that only changes diacritics
            for suggested_word in spellChecker.suggest_for_word(words[m]):
                print("Suggested word:", suggested_word)
                if _differs_only_by_diacritics(suggested_word, words[m]):
                    print("Suggested word is approved:", suggested_word)
                    words[m] = suggested_word
                    break

        # join the modified words back into a sentence
        corrected_sentences[i] = ' '.join(words)

    return corrected_sentences

In [15]:
# convert predictions to sentences
def predict_test_sentence(sentence, model):
    """Restore diacritics in `sentence` using `model` and return the new string.

    The sentence is cleaned, lower-cased, folded to ASCII and cut into chunks the
    same way as the training data; each chunk is padded to 100 ids and decoded.
    The predicted tag of every letter is then applied to the ORIGINAL text, so
    punctuation, digits, spacing and letter case are preserved.

    Tag meaning when applied to a letter:
        2 -- use the Turkish counterpart (i->ı, o->ö, u->ü, g->ğ, s->ş, c->ç; "I" stays "I")
        1 -- keep the letter ("I" becomes "İ")
        4 -- use the circumflex form where one exists
        anything else -- keep the letter

    Labels are aligned by position: for every chunk only the predictions at the
    positions of its non-space characters are kept. The previous approach of
    dropping every 0 from the concatenated predictions shifted all following
    labels (or raised IndexError) whenever the model emitted 0 for a letter or a
    non-zero tag for a space/padding position.

    Args:
        sentence: input text.
        model: tagger whose call `model(batch)` returns one tag list per row.

    Returns:
        The text with predicted diacritics applied.
    """
    # strip the sentence
    sentence = sentence.strip()
    original_sentence = sentence

    # Same normalisation as in training: whitelist filter, lower-case, ASCII fold
    cleaned = remove_puncutations_and_numbers(sentence)
    for key, value in lower_chars_dict.items():
        cleaned = cleaned.replace(key, value)
    cleaned = Normalizer.remove_accent_marks(cleaned)
    cleaned = convert_to_ascii(cleaned)

    # split into model-sized chunks
    chunks = split_sentences([cleaned])

    # make predictions using the model; no gradients are needed for decoding
    model.eval()
    letter_labels = []
    with torch.no_grad():
        for chunk in chunks:
            numeric_chunk = padding([characters_dictionary[char] for char in chunk], 0, 100)
            batch = torch.tensor([numeric_chunk], dtype=torch.long).to(device)
            decoded = model(batch)[0]
            # zip stops at the end of the chunk, which discards the padded tail
            letter_labels.extend(label for char, label in zip(chunk, decoded) if char != " ")

    diacritics_map_dict = {"i": "ı", "o": "ö", "u": "ü", "g": "ğ", "s": "ş", "c": "ç", "I": "İ", "O": "Ö", "U": "Ü", "G": "Ğ", "S": "Ş", "C": "Ç"}
    # "i" is included because the training labels mark "î" with tag 4 while the model
    # input (and ASCII test text) carries a plain "i" at that position.
    diacritic_map_accent = {"ı": "î", "i": "î", "a": "â", "e": "ê", "o": "ô", "u": "û", "I": "Î", "A": "Â", "E": "Ê", "O": "Ô", "U": "Û"}

    # walk the original sentence and apply one label per letter
    output_chars = []
    label_index = 0
    for char in original_sentence:
        is_letter = (char in characters_dictionary or char in lower_chars_dict) and char != " "
        if not is_letter:
            # punctuation, digits, spaces and unknown symbols pass through unchanged
            output_chars.append(char)
            continue

        # Fall back to "keep" if fewer labels than letters were produced.
        label = letter_labels[label_index] if label_index < len(letter_labels) else 3
        label_index += 1

        if label == 2:
            # upper-case "I" is already the dotless form
            output_chars.append("I" if char == "I" else diacritics_map_dict.get(char, char))
        elif label == 1:
            # dotted reading: upper-case "I" becomes "İ"
            output_chars.append("İ" if char == "I" else char)
        elif label == 4:
            output_chars.append(diacritic_map_accent.get(char, char))
        else:
            # tag 3 (other letters) and any unexpected tag: never drop the character
            output_chars.append(char)

    return "".join(output_chars)

In [16]:
# Quick smoke test on a single ASCII sentence ("uzun_cumle" is Turkish for "long sentence").
# NOTE(review): in this notebook the training call is commented out and the saved weights are
# loaded in a later cell, so here `model` presumably still has its initial weights -- confirm
# the execution order before reading anything into the printed result.
uzun_cumle = "Iyi misin Mit gorusmesi ihtiyac duyuldukca oluyor"
new_sentence = predict_test_sentence(uzun_cumle,model)
print(new_sentence)

Îyi misin Mit gôrûsmêsi ihtiyâc dûyûldûkcâ ôlûyôr


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


In [20]:
# Rebuild the network with the same hyperparameters and move it to the selected device.
model = DiacritizationBiLSTMCRF(vocab_size, embed_dim, hidden_dim_enc, hidden_dim_dec, num_labels, num_heads, num_layers)
model.to(device)
# Load the saved weights.
# weights_only=True restricts unpickling to tensors and plain containers, which is all a
# state dict needs, and avoids executing arbitrary pickled code from the checkpoint file.
model.load_state_dict(torch.load('best.pth', map_location=device, weights_only=True))

<All keys matched successfully>

In [21]:
# Run the tagger over every test sentence, keeping the input order.
test_sentences = nlp_test["Sentence"].values
predicted_sentences = [
    predict_test_sentence(sentence, model)
    for sentence in test_sentences
]

In [None]:
# Post-correct the model output with zemberek: unanalysable words are replaced by a
# spell-checker suggestion that differs only in diacritics (see check_morphology).
zemberek_predicted_sentences = check_morphology(predicted_sentences)

## SAVE THE RESULT

In [23]:
# Assemble the submission table with two columns: "ID" (the test frame's index) and
# "Sentence" (the post-processed predictions). Writing it to disk happens in the next cell.
output_df = pd.DataFrame({"ID": nlp_test.index, "Sentence": zemberek_predicted_sentences})

In [24]:
# # save the dataframe to a CSV file
# output_df.to_csv("predictions4.csv", index=False)