In [203]:
import string
from unidecode import unidecode
import time;
import datetime
import numpy as np
import pandas as pd
import os
import re

import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AdamW, BertModel, AutoConfig, AutoTokenizer
from torch.optim import Adam
import torch.nn as nn
from torchcrf import CRF
from transformers import AutoTokenizer, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split

import zemberek
from zemberek.morphology import TurkishMorphology
from zemberek.morphology.analysis.rule_based_analyzer import RuleBasedAnalyzer
from zemberek.normalization import TurkishSpellChecker
from zemberek.morphology.morphotactics import TurkishMorphotactics, InformalTurkishMorphotactics
#import RootLexicon
from zemberek.morphology.lexicon import RootLexicon

In [204]:
# !pip install pytorch-crf
# !pip install unidecode
# !pip install --upgrade transformers
# !pip install --upgrade torch
# !pip install zemberek-python

In [205]:
if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [206]:
# nlp_train=pd.read_csv("/kaggle/input/nlp-project-train/train.csv", index_col=[0])
# nlp_test=pd.read_csv("/kaggle/input/nlp-project-train/test.csv",index_col=[0],encoding="windows-1252") 

nlp_train=pd.read_csv("train.csv", index_col=[0])
nlp_test=pd.read_csv("test.csv",index_col=[0],encoding="utf-8") 

nlp_train = nlp_train.iloc[:2]

In [207]:
lower_chars_dict = {
    "A": "a",
    "B": "b",
    "C": "c",
    "Ç": "ç",
    "D": "d",
    "E": "e",
    "F": "f",
    "G": "g",
    "Ğ": "ğ",
    "H": "h",
    "I": "ı",
    "İ": "i",
    "J": "j",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "O": "o",
    "Ö": "ö",
    "P": "p",
    "R": "r",
    "S": "s",
    "Ş": "ş",
    "T": "t",
    "U": "u",
    "Ü": "ü",
    "V": "v",
    "Y": "y",
    "Z": "z",
    "Q": "q",
    "W": "w",
    "X": "x"    
    }

words_dict = {
    "ı": "i",
    "ğ": "g",
    "ü": "u",
    "ş": "s",
    "ö": "o",
    "ç": "c",
    "İ": "I",
    "Ğ": "G",
    "Ü": "U",
    "Ş": "S",
    "Ö": "O",
    "Ç": "C"
    }

characters_dictionary = {
    "a":1,
    "b":2,
    "c":3,
    "ç":4,
    "d":5,
    "e":6,
    "f":7,
    "g":8,
    "ğ":9,
    "h":10,
    "ı":11,
    "i":12,
    "j":13,
    "k":14,
    "l":15,
    "m":16,
    "n":17,
    "o":18,
    "ö":19,
    "p":20,
    "r":21,
    "s":22,
    "ş":23,
    "t":24,
    "u":25,
    "ü":26,
    "v":27,
    "y":28,
    "z":29,
    "q": 30,
    "x":31,
    "w":32,
    " ":33,
    }

diacritic_versions = {"i":"ı", "ı":"i", "o":"ö", "ö":"o", "u":"ü", "ü":"u", "g":"ğ", "ğ":"g", "s":"ş", "ş":"s", "c":"ç", "ç":"c"}

    


## Prepare train data

In [208]:
def convert_to_ascii(sentence):
    # convert Turkish characters to English characters
    for key, value in words_dict.items():
        sentence = sentence.replace(key, value)
    return sentence

def remove_puncutations_and_numbers(text):
    # Iterate over the string and remove char if it is not a character
    characters = "abcçdefgğhıijklmnoöprsştuüvyzABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZxXwWqQ "
    for char in text:
        if char not in characters:
            text = text.replace(char, "")
    # Remove multiple spaces
    text = re.sub(' +', ' ', text)
    # Lowercase the text
    # text = text.lower()
    return text

def split_sentences(sentences, max_length=100):
    punctuations = {'.', ',', ';', ':'}
    results = []

    for sentence in sentences:
        while len(sentence) > max_length:
            # find last punctuation before max_length
            split_pos = -1
            for p in punctuations:
                pos = sentence.rfind(p, 0, max_length + 1)
                if pos > split_pos:
                    split_pos = pos
            
            # If no punctuation found, split at the last space before max_length
            if split_pos == -1:
                split_pos = sentence.rfind(' ', 0, max_length + 1)
            
            # If no space found, just split at max_length
            if split_pos == -1:
                split_pos = max_length
            
            # Append the split segment to results
            results.append(sentence[:split_pos + 1].strip())
            # Move the rest of the sentence forward
            sentence = sentence[split_pos + 1:].strip()
        
        # Append the remainder of the sentence if it's not empty
        if sentence:
            results.append(sentence)
    
    return results

def padding(text, filling_char, max_length):
    if type(text) is str:
        text = text + filling_char * (max_length - len(text))
    elif type(text) is list:
        text = text + [filling_char] * (max_length - len(text))
    return text

def map_diacritics(text):
    for char_index in range(len(text)):
        # print(text[char_index])
        if text[char_index] in "ıöüğşç":
            text[char_index] = 2
        elif text[char_index] in "iougsc":
            text[char_index] = 1
        elif text[char_index] == " ":
            text[char_index] = 0
        else:
            text[char_index] = 3
        
    return text

def prepare_train_dataset(sentences):
    # print(sentences)
    processed_sentences = []
    # Iterate over the sentences
    for i, sentence in enumerate(sentences):
        # Remove punctuations and numbers
        sentence = remove_puncutations_and_numbers(sentence)
        # Split the sentence into smaller sentences
        splitted_sentences = split_sentences([sentence])
        # print(splitted_sentences)
        # break
        new_list = []
        # Iterate over the splitted sentences
        for s in splitted_sentences:
            # Lowerize characters
            for key, value in lower_chars_dict.items():
                s = s.replace(key, value)
            new_list.append(s)
        processed_sentences.extend(new_list)
    # print(processed_sentences)
    
    diactrize_labels = processed_sentences.copy()
    # Iterate over the sentences to diacritize them
    for i, sentence in enumerate(processed_sentences):
        diactrize_labels[i] = map_diacritics(list(sentence))
        # Add padding to the diacritized sentence
        diactrize_labels[i] = padding(diactrize_labels[i], 0, 100)
    # Asci sentences 
    asci_sentences = processed_sentences.copy()
    # Iterate over the sentences to convert them to asci and map them to numbers
    for i, sentence in enumerate(processed_sentences):
        asci_sentences[i] = convert_to_ascii(sentence)
        
    numeric_sentences = []
    for i, sentence in enumerate(asci_sentences):
        numeric_sentence = []
        for char in sentence:
            numeric_sentence.append(characters_dictionary[char])
        # Add padding to the numeric sentence
        numeric_sentence = padding(numeric_sentence, 0, 100)
        numeric_sentences.append(numeric_sentence)
    return processed_sentences, diactrize_labels, asci_sentences, numeric_sentences

In [209]:
# turkish_sentences = [
#     "Imrali",
# ]

In [210]:
# Raw sentences
raw_sentences = nlp_train["Sentence"].values

processed_turkish_sentences,labels,asci_sentences,numeric_sentences = prepare_train_dataset(raw_sentences.copy())

In [211]:
print(labels)

[[1, 2, 3, 2, 3, 0, 3, 3, 3, 1, 3, 0, 3, 3, 0, 3, 2, 2, 3, 0, 3, 3, 3, 1, 3, 0, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 0, 3, 2, 3, 0, 3, 2, 3, 3, 3, 3, 3, 0, 1, 3, 2, 3, 3, 3, 1, 0, 3, 3, 2, 3, 3, 2, 3, 2, 0, 3, 1, 3, 0, 1, 3, 3, 3, 3, 3, 3, 3, 0, 1, 3, 1, 2, 3, 1, 3, 3, 1, 2, 3, 1, 3, 0, 0, 0, 0], [3, 1, 0, 1, 3, 3, 3, 3, 3, 3, 3, 0, 1, 1, 3, 3, 3, 2, 3, 3, 3, 0, 3, 3, 3, 3, 1, 3, 1, 0, 3, 3, 3, 3, 3, 0, 3, 1, 1, 1, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 1, 1, 3, 0, 1, 3, 1, 3, 3, 1, 3, 0, 3, 3, 3, 2, 1, 2, 0, 1, 3, 3, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 0, 3, 1, 3, 1, 3, 0, 1, 2, 1, 3, 0, 3, 1, 3, 1, 3, 1], [1, 2, 2, 0, 3, 3, 0, 1, 1, 3, 0, 1, 3, 1, 0, 3, 2, 3, 3, 3, 0, 3, 1, 3, 3, 3, 3, 1, 3, 0, 2, 3, 3, 3, 0, 1, 3, 3, 3, 3, 2, 2, 2, 3, 2, 0, 1, 3, 3, 1, 3, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [212]:
class DiacritizationBiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim_enc, hidden_dim_dec, num_labels, num_heads, num_layers):
        super(DiacritizationBiLSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_layer, num_layers=num_layers)
        self.lstm_enc = nn.LSTM(embed_dim, hidden_dim_enc // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.lstm_dec = nn.LSTM(hidden_dim_enc, hidden_dim_dec, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_dim_dec * 2, num_labels)  # Concatenate bidirectional hidden states for CRF input
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, sentences, labels=None):
        # Embedding layer
        x = self.embedding(sentences)
        
        # Transformer layer
        x = x.permute(1, 0, 2)  # Change from [batch_size, seq_len, embed_dim] to [seq_len, batch_size, embed_dim]
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)  # Change back to [batch_size, seq_len, embed_dim]
        
        # BiLSTM encoder layer
        enc_output, _ = self.lstm_enc(x)
        
        # Decoder LSTM
        dec_outputs, _ = self.lstm_dec(enc_output)
        
        # Concatenate bidirectional hidden states for CRF input
        combined_outputs = torch.cat((enc_output, dec_outputs), dim=2)
        
        # Fully connected layer
        emissions = self.fc(combined_outputs)
        
        # CRF layer
        if labels is not None:
            # If labels are provided, calculate the loss
            loss = -self.crf(emissions, labels)
            return loss
        else:
            # Otherwise, return the best path
            prediction = self.crf.decode(emissions)
            return prediction

# Example of initializing the model
vocab_size = len(characters_dictionary) + 1  # Number of unique characters in your character dictionary + 1 for padding
embed_dim = 128
hidden_dim_enc = 256
hidden_dim_dec = 256
num_labels = 4  # Diacritic or not
num_heads = 8
num_layers = 3

model = DiacritizationBiLSTMCRF(vocab_size, embed_dim, hidden_dim_enc, hidden_dim_dec, num_labels, num_heads, num_layers)



In [213]:
numeric_sentences_tensor = torch.tensor(numeric_sentences, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.long)

# Splitting three variables: numeric_sentences_tensor, labels_tensor, and another_tensor
X_train, X_val, y_train, y_val, sentences_train, sentences_val = train_test_split(
    numeric_sentences_tensor, labels_tensor, asci_sentences, test_size=0.2, random_state=42)

# Create TensorDataset and DataLoader for training and validation
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

# Define the loss function and the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [214]:
# Training the model
def train_model(model, data_loader, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for sentences, label_seqs in data_loader:
            sentences, label_seqs = sentences.to(device), label_seqs.to(device)
            
            # Forward pass: Compute predicted y by passing x to the model
            loss = model(sentences, labels=label_seqs)
            
            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}")

In [215]:
# Start training
#train_model(model, train_loader, optimizer, num_epochs=10)

In [216]:
def check_morphology(sentences):
    morphology = TurkishMorphology.create_with_defaults()
    morphology.ignoreDiacriticsInAnalysis = False
    spellChecker = TurkishSpellChecker(morphology)

    corrected_sentences = sentences.copy()

    # Iterate through each sentence
    for i in range(len(corrected_sentences)):
        # Split the sentence into words and iterate through each word
        words = corrected_sentences[i].split()
        for m in range(len(words)):
            if "\u00B0" in words[m] or len(words[m]) == 1:
                continue
            analysis = morphology.analyze(words[m])

            # Check if the word has no analysis results
            if len(analysis.analysis_results) == 0:
                print("No analysis results for:", words[m])
                
                # Check if there are suggestions from the spell checker
                suggestions = spellChecker.suggest_for_word(words[m])
                if len(suggestions) != 0:
                    for suggested_word in suggestions:
                        print("Suggested word:", suggested_word)                  
                        # if the suggested word has the same length as the original word, and just "ıioöuügğsşcç" characters are different, replace the word
                        if len(suggested_word) == len(words[m]):
                            for char1, char2 in zip(suggested_word, words[m]):
                                if char1 in diacritic_versions and char2 == diacritic_versions[char1]:
                                    continue
                                if char1 in diacritic_versions and char2 == char1:
                                    continue
                                if char1 not in diacritic_versions and char1 == char2:
                                    continue
                                elif char1 in diacritic_versions and char2 != diacritic_versions[char1] and char2 != char1:
                                    break
                                elif char1 not in diacritic_versions and char1 != char2:
                                    break
                            else:
                                print("Suggested word is approved:", suggested_word)
                                words[m] = suggested_word
                                break

        # Join the modified words back into a sentence
        corrected_sentences[i] = ' '.join(words)

    return corrected_sentences

In [217]:
def predict_test_sentence(sentence, model):
    # Strip the sentence
    sentence = sentence.strip()
    original_sentence = sentence
    # print(f"Original sentence: '{original_sentence}'",)
    sentence = remove_puncutations_and_numbers(sentence)    
    # Lowerize characters
    for key, value in lower_chars_dict.items():
        sentence = sentence.replace(key, value)
    
    # Convert to asci
    sentence = convert_to_ascii(sentence)
    
    sentences_array = []
    # Split sentence into smaller sentences by using split_sentences function
    sentences_array.extend(split_sentences([sentence]))
    # Map to numbers
    numeric_sentences = []
    for sentence in sentences_array:
        numeric_sentence = []
        # print(len(sentence))
        for char in sentence:
            numeric_sentence.append(characters_dictionary[char])
        # Add padding
        numeric_sentence = padding(numeric_sentence, 0, 100)
        numeric_sentences.append(numeric_sentence)
    
    # Make predictions using the model
    model.eval()
    predictions = []
    for numeric_sentence in numeric_sentences:
        numeric_sentence = torch.tensor([numeric_sentence], dtype=torch.long)
        prediction = model(numeric_sentence)
        predictions.extend(prediction[0])
    # print(predictions)
    # Merge predictions to single list and remove 0s from the list
    clipped_predictions = [x for x in predictions if x != 0]
    # print(len(predictions))
    diacritics_map_dict = {"i": "ı", "o": "ö", "u": "ü", "g": "ğ", "s": "ş", "c": "ç", "I": "İ", "O": "Ö", "U": "Ü", "G": "Ğ", "S": "Ş", "C": "Ç"}
    # Iterate over the original sentence
    output_sentence = ""
    predictions_index = 0
    for ind,char in enumerate(original_sentence):
        if (char in characters_dictionary  or char in lower_chars_dict) and char != " ":
            # print("char:", char, "prediction:", clipped_predictions[predictions_index])
            if clipped_predictions[predictions_index] == 2:
                if char in diacritics_map_dict:
                    output_sentence += diacritics_map_dict[char]
                else:
                    output_sentence += char
            elif clipped_predictions[predictions_index] == 1:
                if char == "I":
                    output_sentence += "İ"
                else:
                    output_sentence += char
            elif clipped_predictions[predictions_index] == 3:
                output_sentence += char
            predictions_index += 1
            
        else:
            output_sentence += char
            # output_sentence += char
    # 
    print(predictions)
    return output_sentence

In [218]:
uzun_cumle = "Iyi misin Mit gorusmesi ihtiyac duyuldukca oluyor"
new_sentence = predict_test_sentence(uzun_cumle,model)
# print(new_sentence)

[2, 2, 1, 3, 2, 2, 2, 2, 1, 3, 2, 2, 1, 3, 1, 3, 1, 3, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 3, 2, 3, 2, 2, 2, 1, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1]


In [219]:
model = DiacritizationBiLSTMCRF(vocab_size, embed_dim, hidden_dim_enc, hidden_dim_dec, num_labels, num_heads, num_layers)
model.to(device)
# Load the saved weights
model.load_state_dict(torch.load('/Users/mustafa/Desktop/Courses/Natural Language Processing/Project/Code/YZV405_2324_150200326_150210339/model_epoch_18.pth', map_location=torch.device('cpu')))

<All keys matched successfully>

In [220]:
# Make predictions on the test set
test_sentences = nlp_test["Sentence"].values
# Iterate over the test sentences and make predictions
predicted_sentences = []
for sentence in test_sentences:
    predicted_sentence = predict_test_sentence(sentence, model)
    predicted_sentences.append(predicted_sentence)

[3, 3, 0, 3, 3, 1, 3, 1, 3, 1, 0, 3, 3, 0, 3, 1, 3, 1, 3, 1, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 1, 0, 3, 2, 3, 3, 1, 3, 3, 0, 3, 1, 3, 0, 3, 3, 0, 1, 3, 1, 1, 3, 0, 1, 3, 3, 3, 3, 3, 1, 1, 0, 3, 3, 3, 1, 3, 3, 3, 0, 3, 3, 0, 2, 3, 3, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2, 3, 3, 0, 1, 1, 3, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 3, 0, 1, 2, 3, 1, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 3, 3, 3, 2, 0, 3, 1, 3, 0, 1, 2, 3, 2, 2, 3, 3, 1, 1, 0, 1, 3, 3, 1, 3, 3, 2, 0, 3, 1, 3, 

In [221]:
zemberek_predicted_sentences = check_morphology(predicted_sentences)

2024-05-07 02:06:54,380 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 4.118849754333496

No analysis results for: nin
Suggested word: bin
Suggested word: çin
Suggested word: in
Suggested word: din
Suggested word: kin
Suggested word: ni
Suggested word: nün
Suggested word: nil
Suggested word: cin
Suggested word: nine
Suggested word: inin
Suggested word: nino
Suggested word: niş
Suggested word: nina
Suggested word: fin
Suggested word: sin
Suggested word: fin
Suggested word: min
Suggested word: tin
Suggested word: pin
Suggested word: jin
Suggested word: win
Suggested word: pnin
Suggested word: pnin
Suggested word: enin
Suggested word: ynin
Suggested word: vnin
Suggested word: znin
Suggested word: mnin
Suggested word: snin
Suggested word: fnin
Suggested word: zin
Suggested word: cnin
Suggested word: bnin
Suggested word: anin
Suggested word: dnin
Suggested word: rnin
Suggested word: şnin
Suggested word: tnin
Suggested word: nim
Suggested word:

In [222]:
# for i in range(len(predicted_results)):
#     # Check if length of the predicted sentence is different than the original sentence
#     if len(predicted_results[i]) != len(test_sentences[i]):
#         print("Original:", test_sentences[i])
#         print("Predicted:", predicted_results[i])
#         print()

In [223]:
# Save the predictions to a CSV file, it will have two columns: "ID" and "Sentence"
output_df = pd.DataFrame({"ID": nlp_test.index, "Sentence": zemberek_predicted_sentences})

In [224]:
# Save the dataframe to a CSV file
output_df.to_csv("predictions.csv", index=False)