<a href="https://colab.research.google.com/github/mohamednabilabdelfattah/AVR-drivers/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import pickle
import numpy as np
import re
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(device)

cuda


In [4]:
# load data
with open('dataset/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_diacritized = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open('dataset/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open('dataset/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_diacritized = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open('dataset/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data = re.compile(r'[\n\r\t\s]').sub('', file.read())

In [5]:
# define the diacritics unicode and their corresponding labels classes indices
# note that index 0 is reserved for no diacritic
labels = {
    # no diacritic
    0: 0,
    # fath
    1614: 1,
    # damm
    1615: 2,
    # kasr
    1616: 3,
    # shadd
    1617: 4,
    # sukun
    1618: 5,
    # tanween bel fath
    1611: 6,
    # tanween bel damm
    1612: 7,
    # tanween bel kasr
    1613: 8,
    # shadd and fath
    (1617, 1614): 9,
    # shadd and damm
    (1617, 1615): 10,
    # shadd and kasr
    (1617, 1616): 11,
    # shadd and tanween bel fath
    (1617, 1611): 12,
    # shadd and tanween bel damm
    (1617, 1612): 13,
    # shadd and tanween bel kasr
    (1617, 1613): 14
}

indicies_to_labels = {
    # no diacritic
    0: 0,
    # fath
    1: 1614,
    # damm
    2: 1615,
    # kasr
    3: 1616,
    # shadd
    4: 1617,
    # sukun
    5: 1618,
    # tanween bel fath
    6: 1611,
    # tanween bel damm
    7: 1612,
    # tanween bel kasr
    8: 1613,
    # shadd and fath
    9: (1617, 1614),
    # shadd and damm
    10: (1617, 1615),
    # shadd and kasr
    11: (1617, 1616),
    # shadd and tanween bel fath
    12: (1617, 1611),
    # shadd and tanween bel damm
    13: (1617, 1612),
    # shadd and tanween bel kasr
    14: (1617, 1613)
}


In [6]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))
diacritization = list(labels.keys())

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

print(char_to_index)

def text_to_sequence(text):
    return [char_to_index[char] for char in text]

train_sequence = text_to_sequence(training_data)
validation_sequences = text_to_sequence(validation_data)

print("Number of unique characters: ", len(unique_chars))
print(unique_chars)
print(train_sequence[:10])

{'ى': 0, 'ر': 1, 'ا': 2, 'ث': 3, 'ي': 4, 'ذ': 5, 'ت': 6, 'ئ': 7, 'ط': 8, 'غ': 9, 'ء': 10, 'ؤ': 11, 'ز': 12, 'ظ': 13, 'ك': 14, 'آ': 15, 'ة': 16, 'و': 17, 'ص': 18, 'د': 19, 'أ': 20, 'ن': 21, 'ج': 22, 'ق': 23, 'م': 24, 'ل': 25, 'ع': 26, 'خ': 27, 'س': 28, 'إ': 29, 'ب': 30, 'ه': 31, 'ش': 32, 'ح': 33, 'ض': 34, 'ف': 35}
Number of unique characters:  36
{'ى', 'ر', 'ا', 'ث', 'ي', 'ذ', 'ت', 'ئ', 'ط', 'غ', 'ء', 'ؤ', 'ز', 'ظ', 'ك', 'آ', 'ة', 'و', 'ص', 'د', 'أ', 'ن', 'ج', 'ق', 'م', 'ل', 'ع', 'خ', 'س', 'إ', 'ب', 'ه', 'ش', 'ح', 'ض', 'ف'}
[23, 17, 25, 31, 20, 17, 23, 8, 26, 2]


In [None]:
training_data_labels = []
training_size = len(training_data_diacritized)
index = 0
while index < training_size:
    if ord(training_data_diacritized[index]) not in labels:
        # char is not a diacritic
        if (index + 1) < training_size and ord(training_data_diacritized[index + 1]) in labels:
            # char has a diacritic
            if ord(training_data_diacritized[index + 1]) == 1617:
                # char has a shadd diacritic
                if (index + 2) < training_size and ord(training_data_diacritized[index + 2]) in labels:
                    # char has a shadd and another diacritic
                    training_data_labels.append(labels[(1617, ord(training_data_diacritized[index + 2]))])
                    # skip next 2 diacritics chars
                    index += 3  # increment by 3 to skip two diacritic chars
                    continue
                else:
                    # char has a shadd and no other diacritic
                    training_data_labels.append(labels[1617])
                    # skip next diacritic char
                    index += 2
                    continue
            # char has a diacritic other than shadd
            training_data_labels.append(labels[ord(training_data_diacritized[index + 1])])
            # skip next diacritic char
            index += 2  # increment by 2 to skip one diacritic char
            continue
        else:
            # char has no diacritic
            training_data_labels.append(0)
    index += 1  # increment by 1 for normal iteration


In [None]:
print(len(training_data_labels))
print(len(train_sequence))

In [None]:
# Create the training sequences
sequence_length = 5  # Choose an appropriate sequence length
X = [train_sequence[i:i+sequence_length] for i in range(len(train_sequence)-sequence_length+1)]
y = training_data_labels[sequence_length-1:]
X = torch.tensor(X)
y = torch.tensor(y)
print(X.shape)
print(y.shape)




In [None]:
# create the validation sequences
validation_data_labels = []
validation_size = len(validation_data_diacritized)
index = 0
while index < validation_size:
    if ord(validation_data_diacritized[index]) not in labels:
        # char is not a diacritic
        if (index + 1) < validation_size and ord(validation_data_diacritized[index + 1]) in labels:
            # char has a diacritic
            if ord(validation_data_diacritized[index + 1]) == 1617:
                # char has a shadd diacritic
                if (index + 2) < validation_size and ord(validation_data_diacritized[index + 2]) in labels:
                    # char has a shadd and another diacritic
                    validation_data_labels.append(labels[(1617, ord(validation_data_diacritized[index + 2]))])
                    # skip next 2 diacritics chars
                    index += 3  # increment by 3 to skip two diacritic chars
                    continue
                else:
                    # char has a shadd and no other diacritic
                    validation_data_labels.append(labels[1617])
                    # skip next diacritic char
                    index += 2
                    continue
            # char has a diacritic other than shadd
            validation_data_labels.append(labels[ord(validation_data_diacritized[index + 1])])
            # skip next diacritic char
            index += 2  # increment by 2 to skip one diacritic char
            continue
        else:
            # char has no diacritic
            validation_data_labels.append(0)
    index += 1  # increment by 1 for normal iteration


In [None]:
print(len(validation_data_labels))
print(len(validation_sequences))

In [None]:
# Create the validation sequences
sequence_length = 5  # Choose an appropriate sequence length
X_val = [validation_sequences[i:i+sequence_length] for i in range(len(validation_sequences)-sequence_length+1)]
y_val = validation_data_labels[sequence_length-1:]
X_val = torch.tensor(X_val)
y_val = torch.tensor(y_val)
print(X_val.shape)
print(y_val.shape)


In [None]:
# Step 4: Define Dataset and DataLoader
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return torch.tensor(self.X[index]), torch.tensor(self.y[index])

train_dataset = CustomDataset(X, y)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
# Step 5: Model Definition
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out)
        return out

In [None]:
model = SimpleRNN(input_size=5, hidden_size=64, output_size=15)

In [None]:
# Step 6: Loss Function, Optimizer, Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 1
for epoch in range(epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Step 7: Evaluation
with torch.no_grad():
    model.eval()
    test_inputs = torch.tensor(X_val)
    test_labels = torch.tensor(y_val)
    test_outputs = model(test_inputs.float())
    _, predicted = torch.max(test_outputs, 1)
    accuracy = (predicted == test_labels).sum().item() / len(test_labels)

print("Test Accuracy:", accuracy)