In [2]:
import pandas as pd
import numpy as np
import pyarabic as araby
import tensorflow as tf

In [3]:
file_path = 'arramooz-master/verbs_with_madhi_mudhori_amar.csv'
df = pd.read_csv(file_path, encoding='utf-8')


In [4]:
# Hapus kolom 'Triliteral'
df.drop(columns=['Triliteral'], inplace=True)


In [156]:
print(df['Vocalized'].head(10))

0        أمْكَنَ
1    اِسْتَحَالَ
2          زَهَا
3         بِئْسَ
4        أَغْمَى
5         سَقَطَ
6         نِعْمَ
7         وَهَبَ
8      اِنْبَغَى
9         بَرِحَ
Name: Vocalized, dtype: object


In [None]:
from pyarabic.araby import tokenize, is_arabicrange, strip_tashkeel

def tokenize_arabic_text(text):
    if isinstance(text, list):
        text = ' '.join(text)
    return tokenize(text, conditions=is_arabicrange)

df['Vocalized'] = df['Vocalized'].apply(tokenize_arabic_text)
df['Unvocalized'] = df['Unvocalized'].apply(tokenize_arabic_text)
df['Root'] = df['Root'].apply(tokenize_arabic_text)
df['Madhi'] = df['Madhi'].apply(tokenize_arabic_text)
df['Mudhori'] = df['Mudhori'].apply(tokenize_arabic_text)
df['Amar'] = df['Amar'].apply(tokenize_arabic_text)
print(df[['Vocalized', 'Unvocalized', 'Root','Madhi', 'Mudhori', 'Amar']].head())



       Vocalized Unvocalized   Root          Madhi        Mudhori  \
0      [أمْكَنَ]      [أمكن]  [مكن]     [أَمْكَنَ]     [يُمْكِنُ]   
1  [اِسْتَحَالَ]    [استحال]  [حيل]  [اِسْتَحَالَ]  [يَسْتَحِيلُ]   
2        [زَهَا]       [زها]  [زهو]        [زَهَا]     [يَزْهِوُ]   
3       [بِئْسَ]       [بئس]  [بأس]       [بَئِسَ]     [يَبْئِسُ]   
4      [أَغْمَى]      [أغمى]  [غمي]      [أَغْمَى]      [يُغْمِي]   

           Amar  
0    [أَمْكِنْ]  
1  [اِسْتَحِلْ]  
2      [اِزْهِ]  
3    [اِبْئِسْ]  
4      [أَغْمِ]  


# Persiapan Data

In [None]:
df['Combined'] = df.apply(lambda row: [item for sublist in row.values if isinstance(sublist, (list, tuple)) for item in sublist], axis=1)

# Cetak hasil gabungan untuk memastikan outputnya
print(df['Combined'].head())

0    [أمكن, أمْكَنَ, مكن, أَمْكَنَ, يُمْكِنُ, أَمْك...
1    [استحال, اِسْتَحَالَ, حيل, اِسْتَحَالَ, يَسْتَ...
2           [زها, زَهَا, زهو, زَهَا, يَزْهِوُ, اِزْهِ]
3       [بئس, بِئْسَ, بأس, بَئِسَ, يَبْئِسُ, اِبْئِسْ]
4       [أغمى, أَغْمَى, غمي, أَغْمَى, يُغْمِي, أَغْمِ]
Name: Combined, dtype: object


In [17]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(df['Combined'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(df['Combined'])
print(word_index)
print(sequences)

{'<OOV>': 1, 'جذر': 2, '،': 3, 'خبر': 4, 'غرب': 5, 'خلف': 6, 'يمن': 7, 'أكل': 8, 'قبل': 9, 'روح': 10, 'قدم': 11, 'عدل': 12, 'عمر': 13, 'رفق': 14, 'وضع': 15, 'نعم': 16, 'ودع': 17, 'نزل': 18, 'رفع': 19, 'يسر': 20, 'قصر': 21, 'نجد': 22, 'درس': 23, 'ورع': 24, 'وجب': 25, 'وقر': 26, 'خلق': 27, 'شكل': 28, 'حضر': 29, 'نشر': 30, 'نحل': 31, 'رجع': 32, 'طلق': 33, 'سلم': 34, 'قرب': 35, 'سخن': 36, 'نقب': 37, 'قطع': 38, 'فرق': 39, 'عسر': 40, 'كبر': 41, 'شعر': 42, 'مثل': 43, 'حدث': 44, 'هجم': 45, 'حلل': 46, 'وقع': 47, 'أمر': 48, 'ألف': 49, 'أنس': 50, 'عقل': 51, 'زمر': 52, 'حمل': 53, 'نفس': 54, 'رمد': 55, 'كفل': 56, 'بشر': 57, 'ظهر': 58, 'نفر': 59, 'حسر': 60, 'قسم': 61, 'ضرب': 62, 'قدر': 63, 'رطب': 64, 'علم': 65, 'لحم': 66, 'عرض': 67, 'عرف': 68, 'حكم': 69, 'صدر': 70, 'سكن': 71, 'عظم': 72, 'وسط': 73, 'نكب': 74, 'بطن': 75, 'عتب': 76, 'كتف': 77, 'ظلم': 78, 'سكر': 79, 'خضب': 80, 'فرع': 81, 'ركب': 82, 'جنب': 83, 'حلم': 84, 'نصب': 85, 'خلص': 86, 'جلب': 87, 'قرض': 88, 'غمض': 89, 'عور': 90, 'قلع': 91, 'نصف': 

In [19]:
def padding_sequence(sequences, padding_type='post',truncating='post', maxlen=6):
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding=padding_type, truncating=truncating, maxlen=maxlen)
    return padded
padded_sequences = padding_sequence(sequences)
print(padded_sequences[0])
print(padded_sequences.shape)

[ 3801 15782   359  2770  3802  3803]
(10637, 6)


# Preprocessing Data

In [127]:
def prepare_inputs(inputs, labels=None):
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "decoder_input_ids": inputs["input_ids"],  # Provide decoder input ids
    }, labels

train_tf_dataset = train_tf_dataset.map(prepare_inputs)
val_tf_dataset = val_tf_dataset.map(prepare_inputs)

# Model Building

In [129]:
# Prepare for model training
try:
    # Train the model
    history = model.fit(
        train_tf_dataset,
        validation_data=val_tf_dataset,
        epochs=30,
        batch_size=32
    )
except Exception as e:
    history = str(e)

# Check the result of training or errors
history

Epoch 1/30
 10/532 [..............................] - ETA: 58:49 - loss: 0.4194 - accuracy: 0.8994 

'Graph execution error:\n\nDetected at node tft5_for_conditional_generation/encoder/shared/embedding_lookup defined at (most recent call last):\n  File "<frozen runpy>", line 198, in _run_module_as_main\n\n  File "<frozen runpy>", line 88, in _run_code\n\n  File "c:\\Users\\dzak\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\ipykernel_launcher.py", line 18, in <module>\n\n  File "c:\\Users\\dzak\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\traitlets\\config\\application.py", line 1075, in launch_instance\n\n  File "c:\\Users\\dzak\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\ipykernel\\kernelapp.py", line 739, in start\n\n  File "c:\\Users\\dzak\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tornado\\platform\\asyncio.py", line 205, in start\n\n  File "c:\\Users\\dzak\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\asyncio\\base_events.py", line 607, in run_forever\n\n  File "c:\\Users\\dzak\\AppData\\Lo

In [130]:
print(tokenizer.encode("Your input text here"))


[2, 55181, 203, 17323, 178, 13640, 89, 217, 263, 221, 77, 53451, 3]
