In [37]:
import pandas as pd
import sys
import os
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if path not in sys.path:
    sys.path.append(path)
from constants import NER
# get the first key in NERa
from fuzzywuzzy import fuzz
from constants import NER
from fuzzywuzzy import fuzz
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
# import sklearn_crfsuite
import sklearn_crfsuite
from joblib import Parallel, delayed
import multiprocessing
from transformers import pipeline
from utils.helpers import combine_sub_words
from transformers import AutoTokenizer, AutoModelForTokenClassification


In [38]:
df = pd.read_csv('../data/data_for_ner.csv')

In [39]:
df = df.sample(10000)

In [40]:
def process_word(word, NER):
    for key in NER.keys():
        if any(fuzz.ratio(word, k) >= 80 for k in key):
            return NER[key]
    return "O"

In [41]:
def generate_ner_string(text, NER):
    num_cores = multiprocessing.cpu_count()
    words = text.split()
    results = Parallel(n_jobs=num_cores)(
        delayed(process_word)(word, NER) for word in words)
    return " ".join(results)

In [42]:
df['ner'] = Parallel(n_jobs=-1)(
    delayed(generate_ner_string)(text, NER) for text in df['full_text'])

In [43]:
model_name = "CAMeL-Lab/bert-base-arabic-camelbert-ca-pos-egy"
tokenizer = AutoTokenizer.from_pretrained(model_name) # how to make the tokinzation on split by space?

pos_pipeline = pipeline("ner", model=model_name, tokenizer=tokenizer)

In [44]:
# remove any punctuation from the full text
df['full_text'] = df['full_text'].str.replace(r'[^\u0600-\u06FF\s]+', '')
df['full_text'] = df['full_text'].str.replace(r'\?|؟|،', '')


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [45]:
pos_list = []
for text in df['full_text']:
    # remove any punctuation from the text
    res = pos_pipeline(text)
    res = combine_sub_words(res)
    while(1):
        if any('##' in r['word'] for r in res):
            res = combine_sub_words(res)
        else:
            break
    res = [r['entity'] for r in res]
    res = " ".join(res)
    pos_list.append(res)
df['pos'] = pos_list


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [46]:
df['full_text_len'] = df['full_text'].apply(lambda x: len(x.split()))
df['pos_len'] = df['pos'].apply(lambda x: len(x.split()))

In [47]:
pd.set_option('display.max_colwidth', None)
df[['full_text','pos', 'full_text_len', 'pos_len']]
# print the rows that have different length
df[df['full_text_len'] != df['pos_len']]
# remove the rows that have different length
df = df[df['full_text_len'] == df['pos_len']]

In [48]:
df.shape

(9766, 5)

In [49]:
df = df[['full_text', 'pos', 'ner']]

In [142]:
data = df

In [143]:
data['ner'] = data['ner'].astype(str)
data['ner'] = data['ner'].apply(lambda x: x.split())

In [145]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [146]:
# Split the data into training and validation datasets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
# Tokenize the input sequence
input_tokenizer = Tokenizer( oov_token='UNK')
input_tokenizer.fit_on_texts(train_data['full_text'])
train_input_seq = input_tokenizer.texts_to_sequences(train_data['full_text'])
val_input_seq = input_tokenizer.texts_to_sequences(val_data['full_text'])

In [147]:
# Tokenize the output sequence
output_tokenizer = Tokenizer(filters='', lower=False)
output_tokenizer.fit_on_texts(train_data['ner'])
train_output_seq = output_tokenizer.texts_to_sequences(train_data['ner'])
val_output_seq = output_tokenizer.texts_to_sequences(val_data['ner'])

In [148]:
# Create word to index and index to word mappings for the input sequence
input_word2idx = input_tokenizer.word_index
input_idx2word = {idx: word for word, idx in input_word2idx.items()}

In [149]:
# Create named entity to index and index to named entity mappings for the output sequence
output_word2idx = output_tokenizer.word_index
output_idx2word = {idx: word for word, idx in output_word2idx.items()}

In [150]:
# Pad the input sequence
max_input_len = max(len(seq) for seq in train_input_seq)
train_input_seq = pad_sequences(train_input_seq, maxlen=max_input_len, padding='post')
val_input_seq = pad_sequences(val_input_seq, maxlen=max_input_len, padding='post')

In [151]:
max_input_len
# Pad the output sequence
max_output_len = max(len(seq) for seq in train_output_seq)
train_output_seq = pad_sequences(train_output_seq, maxlen=max_output_len, padding='post')
val_output_seq = pad_sequences(val_output_seq, maxlen=max_output_len, padding='post')
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

In [152]:
# Define the input and output dimensions for the model
input_dim = len(input_word2idx) + 1
output_dim = len(output_word2idx) + 1
# Define the embedding dimension
embedding_dim = 128

In [153]:
# Define the number of hidden units in the LSTM layer
hidden_units = 256
# import bidirectional LSTM
from tensorflow.keras.layers import Bidirectional
# import concatenate
from tensorflow.keras.layers import Concatenate,Input

In [154]:

from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.models import Model


In [155]:
# Define the input layer
inputs = Input(shape=(max_input_len,))

# Define the embedding layer
embedding = Embedding(input_dim=input_dim, output_dim=embedding_dim)(inputs)

# Define the Bidirectional LSTM layer
lstm = Bidirectional(LSTM(hidden_units, return_sequences=True) )(embedding)

# Define the output layer
outputs = Dense(output_dim, activation='softmax')(lstm)

# Define the model
model = Model(inputs=inputs, outputs=outputs)

In [157]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_input_seq, train_output_seq, validation_data=(val_input_seq, val_output_seq), batch_size=1024, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [159]:
import numpy as np
# Use the model to make predictions on new input data
text = "طريق حوارة مسكر"
new_input_seq = input_tokenizer.texts_to_sequences([text])
new_input_seq = pad_sequences(new_input_seq, maxlen=max_input_len, padding='post')

pred_output_seq = model.predict(new_input_seq)
pred_output_seq = np.argmax(pred_output_seq, axis=-1)

pred_named_entities = [output_idx2word.get(idx, '') for idx in pred_output_seq[0]]
print(pred_named_entities)
# map each word to its predicted named entity
for word, ner in zip(text.split(), pred_named_entities):
    print(f"{word} -> {ner}")

['B-LOC', 'B-LOC', 'STAT', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
طريق -> B-LOC
حوارة -> B-LOC
مسكر -> STAT
