In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
from camel_tools.utils.normalize import normalize_unicode, normalize_alef_maksura_ar, normalize_alef_ar, normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
from camel_tools.tokenizers.word import simple_word_tokenize
from nltk.stem.isri import ISRIStemmer
import csv

In [37]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
from camel_tools.utils.normalize import normalize_unicode, normalize_alef_maksura_ar, normalize_alef_ar, normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
from camel_tools.tokenizers.word import simple_word_tokenize
from nltk.stem.isri import ISRIStemmer
import csv

def write_json_IOB(texts, texts_aspects, outputpath, normalize=True):
    '''
    texts: a list of input texts
    texts_aspects: a list containing lists of aspects for each input text
    outputpath: path to save the data
    normalize: boolean flag to indicate whatever to normalize input arabic text
    '''
    tokens = []
    tags = []
    for i in range(len(texts)):
        drop = False
        aspects = texts_aspects[i]
        text = texts[i]
        # process the aspect so it can be used later on to create the target
        if normalize:
            for j, aspect in enumerate(aspects[:]):
                aspects[j] = normalize_unicode(aspects[j])
                # Normalizing alef variants to (ا)
                aspects[j] = normalize_alef_ar(aspects[j])
                # Normalizing alef maksura (ى) to yeh (ي)
                aspects[j] = normalize_alef_maksura_ar(aspects[j])
                # Normalizing teh marbuta (ة) to heh (ه)
                aspects[j] = normalize_teh_marbuta_ar(aspects[j])
                # removing Arabic diacritical marks
                aspects[j] = dediac_ar(aspects[j])
                # aspects[j] = [st.stem(word) for word in aspects[j]]
            # normalize input text
            text = normalize_unicode(text)
            # Normalizing alef variants to (ا)
            text = normalize_alef_ar(text)
            # Normalizing alef maksura (ى) to yeh (ي)
            text = normalize_alef_maksura_ar(text)
            # Normalizing teh marbuta (ة) to heh (ه)
            text = normalize_teh_marbuta_ar(text)
            # removing Arabic diacritical marks
            text = dediac_ar(text)

        # split each aspect
        for j, aspect in enumerate(aspects[:]):
            aspects[j] = simple_word_tokenize(aspects[j])  
        # split the text 
        text_split = simple_word_tokenize(text)
        # text_split = [st.stem(word) for word in text_split]
        
        # create target list where the start of the aspect is 1, the inside is 2, and non aspects are 0
        row_tags = np.zeros((len(text_split),), dtype=np.int16)
        for aspect in aspects:
            # assgin tags for the aspect
            for i, word in enumerate(aspect):
                try:
                    # assign one for the start of the aspect
                    if i == 0:
                        row_tags[text_split.index(word)] = 1
                    # assign 2 for the remaining words of the aspect
                    else:
                        row_tags[text_split.index(word)] = 2
                except ValueError:
                    drop = True
        if not drop:                         
            tokens.append(text_split)
            tags.append(row_tags.flatten().tolist())


    dict = {'Tokens': tokens, 'Tags': tags}     
    df = pd.DataFrame(dict) 

    df.to_json(outputpath) 

In [1]:
import pandas as pd
from IOB_to_json import write_json_IOB
write_json_IOB(['تجربة الان', 'تجربة للمستقبل'], [['تجربة'], ['تجربة للمستقبل']], 'dataset/arabic_train.json', normalize=False)
pd.read_json('dataset/arabic_train.json')

Unnamed: 0,Tokens,Tags
0,"[تجربة, الان]","[1, 0]"
1,"[تجربة, للمستقبل]","[1, 2]"


In [13]:
training_data = pd.read_csv('dataset/arabic_train.csv')

In [14]:
print(training_data.shape)
training_data.head()

(3603, 3)


Unnamed: 0,Tokens,Tags,Polarities
0,"['انصح', 'بالنوم', 'وليس', 'تناول', 'الطعام', ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1..."
1,"['فندق', 'يتميز', 'بمرافق', 'نوعيه', 'وخلاقه',...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1..."
2,"['لمسه', 'بحريه', 'جميل', 'وظريفه', '،', 'فندق...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1]"
3,"['ساوصي', 'بالتاكيد', 'بموقع', 'المدينه', 'الق...","[0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0]","[-1, -1, -1, 2, 2, -1, -1, -1, -1, -1, 2, 2, -..."
4,"['فريق', 'العمل', 'الودود', 'والمتعاون', 'علي'...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,..."


In [15]:
write_csv('data/AR_HOTE_SB1_TEST.xml.gold', 'dataset/arabic_test.csv')

In [16]:
testing_data = pd.read_csv('dataset/arabic_train.csv')

In [17]:
print(testing_data.shape)
testing_data.head()

(3603, 3)


Unnamed: 0,Tokens,Tags,Polarities
0,"['انصح', 'بالنوم', 'وليس', 'تناول', 'الطعام', ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1..."
1,"['فندق', 'يتميز', 'بمرافق', 'نوعيه', 'وخلاقه',...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1..."
2,"['لمسه', 'بحريه', 'جميل', 'وظريفه', '،', 'فندق...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1]"
3,"['ساوصي', 'بالتاكيد', 'بموقع', 'المدينه', 'الق...","[0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0]","[-1, -1, -1, 2, 2, -1, -1, -1, -1, -1, 2, 2, -..."
4,"['فريق', 'العمل', 'الودود', 'والمتعاون', 'علي'...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,..."


seems like there's a problem in the data the aspects sometimes doesn't match the text for example an aspect could be والفندق but in the text it's الفندق and many other diffrences, even after stemming it got down from 300 to 100 but there's still some spelling and other possible mistakes.

to ensure the data quality, and avoid any possible mistake in the data, any row with an error would be dropped.