In [10]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
from camel_tools.utils.normalize import normalize_unicode, normalize_alef_maksura_ar, normalize_alef_ar, normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
from camel_tools.tokenizers.word import simple_word_tokenize
from nltk.stem.isri import ISRIStemmer
import csv

In [11]:
def write_csv(filepath, outputpath):
    # st = ISRIStemmer()
    # Reading the data inside the xml
    with open(filepath, 'r', encoding="utf8") as f:
        data = f.read()

    # Passing the stored data inside
    # the beautifulsoup parser
    Bs_dataset = BeautifulSoup(data, "xml")
    tokens = []
    tags = []
    polarities = []
    for sentence in Bs_dataset.find_all('sentence'):
        drop = False
        aspects = [x['target'] for x in sentence.find_all('Opinion') if x['target'] != 'NULL']
        # print(aspects)  
        # process the aspect so it can be used later on to create the target
        for i, aspect in enumerate(aspects[:]):
            aspects[i] = normalize_unicode(aspects[i])
            # Normalizing alef variants to (ا)
            aspects[i] = normalize_alef_ar(aspects[i])
            # Normalizing alef maksura (ى) to yeh (ي)
            aspects[i] = normalize_alef_maksura_ar(aspects[i])
            # Normalizing teh marbuta (ة) to heh (ه)
            aspects[i] = normalize_teh_marbuta_ar(aspects[i])
            # removing Arabic diacritical marks
            aspects[i] = dediac_ar(aspects[i])
            # split each aspect
            aspects[i] = simple_word_tokenize(aspects[i])  
            # aspects[i] = [st.stem(word) for word in aspects[i]]

        text = sentence.text
        # process the text 
        text = normalize_unicode(text)
        # Normalizing alef variants to (ا)
        text = normalize_alef_ar(text)
        # Normalizing alef maksura (ى) to yeh (ي)
        text = normalize_alef_maksura_ar(text)
        # Normalizing teh marbuta (ة) to heh (ه)
        text = normalize_teh_marbuta_ar(text)
        # removing Arabic diacritical marks
        text = dediac_ar(text)
        # split the text 
        text_split = simple_word_tokenize(text)

        # text_split = [st.stem(word) for word in text_split]
        
        # create target list where the start of the aspect is 1, the inside is 2, and non aspects are 0
        row_tags = np.zeros((len(text_split),), dtype=np.int16)
        for aspect in aspects:
            # if the aspct contain multiaple words 
            for i, word in enumerate(aspect):
                try:
                    # assign one for the start of the aspect
                    if i == 0:
                        row_tags[text_split.index(word)] = 1
                    # assign 2 for the remaining words of the aspect
                    else:
                        row_tags[text_split.index(word)] = 2
                except ValueError:
                    drop = True
                    # print(word)
        if not drop:
            sentiments = [x['polarity'] for x in sentence.find_all('Opinion') if x['target'] != 'NULL']
            # create target list where the start of the aspect is 1, the inside is 2, and non aspects are 0
            row_polarities = np.empty((len(text_split),), dtype=np.int16)
            row_polarities[:] = -1
            for aspect, aspect_sentiment in zip(aspects, sentiments):
                # if the aspct contain multiaple words 
                for i, word in enumerate(aspect):
                    try:
                        if aspect_sentiment == 'positive':
                            row_polarities[text_split.index(word)] = 2
                        elif aspect_sentiment == 'neutral':
                            row_polarities[text_split.index(word)] = 1
                        elif aspect_sentiment == 'negative':
                            row_polarities[text_split.index(word)] = 0
                    except ValueError:
                        print('unexpected error')
                                     
            tokens.append(text_split)
            tags.append(row_tags.flatten().tolist())
            polarities.append(row_polarities.flatten().tolist())


    dict = {'Tokens': tokens, 'Tags': tags, 'Polarities': polarities}     
    df = pd.DataFrame(dict) 
    # print(len(tokens))
    # print(len(tags))
    # print(len(polarities))
    # print(tokens[:3])
    # print(tokens[:3])
    # print(rows[:3])
    # print(text_split)
    # print(aspects)
    # print(target)
    # saving the dataframe 
    df.to_csv(outputpath, index=False) 

In [12]:
write_csv('data/Arabic_Hotels_TrD_V2.xml', 'dataset/arabic_train.csv')

In [13]:
training_data = pd.read_csv('dataset/arabic_train.csv')

In [14]:
print(training_data.shape)
training_data.head()

(3603, 3)


Unnamed: 0,Tokens,Tags,Polarities
0,"['انصح', 'بالنوم', 'وليس', 'تناول', 'الطعام', ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1..."
1,"['فندق', 'يتميز', 'بمرافق', 'نوعيه', 'وخلاقه',...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1..."
2,"['لمسه', 'بحريه', 'جميل', 'وظريفه', '،', 'فندق...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1]"
3,"['ساوصي', 'بالتاكيد', 'بموقع', 'المدينه', 'الق...","[0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0]","[-1, -1, -1, 2, 2, -1, -1, -1, -1, -1, 2, 2, -..."
4,"['فريق', 'العمل', 'الودود', 'والمتعاون', 'علي'...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,..."


In [15]:
write_csv('data/AR_HOTE_SB1_TEST.xml.gold', 'dataset/arabic_test.csv')

In [16]:
testing_data = pd.read_csv('dataset/arabic_train.csv')

In [17]:
print(testing_data.shape)
testing_data.head()

(3603, 3)


Unnamed: 0,Tokens,Tags,Polarities
0,"['انصح', 'بالنوم', 'وليس', 'تناول', 'الطعام', ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1..."
1,"['فندق', 'يتميز', 'بمرافق', 'نوعيه', 'وخلاقه',...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1..."
2,"['لمسه', 'بحريه', 'جميل', 'وظريفه', '،', 'فندق...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1]"
3,"['ساوصي', 'بالتاكيد', 'بموقع', 'المدينه', 'الق...","[0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0]","[-1, -1, -1, 2, 2, -1, -1, -1, -1, -1, 2, 2, -..."
4,"['فريق', 'العمل', 'الودود', 'والمتعاون', 'علي'...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,..."


seems like there's a problem in the data the aspects sometimes doesn't match the text for example an aspect could be والفندق but in the text it's الفندق and many other diffrences, even after stemming it got down from 300 to 100 but there's still some spelling and other possible mistakes.

to ensure the data quality, and avoid any possible mistake in the data, any row with an error would be dropped.