In [1]:
import jsonlines
import re
import requests
import pyarabic.araby as araby

### Preprocessing 

In [2]:
with jsonlines.open("highlight_assault.jsonl", "r") as f:
    assault = list(f.iter())
	
def actor_spans(entry):
    chunks = []
    for s in entry['spans']:
        if s['label'] in ['SOURCE_ACTOR', 'TARGET_ACTOR']:
            chunk = entry['text'][s['start']:s['end']+1]
            chunks.append(chunk)
    return chunks
            
actor_spans(assault[55])

all_chunks = []
for entry in assault:
    try:
        c = actor_spans(entry)
        if c:
            all_chunks.extend(c)
    except Exception as e:
        print(e)
		
        
len(all_chunks)
len(set(all_chunks))

'spans'
'spans'
'spans'
'spans'
'spans'
'spans'
'spans'
'spans'
'spans'
'spans'


856

In [3]:
# Get data from udpipe
def udpipe(string) :
    # Prepping String 
    words  = re.findall(u'[\u0600-\u06FF]+', string) #getting arabic characters 
    data = ' '.join(words)
    
    pipe_base_url = 'http://lindat.mff.cuni.cz/services/udpipe/api/process?tokenizer&tagger' 
    attributes = {} 
    attributes['model'] = 'arabic-ud-2.0-170801'
    attributes['data'] = data

    data = requests.get(pipe_base_url , attributes)
    result = data.json()['result'].split('\n')
    udpipe_results = [re.findall(u'[\u0600-\u06FF]+', i) for i in result] # cleaning 
    udpipe_results = [i for i in udpipe_results if i !=[]]
    
    return udpipe_results

def udpipe_reconstruct(original_text) : 
    udpipe_results = udpipe(original_text)
    for i in range(1, len(udpipe_results)):
        if len(udpipe_results[i]) == 1 : # composite verbs get the next two words
            original_text =original_text.replace(udpipe_results[i][0] , '{} {}'.format(udpipe_results[i+1][1],udpipe_results[i+2][1]))
        else :
            original_text =original_text.replace(udpipe_results[i][0] , udpipe_results[i][1])
    original_text = araby.strip_tashkeel(original_text)
    return original_text


def master_reconstruct_input(text_input , input_type): 
    # type 0: none-rule just do just do udpipe_reconstruction
    # type 1 : fix the rule + do udpipe_reconstruction
    if input_type == 0:
        return udpipe_reconstruct(text_input)
    else : 
        return udpipe_reconstruct(rule(text_input))

In [8]:
chunks_unique = set(all_chunks)
len(chunks_unique)
list(chunks_unique)[:10]

['مهاجم انتحاري ',
 'العراقيين ',
 'قوة من الشرطة العراقية ',
 'جنديا باكستانيا ',
 'الجيش الأميركي ',
 'مجهولين ',
 'التحالف الدولي بقيادة الولايات المتحدة في افغانستان،',
 'مقاتلي ',
 'وكيل وزارة النفط العراقية عبد الجبار الوكاع ',
 'المتشددين ']

In [5]:
chunks_unique_udpipe = [] 
for i in chunks_unique : 
    chunks_unique_udpipe.append(master_reconstruct_input(i ,0))

In [6]:
len(chunks_unique_udpipe)

856

In [7]:
list(chunks_unique_udpipe)[:10]

['مهاجم انتحاري ',
 'عراقي ',
 'قوة من شرطة عراقي ',
 'جندي باكستانيا ',
 'جيش أميركي ',
 'مجهول ',
 'تحالف دولي ب قيادة ولاية متحد في أفغانستان،',
 'مقاتل ',
 'وكيل وزارة نفط عراقي عبد الجبار الوكاع ',
 'متشدد ']

In [10]:
with open('actors_txt_file.txt',  'w') as outputf :
    outputf.write('<Sentences>\n')
    for i in range(len(chunks_unique_udpipe )): 
        outputf.write('<Sentence date = "20000715" id="{}" source = "afp" sentence = "True">\n'.format(i))
        outputf.write('<Text>\n{}\n</Text>\n'.format(chunks_unique_udpipe[i]))
        outputf.write('</Sentence>\n')
    outputf.write('</Sentences>')


### proprocess the actor_txt_file.txt using UP preprocessing, then run the actor_code_extraction code to get the following results 

#### each element is actors_results is a list of 3 elements : Sentence (original Text) [0] , Matched Text[1], Code[2] 

In [2]:
actors_results = [] 
with open('out_actors'  , 'r') as inputf: 
    for i in inputf : 
        actors_results.append(i.replace('\n','').split(','))
actors_results[:10]

[[' مهاجم انتحاري ', ' انتحاري', '~REB'],
 [' عراقي ', '---', '---'],
 [' قوة من شرطة عراقي ', ' شرطة عراقي', 'IRQCOP'],
 [' جندي باكستانيا ', '---', '---'],
 [' جيش أميركي ', '---', '---'],
 [' مجهول ', '---', '---'],
 [' تحالف دولي ب قيادة ولاية متحد في أفغانستان، ', ' ولاية متحد', 'USALEG'],
 [' مقاتل ', '---', '---'],
 [' وكيل وزارة نفط عراقي عبد الجبار الوكاع ', ' وزارة', 'GOV'],
 [' متشدد ', '---', '---']]

In [8]:
#Filter stuff 
extracted_actors_with_code = [] 
extracted_actors_with_out_code =[] 

for i in actors_results : 
    if i[2] != '---' : 
        extracted_actors_with_code.append(i)
    else : 
        extracted_actors_with_out_code.append(i)

print('Total Number of actors processed : {}'.format(len(actors_results)))
print('Extracted Actors with code : {} | {}%'.format(len(extracted_actors_with_code) , len(extracted_actors_with_code)/len(actors_results)))
print('Extracted Actors with out code : {} | {}%'.format(len(extracted_actors_with_out_code) , len(extracted_actors_with_out_code)/len(actors_results)))

Total Number of actors process : 856
Extracted Actors with code : 223 | 0.2605140186915888%
Extracted Actors with out code : 633 | 0.7394859813084113%
