In [1]:

import re
from collections import deque
import requests
import datetime
import csv
from tqdm import tqdm
import json
import pickle

In [2]:
#Loading actors_id_set : a set that contains the id of all the pages that we have scrapped before
#to avoid duplicates. 
actors_ids_set = pickle.load( open( "wiki_actors_page_ids.p", "rb" ) )
len(actors_ids_set)

6166

In [4]:
def extract_category_members(cmtitle, lang):
    global actors_ids_set
    wiki_base_url = 'http://en.wikipedia.org/w/api.php' if lang == 'en' else 'http://ar.wikipedia.org/w/api.php'
    attributes = {} 
    attributes['action'] ='query'
    attributes['list'] ='categorymembers' #Person name
    attributes['cmtitle'] = cmtitle
    attributes['cmlimit'] = 500
    attributes['cmtype'] = 'page'
    attributes['format'] = 'json'
    
    data = requests.get(wiki_base_url , attributes)
    data_json = data.json()
    ministers =[]
    for i in data_json['query']['categorymembers']:
        if i['pageid'] not in actors_ids_set : 
            actors_ids_set.add(i['pageid'])
            if lang == 'en':
                ministers.append({"en_title": i['title'] , "en_pageid" : i['pageid'],"ar_title": '' , "ar_pageid" : ''   })
            else:
                ministers.append({"en_title": '' , "en_pageid" : '',"ar_title": i['title'] , "ar_pageid" : i['pageid']  })
        
            
    return ministers
        

def get_lang_links(person_name): 
    wiki_base_url = 'http://ar.wikipedia.org/w/api.php'
    attributes = {} 
    attributes['action'] ='query'
    attributes['titles'] =person_name #Person name
    attributes['prop'] = 'langlinks'
    attributes['format'] = 'json'
    person_lang_links = requests.get(wiki_base_url , attributes)
    data = person_lang_links.json()
    #check if the person has any other pages with other languages
 
    if list(data['query']['pages'].keys())[0] == '-1' : #person not found
        return -1
    else : # get arabic  name or any other langue
        try:
#             print (list(data['query']['pages'].keys())[0])
            page_id = list(data['query']['pages'].keys())[0]
            links = data['query']['pages'][list(data['query']['pages'].keys())[0]]['langlinks']
#             return links
            target_link =[]
            target_link = [x['*'] for x in links if x['lang'] =='en']
            if target_link == []:
                return [-2, 0] #unique
            
            return [target_link[0], page_id]
        except KeyError:
            return [-2,0]  # don't have english link : unique 


# test_title = 'تصنيف:وزراء_حكومة_الإمارات_العربية_المتحدة'
# x = extract_category_members(test_title , 'ar')

# test_title2 = 'Category:Government_ministers_of_the_United_Arab_Emirates'
# x2 = extract_category_members(test_title2 , 'en')

In [5]:
#read text file 
#we could do the processing line by line, but we have a relativly small number of lines 
 #so load them all at onces
all_unique_actors = [] # our gold
# actors_ids_set = set()
wiki_links = []
with open('wiki_links3.csv', 'r') as f:
    reader = csv.reader(f)
    wiki_links = list(reader)
wiki_links

print("Processing .......")


wiki_par = tqdm(wiki_links);

for i in wiki_par:
    wiki_par.set_description("Processing %s" % i[1].strip())
    
    if i[0] =='0' : # it is a category for Governments
        #Processing English Names
        ministers = extract_category_members(i[1] ,'en')
        #Processing Arabic 
        
        if(i[2].strip() == 'NAN') : #does not have an equivalent arabic page , just add them
            all_unique_actors.extend(ministers)
            pass
        
        else : # check the correspond arabic ones 
#             print("***** : " , i[1], "ss" ,  i[2])
            ar_ministers = extract_category_members(i[2] ,'ar')
            for j in ar_ministers : 
                result = get_lang_links(j['ar_title'])
                if result[0] == -2 : # unique entry : add 
                    all_unique_actors.append(j)
                else : # it's not unique and we should already have it + append to original data
                    target_index = next((index for (index, d) in enumerate(ministers) if d["en_title"] == result[0]), None)
                    if target_index == None : # unique, wasn't caught earlier 
                        if result[0] != -1 : 
                            all_unique_actors.append({"en_title": '' , "en_pageid" : '',"ar_title": result[0] , "ar_pageid" : result[1]  })     
                    else : 
                        ministers[target_index]['ar_title'] = j['ar_title']
                        ministers[target_index]['ar_pageid'] = j['ar_pageid']
        all_unique_actors.extend(ministers)
    



  0%|          | 0/23 [00:00<?, ?it/s]

Processing .......


Processing Category:Heads_of_government_of_the_Russian_Federation: 100%|██████████| 23/23 [01:42<00:00,  4.72s/it]                    


In [6]:
len(all_unique_actors)

1361

In [7]:
len(actors_ids_set)

7316

In [8]:
#check for duplicates (if any and remove)
#english
x = [ x['en_title'] for x in all_unique_actors if x['en_title'] !='']
z= set(x)
print(len(x) == len(z))
#arabic
x = [ x['ar_title'] for x in all_unique_actors if x['ar_title'] !='']
z= set(x)
print(len(x) == len(z))

False
True


In [9]:
#cleaning duplicate arabic data 
from collections import Counter
dup = [k for k,v in Counter(x).items() if v>1]

[]

In [10]:
for i in dup : 
    target_index = next((index for (index, d) in enumerate(all_unique_actors) if d["ar_title"] == dup[0]), None)
    del all_unique_actors[target_index]


In [56]:
#writing final output and saving progress

with open('wiki_actors_3', 'w') as fout:
    json.dump(all_unique_actors, fout)
    

pickle.dump(actors_ids_set , open('wiki_actors_page_ids.p', 'wb'))

In [None]:
#saving our actors set to be used next time 

In [18]:
#wiki_links 1 : 
# 1449
# 2185
print(len(all_unique_actors))
print(len(actors_ids_set))

1449
2185
