# Citizen Lab Classification Domain Matching

In [1]:
import pandas, re, os, multiprocessing, tqdm, tldextract

In [2]:
# Loads unique domains from AOL-treated dataset into aol DataFrame.
aol = pandas.read_csv('AOL-treated-unique-domains.csv',
                      low_memory=False, on_bad_lines='warn',
                      usecols=[1]).rename(columns={'0':'domain'})

In [3]:
# Loads aggregated Citizen Lab test list classification into topics DataFrame.
topics = pandas.read_csv('Citizen-Lab-Classification.csv',
                         low_memory=False, on_bad_lines='warn', usecols=[1,2])

In [4]:
display(aol)

Unnamed: 0,domain
0,tintonfallsrecreation.com
1,elevationbaptist.org
2,localchurches.org
3,fabolousworld.com
4,majestictheatre.net
...,...
1300479,suresave.com
1300480,missouricosmo.com
1300481,dimpres.com
1300482,hempel.com


In [5]:
display(topics)

Unnamed: 0,domain,topics
0,000webhost.com,'HOST'
1,0039bet.com,'GMB'
2,0039casino.com,'GMB'
3,0039casino.eu,'GMB'
4,003ms.ru,'PUBH'
...,...,...
27787,zwnews.com,'NEWS'
27788,zwwebdirectory.com,'NEWS'
27789,zyzoom.net,'ANON'
27790,zzgays.com,'PORN'


In [6]:
aol_unique = set(aol['domain'].to_list())

In [7]:
topics_unique = set(topics['domain'].to_list())

In [8]:
display(len(aol_unique.intersection(topics_unique)))

3575

In [9]:
%%time
size_1 = []
size_2 = []
size_3 = []
size_4 = []
size_5 = []
check = []

# Separate domains from AOL-treated dataset according to number of levels.
# For instance, 'gov.br' into size_2, 'saude.gov.br' into size_3.
for case in aol_unique:
    count = len(re.findall(r'\.', case))
    if count == 0:
        size_1.append(case)
    elif count == 1:
        size_2.append(case)
    elif count == 2:
        size_3.append(case)
    elif count == 3:
        size_4.append(case)
    elif count == 4:
        size_5.append(case)
    else:
        check.append(case)

print(len(size_1),len(size_2),len(size_3),len(size_4),len(size_5),len(check))

0 1194526 99442 6494 22 0
CPU times: user 1.57 s, sys: 23 ms, total: 1.59 s
Wall time: 1.59 s


In [10]:
%%time
# Takes only domains with three or more levels.
aol_unique_reduced = []
aol_unique_reduced.extend(size_3)
aol_unique_reduced.extend(size_4)
aol_unique_reduced.extend(size_5)

CPU times: user 4.16 ms, sys: 64 µs, total: 4.23 ms
Wall time: 4.14 ms


In [11]:
print(len(aol_unique_reduced))

105958


## Multiprocessing

In [12]:
# Setup of multiprocessing library for multithreaded domain matching.
data1 = None
data2 = None

def init(_data1,_data2):
    global data1
    data1 = _data1
    global data2
    data2 = _data2

pool_size = multiprocessing.cpu_count() * 1

os.system('taskset -cp 0-%d %s' % (pool_size, os.getpid()))

jobs_count = pool_size * 10

n = round(len(aol_unique_reduced)/jobs_count)
l = [list(aol_unique_reduced)[i:i + n] for i in range(0, len(list(aol_unique_reduced)), n)]

def job(l):
    topics_unique = set(topics['domain'].to_list())
    topics_unique_re = [re.compile('.*\.'+case) for case in topics_unique]
    full = []
    partial = {}
    for dom in l:
        # Appends domain from AOL-treated dataset if also in the aggregated Citizen Lab test list.
        if dom in topics_unique:
            full.append(dom)
        # Tries to perform a partial match otherwise, i.e. map a subdomain to the respective domain.
        # For instance, 'covid.saude.gov.br' would partially match 'saude.gov.br'.
        else:
            partial[dom] = [case.pattern.strip('.*\\.') for case in topics_unique_re if any(map(case.search, [dom])) == True]
    return(full, partial)

with multiprocessing.Pool(processes=pool_size, maxtasksperchild=1, initializer=init,
                          initargs=(topics[['domain','topics']])) as pool:
    pool_outputs = list(tqdm.tqdm(pool.imap(job, l),
                                  total=sum(1 for _ in l)))
    pool.close()
    pool.join()

pid 1087770's current affinity list: 0-39
pid 1087770's new affinity list: 0-39


100%|████████████████████| 400/400 [03:34<00:00,  1.86it/s]


In [13]:
%%time
# Collects multiprocessing results.
full_match = []
partial_match = {}
for output in pool_outputs:
    if len(output[0]) > 0:
        full_match.extend(output[0])
    for k, v in output[1].items():
        if len(v) > 0:
            partial_match[k] = v
full_match.extend(list(set(size_2).intersection(topics_unique)))

CPU times: user 369 ms, sys: 36.8 ms, total: 406 ms
Wall time: 399 ms


In [14]:
print(len(full_match), len(partial_match))

3575 1394


In [15]:
display(full_match)

['pchome.com.tw',
 'fbc.com.my',
 'proceso.com.mx',
 'yam.org.tw',
 'radioformula.com.mx',
 'nst.com.my',
 'almanhaj.or.id',
 'congress.gov.ph',
 'ig.com.br',
 'china.org.cn',
 'truecorp.co.th',
 'education.gouv.fr',
 'cdc.gov.tw',
 'elnuevodia.com.co',
 'issi.org.pk',
 'eloccidental.com.mx',
 'moh.gov.my',
 'housingauthority.gov.hk',
 'utusan.com.my',
 'catholic.org.tw',
 'simya.com.ua',
 'yahoo.com.br',
 'lancenet.com.br',
 'eltiempo.com.pe',
 'tase.co.il',
 'gamebase.com.tw',
 'tasmc.org.il',
 'necf.org.my',
 'lacapital.com.ar',
 'larazon.com.pe',
 'philonline.com.ph',
 'elsiglodedurango.com.mx',
 'bharian.com.my',
 'fabio.com.ar',
 'univalle.edu.co',
 'mg.co.za',
 'president.gov.ua',
 'conapred.org.mx',
 'wilkinsonpc.com.co',
 'bn.org.pl',
 'ananzi.co.za',
 'rtaf.mi.th',
 'ojo.com.pe',
 'elmercurio.com.ec',
 'dantri.com.vn',
 'books.com.tw',
 'tei.or.th',
 'me.gob.ve',
 'baheyya.blogspot.com',
 'caracol.com.co',
 'btl.gov.il',
 'big.or.jp',
 'lanacion.com.co',
 'sunstar.com.ph',
 '

In [16]:
display(partial_match)

{'maib.gov.uk': ['gov.uk'],
 'ica.gov.sg': ['gov.sg'],
 'tijuana.gob.mx': ['gob.mx'],
 'fjqz.gov.cn': ['gov.cn'],
 'dfee.gov.uk': ['gov.uk'],
 'semar.gob.mx': ['gob.mx'],
 'preston.gov.uk': ['gov.uk'],
 'mas.gov.sg': ['gov.sg'],
 'hermosillo.gob.mx': ['gob.mx'],
 'ndmctsgh.edu.tw': ['edu.tw'],
 'coremisgm.gob.mx': ['gob.mx'],
 'stj.gov.br': ['gov.br'],
 'tampico.gob.mx': ['gob.mx'],
 'ura.gov.sg': ['gov.sg'],
 'anglesey.gov.uk': ['gov.uk'],
 'fruit.gov.cn': ['gov.cn'],
 'police.gov.il': ['gov.il'],
 'crimereduction.gov.uk': ['gov.uk'],
 'peatlandsni.gov.uk': ['gov.uk'],
 'ynysmon.gov.uk': ['gov.uk'],
 'oxon-lea.gov.uk': ['gov.uk'],
 'guasave.gob.mx': ['gob.mx'],
 'huadu.gov.cn': ['gov.cn'],
 'oundle.gov.uk': ['gov.uk'],
 'ssa-sin.gob.mx': ['gob.mx'],
 'northtyneside.gov.uk': ['gov.uk'],
 'sps.gov.uk': ['gov.uk'],
 'gos.gov.uk': ['gov.uk'],
 'sutton.gov.uk': ['gov.uk'],
 'labour.gov.za': ['gov.za'],
 'sasar.gov.za': ['gov.za'],
 'acra.gov.sg': ['gov.sg'],
 'pr.gov.br': ['gov.br'],
 'dat

In [17]:
%%time
# Checks if any partial match has matched more than one domain.
for k, v in partial_match.items():
    if len(v) != 1:
        print(v)

CPU times: user 578 µs, sys: 0 ns, total: 578 µs
Wall time: 587 µs


In [18]:
%%time
# Organizes data into a dictionary to be loaded into a DataFrame.
d = {'domain': [], 'match': [], 'topics': []}
for case in full_match:
    d['domain'].append(case)
    d['match'].append(case)
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])
for k, v in partial_match.items():
    d['domain'].append(k)
    d['match'].append(v[0])
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])

CPU times: user 15.8 s, sys: 0 ns, total: 15.8 s
Wall time: 15.8 s


In [19]:
display(d)

{'domain': ['pchome.com.tw',
  'fbc.com.my',
  'proceso.com.mx',
  'yam.org.tw',
  'radioformula.com.mx',
  'nst.com.my',
  'almanhaj.or.id',
  'congress.gov.ph',
  'ig.com.br',
  'china.org.cn',
  'truecorp.co.th',
  'education.gouv.fr',
  'cdc.gov.tw',
  'elnuevodia.com.co',
  'issi.org.pk',
  'eloccidental.com.mx',
  'moh.gov.my',
  'housingauthority.gov.hk',
  'utusan.com.my',
  'catholic.org.tw',
  'simya.com.ua',
  'yahoo.com.br',
  'lancenet.com.br',
  'eltiempo.com.pe',
  'tase.co.il',
  'gamebase.com.tw',
  'tasmc.org.il',
  'necf.org.my',
  'lacapital.com.ar',
  'larazon.com.pe',
  'philonline.com.ph',
  'elsiglodedurango.com.mx',
  'bharian.com.my',
  'fabio.com.ar',
  'univalle.edu.co',
  'mg.co.za',
  'president.gov.ua',
  'conapred.org.mx',
  'wilkinsonpc.com.co',
  'bn.org.pl',
  'ananzi.co.za',
  'rtaf.mi.th',
  'ojo.com.pe',
  'elmercurio.com.ec',
  'dantri.com.vn',
  'books.com.tw',
  'tei.or.th',
  'me.gob.ve',
  'baheyya.blogspot.com',
  'caracol.com.co',
  'btl.gov

In [20]:
df = pandas.DataFrame(data=d)

In [21]:
display(df)

Unnamed: 0,domain,match,topics
0,pchome.com.tw,pchome.com.tw,'COMM'
1,fbc.com.my,fbc.com.my,'REL'
2,proceso.com.mx,proceso.com.mx,'NEWS'
3,yam.org.tw,yam.org.tw,'HUMR'
4,radioformula.com.mx,radioformula.com.mx,'NEWS'
...,...,...,...
4964,arinternet.pr.gov.br,gov.br,'REL'
4965,seguranca.sp.gov.br,gov.br,'REL'
4966,probation.homeoffice.gov.uk,gov.uk,'REL'
4967,polmil.sp.gov.br,gov.br,'REL'


In [22]:
# Checks for domains without a topic.
display(df[df['topics'] == "''"])

Unnamed: 0,domain,match,topics


In [23]:
df.to_csv('AOL-treated-Citizen-Lab-Classification-domain-match.csv')