# Citizen Lab Classification Domain Matching

In [1]:
import pandas, re, os, multiprocessing, tqdm, tldextract

In [2]:
aol = pandas.read_csv('AOL-treated-unique-domains.csv',
                      low_memory=False, on_bad_lines='warn',
                      usecols=[1]).rename(columns={'0':'domain'})

In [3]:
topics = pandas.read_csv('Citizen-Lab-Classification.csv', low_memory=False, on_bad_lines='warn', usecols=[1,2])

In [4]:
display(aol)

Unnamed: 0,domain
0,martinpeak.com
1,mohpa.com
2,museum.wa.gov.au
3,richardiiimuseum.co.uk
4,westgaappraisals.com
...,...
1300479,17designs.com
1300480,ebbro.com
1300481,goldenstatefence.com
1300482,allbabyluv.blogspot.com


In [5]:
display(topics)

Unnamed: 0,domain,topics
0,000webhost.com,'HOST'
1,0039bet.com,'GMB'
2,0039casino.com,'GMB'
3,0039casino.eu,'GMB'
4,003ms.ru,'PUBH'
...,...,...
27787,zwnews.com,'NEWS'
27788,zwwebdirectory.com,'NEWS'
27789,zyzoom.net,'ANON'
27790,zzgays.com,'PORN'


In [6]:
aol_unique = set(aol['domain'].to_list())

In [7]:
topics_unique = set(topics['domain'].to_list())

In [8]:
display(len(aol_unique.intersection(topics_unique)))

3575

In [9]:
%%time
size_1 = []
size_2 = []
size_3 = []
size_4 = []
size_5 = []
check = []

for case in aol_unique:
    count = len(re.findall(r'\.', case))
    if count == 0:
        size_1.append(case)
    elif count == 1:
        size_2.append(case)
    elif count == 2:
        size_3.append(case)
    elif count == 3:
        size_4.append(case)
    elif count == 4:
        size_5.append(case)
    else:
        check.append(case)

print(len(size_1),len(size_2),len(size_3),len(size_4),len(size_5),len(check))

0 1194526 99442 6494 22 0
CPU times: user 1.17 s, sys: 1.23 ms, total: 1.18 s
Wall time: 1.17 s


In [10]:
%%time
aol_unique_reduced = []
aol_unique_reduced.extend(size_3)
aol_unique_reduced.extend(size_4)
aol_unique_reduced.extend(size_5)

CPU times: user 2.22 ms, sys: 0 ns, total: 2.22 ms
Wall time: 2.23 ms


In [11]:
print(len(aol_unique_reduced))

105958


## Multiprocessing

In [12]:
data1 = None
data2 = None

def init(_data1,_data2):
    global data1
    data1 = _data1
    global data2
    data2 = _data2

pool_size = multiprocessing.cpu_count() * 1

os.system('taskset -cp 0-%d %s' % (pool_size, os.getpid()))

jobs_count = pool_size * 100

n = round(len(aol_unique_reduced)/jobs_count)
l = [list(aol_unique_reduced)[i:i + n] for i in range(0, len(list(aol_unique_reduced)), n)]

def job(l):
    topics_unique = set(topics['domain'].to_list())
    full = []
    partial = {}
    for dom in l:
        if dom in topics_unique:
            full.append(dom)
        else:
            partial[dom] = []
            for case in topics_unique:
                if re.search(r'.*\.'+case, dom) != None:
                    partial[dom].append(case)
    return(full, partial)

with multiprocessing.Pool(processes=pool_size, maxtasksperchild=1, initializer=init,
                          initargs=(topics[['domain','topics']])) as pool:
    pool_outputs = list(tqdm.tqdm(pool.imap(job, l),
                                  total=sum(1 for _ in l)))
    pool.close()
    pool.join()

pid 1807194's current affinity list: 0-39
pid 1807194's new affinity list: 0-39


100%|████████████████| 4076/4076 [1:38:44<00:00,  1.45s/it]


In [13]:
%%time
full_match = []
partial_match = {}
for output in pool_outputs:
    if len(output[0]) > 0:
        full_match.extend(output[0])
    for k, v in output[1].items():
        if len(v) > 0:
            partial_match[k] = v
full_match.extend(list(set(size_2).intersection(topics_unique)))

CPU times: user 352 ms, sys: 60.4 ms, total: 413 ms
Wall time: 408 ms


In [14]:
print(len(full_match), len(partial_match))

3575 1394


In [15]:
display(full_match)

['oem.com.mx',
 'lahora.com.ec',
 'tog.org.tr',
 'gov.nt.ca',
 'tcra.go.tz',
 'petra.gov.jo',
 'laopinion.com.co',
 'arp.org.py',
 'independent.co.uk',
 'eltiempo.com.ve',
 'elliberal.com.ar',
 'thenews.com.pk',
 'qu.edu.qa',
 'eci.gov.in',
 'amrc.org.hk',
 'eldebate.com.ar',
 'pucp.edu.pe',
 'akm.net.au',
 'kafkas.org.tr',
 'ig.com.br',
 'sina.com.tw',
 'delicates.co.uk',
 'ipuc.org.co',
 'face-of-muhammed.blogspot.com',
 'abc.com.py',
 'gouv.qc.ca',
 'gestion.com.pe',
 'aa.com.tr',
 'mg.co.za',
 'knesset.gov.il',
 'japantimes.co.jp',
 'pagina12.com.ar',
 'alquds.co.uk',
 'cbs.gov.il',
 'wilkinsonpc.com.co',
 'christianaid.org.uk',
 'abante.com.ph',
 'pap.gov.pk',
 'eltiempo.com.pe',
 'lanacion.com.py',
 'cic.gc.ca',
 'lancenet.com.br',
 'india.gov.in',
 'am.com.mx',
 'elpais.com.uy',
 'aksam.com.tr',
 'univalle.edu.co',
 'tasmc.org.il',
 'sudanwatch.blogspot.com',
 'acikradyo.com.tr',
 'metroradio.com.hk',
 'aja.com.pe',
 'thestandard.co.zw',
 'ect.go.th',
 'diariopopular.com.br',
 '

In [16]:
display(partial_match)

{'mawwfire.gov.uk': ['gov.uk'],
 'saa.gov.uk': ['gov.uk'],
 'tradingstandards.gov.uk': ['gov.uk'],
 'archiwa.gov.pl': ['gov.pl'],
 'wiltshire.gov.uk': ['gov.uk'],
 'nelincs.gov.uk': ['gov.uk'],
 'southwark.gov.uk': ['gov.uk'],
 'papantla.gob.mx': ['gob.mx'],
 'etenders.gov.ie': ['gov.ie'],
 'cgi.no-ip.org': ['no-ip.org'],
 'halton.gov.uk': ['gov.uk'],
 'cprm.gov.br': ['gov.br'],
 'seebc.gob.mx': ['gob.mx'],
 'ntct.edu.tw': ['edu.tw'],
 'envirowise.gov.uk': ['gov.uk'],
 'condusef.gob.mx': ['gob.mx'],
 'dtlr.gov.uk': ['gov.uk'],
 'geoplace.no-ip.org': ['no-ip.org'],
 'ss.gov.cn': ['gov.cn'],
 'landreg.gov.uk': ['gov.uk'],
 'cityofworcester.gov.uk': ['gov.uk'],
 'bancodemexico.gob.mx': ['gob.mx'],
 'tn.edu.tw': ['edu.tw'],
 'midsussex.gov.uk': ['gov.uk'],
 'environment.gov.za': ['gov.za'],
 'dvla.gov.uk': ['gov.uk'],
 'qz.gov.cn': ['gov.cn'],
 'bajacalifornia.gob.mx': ['gob.mx'],
 'ocpa.gov.uk': ['gov.uk'],
 'gpg.gov.za': ['gov.za'],
 'worcestershire.gov.uk': ['gov.uk'],
 'dda.gov.uk': ['

In [17]:
%%time
for k, v in partial_match.items():
    if len(v) != 1:
        print(v)

CPU times: user 855 µs, sys: 0 ns, total: 855 µs
Wall time: 791 µs


In [18]:
%%time
d = {'domain': [], 'match': [], 'topics': []}
for case in full_match:
    d['domain'].append(case)
    d['match'].append(case)
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])
for k, v in partial_match.items():
    d['domain'].append(k)
    d['match'].append(v[0])
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])

CPU times: user 11.5 s, sys: 1.38 ms, total: 11.5 s
Wall time: 11.5 s


In [19]:
display(d)

{'domain': ['oem.com.mx',
  'lahora.com.ec',
  'tog.org.tr',
  'gov.nt.ca',
  'tcra.go.tz',
  'petra.gov.jo',
  'laopinion.com.co',
  'arp.org.py',
  'independent.co.uk',
  'eltiempo.com.ve',
  'elliberal.com.ar',
  'thenews.com.pk',
  'qu.edu.qa',
  'eci.gov.in',
  'amrc.org.hk',
  'eldebate.com.ar',
  'pucp.edu.pe',
  'akm.net.au',
  'kafkas.org.tr',
  'ig.com.br',
  'sina.com.tw',
  'delicates.co.uk',
  'ipuc.org.co',
  'face-of-muhammed.blogspot.com',
  'abc.com.py',
  'gouv.qc.ca',
  'gestion.com.pe',
  'aa.com.tr',
  'mg.co.za',
  'knesset.gov.il',
  'japantimes.co.jp',
  'pagina12.com.ar',
  'alquds.co.uk',
  'cbs.gov.il',
  'wilkinsonpc.com.co',
  'christianaid.org.uk',
  'abante.com.ph',
  'pap.gov.pk',
  'eltiempo.com.pe',
  'lanacion.com.py',
  'cic.gc.ca',
  'lancenet.com.br',
  'india.gov.in',
  'am.com.mx',
  'elpais.com.uy',
  'aksam.com.tr',
  'univalle.edu.co',
  'tasmc.org.il',
  'sudanwatch.blogspot.com',
  'acikradyo.com.tr',
  'metroradio.com.hk',
  'aja.com.pe',
 

In [20]:
df = pandas.DataFrame(data=d)

In [21]:
display(df)

Unnamed: 0,domain,match,topics
0,oem.com.mx,oem.com.mx,'NEWS'
1,lahora.com.ec,lahora.com.ec,'NEWS'
2,tog.org.tr,tog.org.tr,'HUMR'
3,gov.nt.ca,gov.nt.ca,'PUBH'
4,tcra.go.tz,tcra.go.tz,'GOVT'
...,...,...,...
4964,niteroi.rj.gov.br,gov.br,'NEWS'
4965,seguranca.sp.gov.br,gov.br,'NEWS'
4966,mp.rj.gov.br,gov.br,'NEWS'
4967,old.homeoffice.gov.uk,gov.uk,'NEWS'


In [22]:
display(df[df['topics'] == "''"])

Unnamed: 0,domain,match,topics


In [23]:
df.to_csv('AOL-treated-Citizen-Lab-Classification-domain-match.csv')