# Google Topics v1 Classification Domain Matching

In [1]:
import pandas, re, os, multiprocessing, tqdm, tldextract

In [2]:
# Loads unique domains from AOL-treated dataset into aol DataFrame.
aol = pandas.read_csv('AOL-treated-unique-domains.csv',
                      low_memory=False, on_bad_lines='warn',
                      usecols=[1]).rename(columns={'0':'domain'})

In [3]:
# Loads Google Topics v1 classification into topics DataFrame.
topics = pandas.read_csv('Google-Topics-Classification-v1.txt',
                         index_col=False, low_memory=False, on_bad_lines='warn')

In [4]:
display(aol)

Unnamed: 0,domain
0,tintonfallsrecreation.com
1,elevationbaptist.org
2,localchurches.org
3,fabolousworld.com
4,majestictheatre.net
...,...
1300479,suresave.com
1300480,missouricosmo.com
1300481,dimpres.com
1300482,hempel.com


In [5]:
display(topics)

Unnamed: 0,domain,topics
0,037hdmovie.com,'12'
1,0gomovies.sh,"'12','43','45'"
2,1.nbryb.com,'1'
3,10.nbryb.com,"'1','243'"
4,1000kitap.com,"'100','253'"
...,...,...
9041,zus.pl,''
9042,zvierata.bazos.sk,'289'
9043,zvirata.bazos.cz,'289'
9044,zyadda.com,"'1','254'"


In [6]:
aol_unique = set(aol['domain'].to_list())

In [7]:
topics_unique = set(topics['domain'].to_list())

In [8]:
display(len(aol_unique.intersection(topics_unique)))

1545

In [9]:
%%time
size_1 = []
size_2 = []
size_3 = []
size_4 = []
size_5 = []
check = []

# Separate domains from AOL-treated dataset according to number of levels.
# For instance, 'gov.br' into size_2, 'saude.gov.br' into size_3.
for case in aol_unique:
    count = len(re.findall(r'\.', case))
    if count == 0:
        size_1.append(case)
    elif count == 1:
        size_2.append(case)
    elif count == 2:
        size_3.append(case)
    elif count == 3:
        size_4.append(case)
    elif count == 4:
        size_5.append(case)
    else:
        check.append(case)

print(len(size_1),len(size_2),len(size_3),len(size_4),len(size_5),len(check))

0 1194526 99442 6494 22 0
CPU times: user 1.73 s, sys: 8.36 ms, total: 1.73 s
Wall time: 1.73 s


In [10]:
%%time
# Takes only domains with three or more levels.
aol_unique_reduced = []
aol_unique_reduced.extend(size_3)
aol_unique_reduced.extend(size_4)
aol_unique_reduced.extend(size_5)

CPU times: user 1.98 ms, sys: 3.32 ms, total: 5.29 ms
Wall time: 5.14 ms


In [11]:
print(len(aol_unique_reduced))

105958


## Multiprocessing

In [12]:
# Setup of multiprocessing library for multithreaded domain matching.
data1 = None
data2 = None

def init(_data1,_data2):
    global data1
    data1 = _data1
    global data2
    data2 = _data2

pool_size = multiprocessing.cpu_count() * 1

os.system('taskset -cp 0-%d %s' % (pool_size, os.getpid()))

jobs_count = pool_size * 10

n = round(len(aol_unique_reduced)/jobs_count)
l = [list(aol_unique_reduced)[i:i + n] for i in range(0, len(list(aol_unique_reduced)), n)]

def job(l):
    topics_unique = set(topics['domain'].to_list())
    topics_unique_re = [re.compile('.*\.'+case) for case in topics_unique]
    full = []
    partial = {}
    for dom in l:
        # Appends domain from AOL-treated dataset if also in the aggregated Citizen Lab test list.
        if dom in topics_unique:
            full.append(dom)
        # Tries to perform a partial match otherwise, i.e. map a subdomain to the respective domain.
        # For instance, 'covid.saude.gov.br' would partially match 'saude.gov.br'.
        else:
            partial[dom] = [case.pattern.strip('.*\\.') for case in topics_unique_re if any(map(case.search, [dom])) == True]
    return(full, partial)

with multiprocessing.Pool(processes=pool_size, maxtasksperchild=1, initializer=init,
                          initargs=(topics[['domain','topics']])) as pool:
    pool_outputs = list(tqdm.tqdm(pool.imap(job, l),
                                  total=sum(1 for _ in l)))
    pool.close()
    pool.join()

pid 1089221's current affinity list: 0-39
pid 1089221's new affinity list: 0-39


100%|████████████████████| 400/400 [01:12<00:00,  5.53it/s]


In [13]:
%%time
# Collects multiprocessing results.
full_match = []
partial_match = {}
for output in pool_outputs:
    if len(output[0]) > 0:
        full_match.extend(output[0])
    for k, v in output[1].items():
        if len(v) > 0:
            partial_match[k] = v
full_match.extend(list(set(size_2).intersection(topics_unique)))

CPU times: user 351 ms, sys: 40 ms, total: 391 ms
Wall time: 383 ms


In [14]:
print(len(full_match), len(partial_match))

1545 1274


In [15]:
display(full_match)

['officedepot.com.mx',
 'google.com.br',
 'google.co.nz',
 'independent.co.uk',
 'unionbankonline.co.in',
 'dominos.co.uk',
 'aldi.co.uk',
 'currys.co.uk',
 'google.co.kr',
 'google.com.hk',
 'o2.co.uk',
 'news.com.au',
 'mercadolibre.com.uy',
 'wickes.co.uk',
 'mcdonalds.co.jp',
 'jobs.nhs.uk',
 'bbc.co.uk',
 'stuff.co.nz',
 'submarino.com.br',
 'boticario.com.br',
 'walla.co.il',
 'wiggle.co.uk',
 'mercadolibre.com.ar',
 'pets4homes.co.uk',
 'imovelweb.com.br',
 'kanald.com.tr',
 'google.com.sa',
 'sams.com.mx',
 'kalunga.com.br',
 'bunnings.com.au',
 'google.com.pe',
 'commbank.com.au',
 'thestar.com.my',
 'google.com.co',
 'thetimes.co.uk',
 'ofuxico.com.br',
 'dailystar.co.uk',
 'lagaceta.com.ar',
 'jbhifi.com.au',
 'ntv.co.jp',
 'uol.com.br',
 'ebay.co.uk',
 'trademe.co.nz',
 'clicrbs.com.br',
 'ciudad.com.ar',
 'fanatik.com.tr',
 'pagina12.com.ar',
 'vivastreet.co.uk',
 'nld.com.vn',
 'costco.com.mx',
 'iceland.co.uk',
 'divyabhaskar.co.in',
 'bharian.com.my',
 'ntv.com.tr',
 't

In [16]:
display(partial_match)

{'ciberhabitat.gob.mx': ['gob.mx'],
 'sath.nhs.uk': ['nhs.uk'],
 'asambleadf.gob.mx': ['gob.mx'],
 'sheffield-ha.nhs.uk': ['nhs.uk'],
 'wmids.nhs.uk': ['nhs.uk'],
 'bcu.gub.uy': ['gub.uy'],
 'london.nhs.uk': ['nhs.uk'],
 'nwheartaudit.nhs.uk': ['nhs.uk'],
 'whitehorsedc.gov.uk': ['gov.uk'],
 'esonora.gob.mx': ['gob.mx'],
 'lincoln.gov.uk': ['gov.uk'],
 'wokingham.gov.uk': ['gov.uk'],
 'osha.nhs.uk': ['nhs.uk'],
 'mountedgcumbe.gov.uk': ['gov.uk'],
 'nelincs.gov.uk': ['gov.uk'],
 'delegacionbenitojuarez.gob.mx': ['gob.mx'],
 'tabasco.gob.mx': ['gob.mx'],
 'agwsha.nhs.uk': ['nhs.uk'],
 'orlc.gob.pe': ['gob.pe'],
 'contactopyme.gob.mx': ['gob.mx'],
 'networks.nhs.uk': ['nhs.uk'],
 'pas.gov.uk': ['gov.uk'],
 'seebc.gob.mx': ['gob.mx'],
 'sedesol.gob.mx': ['gob.mx'],
 'chester.gov.uk': ['gov.uk'],
 'housingcorp.gov.uk': ['gov.uk'],
 'mcga.gov.uk': ['gov.uk'],
 'legalservices.gov.uk': ['gov.uk'],
 'staffordshire.gov.uk': ['gov.uk'],
 'scotlandspeople.gov.uk': ['gov.uk'],
 'edomexico.gob.mx':

In [17]:
%%time
# Checks if any partial match has matched more than one domain.
for k, v in partial_match.items():
    if len(v) != 1:
        print(v)

CPU times: user 258 µs, sys: 210 µs, total: 468 µs
Wall time: 475 µs


In [18]:
%%time
# Organizes data into a dictionary to be loaded into a DataFrame.
d = {'domain': [], 'match': [], 'topics': []}
for case in full_match:
    d['domain'].append(case)
    d['match'].append(case)
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])
for k, v in partial_match.items():
    d['domain'].append(k)
    d['match'].append(v[0])
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])

CPU times: user 3.2 s, sys: 0 ns, total: 3.2 s
Wall time: 3.2 s


In [19]:
display(d)

{'domain': ['officedepot.com.mx',
  'google.com.br',
  'google.co.nz',
  'independent.co.uk',
  'unionbankonline.co.in',
  'dominos.co.uk',
  'aldi.co.uk',
  'currys.co.uk',
  'google.co.kr',
  'google.com.hk',
  'o2.co.uk',
  'news.com.au',
  'mercadolibre.com.uy',
  'wickes.co.uk',
  'mcdonalds.co.jp',
  'jobs.nhs.uk',
  'bbc.co.uk',
  'stuff.co.nz',
  'submarino.com.br',
  'boticario.com.br',
  'walla.co.il',
  'wiggle.co.uk',
  'mercadolibre.com.ar',
  'pets4homes.co.uk',
  'imovelweb.com.br',
  'kanald.com.tr',
  'google.com.sa',
  'sams.com.mx',
  'kalunga.com.br',
  'bunnings.com.au',
  'google.com.pe',
  'commbank.com.au',
  'thestar.com.my',
  'google.com.co',
  'thetimes.co.uk',
  'ofuxico.com.br',
  'dailystar.co.uk',
  'lagaceta.com.ar',
  'jbhifi.com.au',
  'ntv.co.jp',
  'uol.com.br',
  'ebay.co.uk',
  'trademe.co.nz',
  'clicrbs.com.br',
  'ciudad.com.ar',
  'fanatik.com.tr',
  'pagina12.com.ar',
  'vivastreet.co.uk',
  'nld.com.vn',
  'costco.com.mx',
  'iceland.co.uk',

In [20]:
df = pandas.DataFrame(data=d)

In [21]:
display(df)

Unnamed: 0,domain,match,topics
0,officedepot.com.mx,officedepot.com.mx,"'103','289'"
1,google.com.br,google.com.br,'219'
2,google.co.nz,google.co.nz,'219'
3,independent.co.uk,independent.co.uk,'243'
4,unionbankonline.co.in,unionbankonline.co.in,'149'
...,...,...,...
2814,metro.sp.gov.br,gov.br,"'243','299'"
2815,seplan.go.gov.br,gov.br,"'243','299'"
2816,polmil.sp.gov.br,gov.br,"'243','299'"
2817,hyde.tameside.sch.uk,t.me,"'243','299'"


In [22]:
# Checks for domains without a topic.
display(df[df['topics'] == "''"])

Unnamed: 0,domain,match,topics
90,sonhos.com.br,sonhos.com.br,''
94,consultaremedios.com.br,consultaremedios.com.br,''
133,casadoscontos.com.br,casadoscontos.com.br,''
145,nationallottery.co.za,nationallottery.co.za,''
231,thumbzilla.com,thumbzilla.com,''
...,...,...,...
1458,bet365.com,bet365.com,''
1459,clips4sale.com,clips4sale.com,''
1477,islamweb.net,islamweb.net,''
1522,porzo.com,porzo.com,''


In [23]:
# Drops domains without a topic.
df = df.drop(df[df['topics'] == "''"].index.to_list())

In [24]:
df = df.reset_index(drop=True)

In [25]:
display(df)

Unnamed: 0,domain,match,topics
0,officedepot.com.mx,officedepot.com.mx,"'103','289'"
1,google.com.br,google.com.br,'219'
2,google.co.nz,google.co.nz,'219'
3,independent.co.uk,independent.co.uk,'243'
4,unionbankonline.co.in,unionbankonline.co.in,'149'
...,...,...,...
2713,metro.sp.gov.br,gov.br,"'243','299'"
2714,seplan.go.gov.br,gov.br,"'243','299'"
2715,polmil.sp.gov.br,gov.br,"'243','299'"
2716,hyde.tameside.sch.uk,t.me,"'243','299'"


In [26]:
df.to_csv('AOL-treated-Google-Topics-Classification-v1-domain-match.csv')