# Google Topics v1 Classification Domain Matching

In [1]:
import pandas, re, os, multiprocessing, tqdm, tldextract

In [2]:
%%time
aol = pandas.read_csv('AOL-treated-unique-domains.csv',
                      low_memory=False, on_bad_lines='warn',
                      usecols=[1]).rename(columns={'0':'domain'})

CPU times: user 696 ms, sys: 83.4 ms, total: 779 ms
Wall time: 777 ms


In [3]:
%%time
topics = pandas.read_csv('Google-Topics-Classification-v1.txt',
                         index_col=False, low_memory=False, on_bad_lines='warn')

CPU times: user 5.68 ms, sys: 1.99 ms, total: 7.67 ms
Wall time: 6.91 ms


In [4]:
display(aol)

Unnamed: 0,domain
0,martinpeak.com
1,mohpa.com
2,museum.wa.gov.au
3,richardiiimuseum.co.uk
4,westgaappraisals.com
...,...
1300479,17designs.com
1300480,ebbro.com
1300481,goldenstatefence.com
1300482,allbabyluv.blogspot.com


In [5]:
display(topics)

Unnamed: 0,domain,topics
0,037hdmovie.com,'12'
1,0gomovies.sh,"'12','43','45'"
2,1.nbryb.com,'1'
3,10.nbryb.com,"'1','243'"
4,1000kitap.com,"'100','253'"
...,...,...
9041,zus.pl,''
9042,zvierata.bazos.sk,'289'
9043,zvirata.bazos.cz,'289'
9044,zyadda.com,"'1','254'"


In [6]:
aol_unique = set(aol['domain'].to_list())

In [7]:
topics_unique = set(topics['domain'].to_list())

In [8]:
display(len(aol_unique.intersection(topics_unique)))

1545

In [9]:
%%time
size_1 = []
size_2 = []
size_3 = []
size_4 = []
size_5 = []
check = []

for case in aol_unique:
    count = len(re.findall(r'\.', case))
    if count == 0:
        size_1.append(case)
    elif count == 1:
        size_2.append(case)
    elif count == 2:
        size_3.append(case)
    elif count == 3:
        size_4.append(case)
    elif count == 4:
        size_5.append(case)
    else:
        check.append(case)

print(len(size_1),len(size_2),len(size_3),len(size_4),len(size_5),len(check))

0 1194526 99442 6494 22 0
CPU times: user 1.1 s, sys: 0 ns, total: 1.1 s
Wall time: 1.1 s


In [10]:
%%time
aol_unique_reduced = []
aol_unique_reduced.extend(size_3)
aol_unique_reduced.extend(size_4)
aol_unique_reduced.extend(size_5)

CPU times: user 2.15 ms, sys: 0 ns, total: 2.15 ms
Wall time: 2.16 ms


In [11]:
print(len(aol_unique_reduced))

105958


## Multiprocessing

In [12]:
data1 = None
data2 = None

def init(_data1,_data2):
    global data1
    data1 = _data1
    global data2
    data2 = _data2

pool_size = multiprocessing.cpu_count() * 1

os.system('taskset -cp 0-%d %s' % (pool_size, os.getpid()))

jobs_count = pool_size * 100

n = round(len(aol_unique_reduced)/jobs_count)
l = [list(aol_unique_reduced)[i:i + n] for i in range(0, len(list(aol_unique_reduced)), n)]

def job(l):
    topics_unique = set(topics['domain'].to_list())
    full = []
    partial = {}
    for dom in l:
        if dom in topics_unique:
            full.append(dom)
        else:
            partial[dom] = []
            for case in topics_unique:
                if re.search(r'.*\.'+case, dom) != None:
                    partial[dom].append(case)
    return(full, partial)

with multiprocessing.Pool(processes=pool_size, maxtasksperchild=1, initializer=init,
                          initargs=(topics[['domain','topics']])) as pool:
    pool_outputs = list(tqdm.tqdm(pool.imap(job, l),
                                  total=sum(1 for _ in l)))
    pool.close()
    pool.join()

pid 1820799's current affinity list: 0-39
pid 1820799's new affinity list: 0-39


100%|██████████████████| 4076/4076 [32:26<00:00,  2.09it/s]


In [13]:
%%time
full_match = []
partial_match = {}
for output in pool_outputs:
    if len(output[0]) > 0:
        full_match.extend(output[0])
    for k, v in output[1].items():
        if len(v) > 0:
            partial_match[k] = v
full_match.extend(list(set(size_2).intersection(topics_unique)))

CPU times: user 353 ms, sys: 60.9 ms, total: 414 ms
Wall time: 410 ms


In [14]:
print(len(full_match), len(partial_match))

1545 1274


In [15]:
display(full_match)

['computrabajo.com.mx',
 'walla.co.il',
 'indiapost.gov.in',
 'makro.co.za',
 'lagaceta.com.ar',
 'mercadolibre.com.uy',
 'google.com.sa',
 'boticario.com.br',
 'ciudad.com.ar',
 'voegol.com.br',
 'ofuxico.com.br',
 '24h.com.vn',
 'clicrbs.com.br',
 'google.co.kr',
 'indianrail.gov.in',
 'jang.com.pk',
 'casasbahia.com.br',
 'hmv.co.jp',
 'amazon.co.jp',
 'google.com.tw',
 'seek.com.au',
 'mercadolibre.com.mx',
 'google.com.hk',
 'bunnings.com.au',
 'jdsports.co.uk',
 'vivastreet.co.uk',
 'google.co.ve',
 'google.com.my',
 'kmart.com.au',
 'dailymail.co.uk',
 'dominos.co.uk',
 'homedepot.com.mx',
 'mercadolibre.com.ar',
 'nationallottery.co.za',
 'spareroom.co.uk',
 'jbhifi.com.au',
 'reed.co.uk',
 'matalan.co.uk',
 'aldi.co.uk',
 'abc.com.py',
 'independent.co.uk',
 'stuff.co.nz',
 'excelsior.com.mx',
 'sabah.com.tr',
 'rightmove.co.uk',
 'consultaremedios.com.br',
 'pravda.com.ua',
 'fanatik.com.tr',
 'google.co.za',
 'metro.co.uk',
 'caixa.gov.br',
 'kalunga.com.br',
 'tripadvisor.c

In [16]:
display(partial_match)

{'gateshead.gov.uk': ['gov.uk'],
 'firekills.gov.uk': ['gov.uk'],
 'kent.gov.uk': ['gov.uk'],
 'wales.nhs.uk': ['nhs.uk'],
 'gloucestershire.gov.uk': ['gov.uk'],
 'odpm.gov.uk': ['gov.uk'],
 'pjetam.gob.mx': ['gob.mx'],
 'sentencing-guidelines.gov.uk': ['gov.uk'],
 'doeni.gov.uk': ['gov.uk'],
 'shrewsbury.gov.uk': ['gov.uk'],
 'scot.nhs.uk': ['nhs.uk'],
 'enriquegobernador.gob.mx': ['gob.mx'],
 'childcarecareers.gov.uk': ['gov.uk'],
 'metepec.gob.mx': ['gob.mx'],
 'enfield.gov.uk': ['gov.uk'],
 'lgiu.gov.uk': ['gov.uk'],
 'curriculumonline.gov.uk': ['gov.uk'],
 'voa.gov.uk': ['gov.uk'],
 'munihuacho.gob.pe': ['gob.pe'],
 'nhsweb.nhs.uk': ['nhs.uk'],
 'ngfl.gov.uk': ['gov.uk'],
 'peoplesnetwork.gov.uk': ['gov.uk'],
 'guiaweb.gob.cl': ['gob.cl'],
 'tiger.gov.uk': ['gov.uk'],
 'mgap.gub.uy': ['gub.uy'],
 'windsor.gov.uk': ['gov.uk'],
 'farrier-reg.gov.uk': ['gov.uk'],
 'civilserviceappealboard.gov.uk': ['gov.uk'],
 'essexcc.gov.uk': ['gov.uk'],
 'cnca.gob.mx': ['gob.mx'],
 'meto.gov.uk': 

In [17]:
%%time
for k, v in partial_match.items():
    if len(v) != 1:
        print(v)

CPU times: user 0 ns, sys: 327 µs, total: 327 µs
Wall time: 337 µs


In [18]:
%%time
d = {'domain': [], 'match': [], 'topics': []}
for case in full_match:
    d['domain'].append(case)
    d['match'].append(case)
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])
for k, v in partial_match.items():
    d['domain'].append(k)
    d['match'].append(v[0])
    d['topics'].append(topics[topics['domain'] == case]['topics'].to_list()[0])

CPU times: user 2.65 s, sys: 0 ns, total: 2.65 s
Wall time: 2.63 s


In [19]:
display(d)

{'domain': ['computrabajo.com.mx',
  'walla.co.il',
  'indiapost.gov.in',
  'makro.co.za',
  'lagaceta.com.ar',
  'mercadolibre.com.uy',
  'google.com.sa',
  'boticario.com.br',
  'ciudad.com.ar',
  'voegol.com.br',
  'ofuxico.com.br',
  '24h.com.vn',
  'clicrbs.com.br',
  'google.co.kr',
  'indianrail.gov.in',
  'jang.com.pk',
  'casasbahia.com.br',
  'hmv.co.jp',
  'amazon.co.jp',
  'google.com.tw',
  'seek.com.au',
  'mercadolibre.com.mx',
  'google.com.hk',
  'bunnings.com.au',
  'jdsports.co.uk',
  'vivastreet.co.uk',
  'google.co.ve',
  'google.com.my',
  'kmart.com.au',
  'dailymail.co.uk',
  'dominos.co.uk',
  'homedepot.com.mx',
  'mercadolibre.com.ar',
  'nationallottery.co.za',
  'spareroom.co.uk',
  'jbhifi.com.au',
  'reed.co.uk',
  'matalan.co.uk',
  'aldi.co.uk',
  'abc.com.py',
  'independent.co.uk',
  'stuff.co.nz',
  'excelsior.com.mx',
  'sabah.com.tr',
  'rightmove.co.uk',
  'consultaremedios.com.br',
  'pravda.com.ua',
  'fanatik.com.tr',
  'google.co.za',
  'metro

In [20]:
df = pandas.DataFrame(data=d)

In [21]:
display(df)

Unnamed: 0,domain,match,topics
0,computrabajo.com.mx,computrabajo.com.mx,'238'
1,walla.co.il,walla.co.il,"'1','215','219','243'"
2,indiapost.gov.in,indiapost.gov.in,'103'
3,makro.co.za,makro.co.za,'289'
4,lagaceta.com.ar,lagaceta.com.ar,'243'
...,...,...,...
2814,prefeitura.sp.gov.br,gov.br,'243'
2815,sespa.pa.gov.br,gov.br,'243'
2816,camaranh.rs.gov.br,gov.br,'243'
2817,sjc.sp.gov.br,gov.br,'243'


In [22]:
display(df[df['topics'] == "''"])

Unnamed: 0,domain,match,topics
33,nationallottery.co.za,nationallottery.co.za,''
45,consultaremedios.com.br,consultaremedios.com.br,''
113,sonhos.com.br,sonhos.com.br,''
207,casadoscontos.com.br,casadoscontos.com.br,''
223,lotteryusa.com,lotteryusa.com,''
...,...,...,...
1505,churchofjesuschrist.org,churchofjesuschrist.org,''
1522,freesexyindians.com,freesexyindians.com,''
1529,astrology.com,astrology.com,''
1531,voyeurweb.com,voyeurweb.com,''


In [23]:
df = df.drop(df[df['topics'] == "''"].index.to_list())

In [24]:
df = df.reset_index(drop=True)

In [25]:
display(df)

Unnamed: 0,domain,match,topics
0,computrabajo.com.mx,computrabajo.com.mx,'238'
1,walla.co.il,walla.co.il,"'1','215','219','243'"
2,indiapost.gov.in,indiapost.gov.in,'103'
3,makro.co.za,makro.co.za,'289'
4,lagaceta.com.ar,lagaceta.com.ar,'243'
...,...,...,...
2713,prefeitura.sp.gov.br,gov.br,'243'
2714,sespa.pa.gov.br,gov.br,'243'
2715,camaranh.rs.gov.br,gov.br,'243'
2716,sjc.sp.gov.br,gov.br,'243'


In [26]:
df.to_csv('AOL-treated-Google-Topics-Classification-v1-domain-match.csv')