# Citizen Lab Classification Data Treatment

In [1]:
import pandas, re, tldextract, requests
from urllib.parse import urlparse

## Filtering by eTLD

Based on `tldextract` and on Mozilla's Public Suffix List.

In [2]:
extra_suffixes = ['bg.ac.yu','ac.yu','cg.yu','co.yu','edu.yu','gov.yu','net.yu','org.yu','yu','or.tp','tp','an']
extract = tldextract.TLDExtract(suffix_list_urls=["https://raw.githubusercontent.com/publicsuffix/list/5e6ac3a082505ac4cf08858bdb38382d9a912833/public_suffix_list.dat"],
                                cache_dir=None,fallback_to_snapshot=False,
                                extra_suffixes=extra_suffixes,include_psl_private_domains=True)

## Load Citizen Lab Classification

In [3]:
%%time
data = pandas.read_csv('https://raw.githubusercontent.com/citizenlab/test-lists/ebd0ee8d41977b381972b2f6c471af5437d8d015/lists/global.csv',
                       low_memory=False, on_bad_lines='warn', sep=',')

data = data.dropna(subset=['url'])
data = data.reset_index(drop=True)
data['domain'] = data['url']

for row in data.itertuples():
    temp = urlparse(row.url)
    if temp.netloc != '':
        data.at[row.Index, 'domain'] = temp.netloc
    else:
        data.at[row.Index, 'domain'] = temp.path

check = []

for tup in data.itertuples():
    temp = extract(tup.domain)
    if temp.suffix == '':
        check.append(tup.Index)
    elif (temp.domain == '') and (temp.suffix.count('.') == 0):
        check.append(tup.Index)
    elif (temp.domain == '') and (temp.suffix.count('.') != 0):
        data.at[tup.Index,'domain'] = temp.suffix
    elif (re.search(r'^[w]+$',temp.domain) != None) and (temp.suffix.count('.') != 0):
        data.at[tup.Index,'domain'] = temp.suffix
    elif re.search(r'^[w]+$',temp.domain) == None:
        if re.search(r'[^a-zA-Z0-9-]',temp.domain) != None:
            check.append(tup.Index)
        else:
            data.at[tup.Index,'domain'] = temp.domain.strip('-') + '.' + temp.suffix
    else:
        check.append(tup.Index)

data = data.drop(check)
data = data.drop_duplicates()
data = data.reset_index(drop=True)

CPU times: user 482 ms, sys: 14.2 ms, total: 496 ms
Wall time: 975 ms


In [4]:
complete_data = data.copy(deep=True)

In [5]:
%%time
countries = pandas.read_csv('https://raw.githubusercontent.com/citizenlab/test-lists/ebd0ee8d41977b381972b2f6c471af5437d8d015/lists/00-LEGEND-country_codes.csv',
                            low_memory=False, on_bad_lines='warn', sep=',', keep_default_na=False)['CountryCode']

lists = [
    'official/it/aams.csv',
    'official/it/bofh.csv',
    'cis.csv',
    'cd.csv',
    'mk.csv',
    'ps.csv',
    'rs.csv',
    'ss.csv',
    'tl.csv'
]

base = 'https://raw.githubusercontent.com/citizenlab/test-lists/ebd0ee8d41977b381972b2f6c471af5437d8d015/lists/'

for country in countries:
    test = requests.get(base + country.lower() + '.csv').status_code
    if test == 200:
        lists.append(country.lower() + '.csv')
    else:
        print(country + ': ' + str(test))

AD: 404
AG: 404
AI: 404
AN: 404
AQ: 404
AS: 404
AW: 404
BB: 404
BF: 404
BJ: 404
BM: 404
BN: 404
BS: 404
BT: 404
BU: 404
BV: 404
BW: 404
BZ: 404
CC: 404
CF: 404
CG: 404
CK: 404
CS: 404
CV: 404
CX: 404
DD: 404
DJ: 404
DM: 404
EH: 404
FK: 404
FM: 404
FO: 404
FX: 404
GD: 404
GF: 404
GI: 404
GL: 404
GN: 404
GP: 404
GQ: 404
GS: 404
GU: 404
GW: 404
HM: 404
HT: 404
IO: 404
JM: 404
KI: 404
KM: 404
KN: 404
KY: 404
LA: 404
LC: 404
LI: 404
LR: 404
LU: 404
MC: 404
MG: 404
MH: 404
MO: 404
MP: 404
MQ: 404
MS: 404
MT: 404
MU: 404
MV: 404
MW: 404
NA: 404
NC: 404
NE: 404
NR: 404
NT: 404
NU: 404
PF: 404
PG: 404
PM: 404
PN: 404
PW: 404
RE: 404
SB: 404
SC: 404
SH: 404
SJ: 404
SM: 404
SR: 404
ST: 404
SU: 404
SZ: 404
TC: 404
TF: 404
TK: 404
TO: 404
TP: 404
TT: 404
TV: 404
UM: 404
VA: 404
VC: 404
VG: 404
VI: 404
VU: 404
WF: 404
WS: 404
YD: 404
YT: 404
YU: 404
ZR: 404
CPU times: user 13.1 s, sys: 266 ms, total: 13.4 s
Wall time: 1min 6s


In [6]:
%%time
base = 'https://raw.githubusercontent.com/citizenlab/test-lists/ebd0ee8d41977b381972b2f6c471af5437d8d015/lists/'

for file in lists:
    data = pandas.read_csv(base + file, low_memory=False, on_bad_lines='warn', sep=',')
    
    data = data.dropna(subset=['url'])
    data = data.reset_index(drop=True)
    data['domain'] = data['url']
    
    for row in data.itertuples():
        temp = urlparse(row.url)
        if temp.netloc != '':
            data.at[row.Index, 'domain'] = temp.netloc
        else:
            data.at[row.Index, 'domain'] = temp.path
    
    check = []
    
    for tup in data.itertuples():
        temp = extract(tup.domain)
        if temp.suffix == '':
            check.append(tup.Index)
        elif (temp.domain == '') and (temp.suffix.count('.') == 0):
            check.append(tup.Index)
        elif (temp.domain == '') and (temp.suffix.count('.') != 0):
            data.at[tup.Index,'domain'] = temp.suffix
        elif (re.search(r'^[w]+$',temp.domain) != None) and (temp.suffix.count('.') != 0):
            data.at[tup.Index,'domain'] = temp.suffix
        elif re.search(r'^[w]+$',temp.domain) == None:
            if re.search(r'[^a-zA-Z0-9-]',temp.domain) != None:
                check.append(tup.Index)
            else:
                data.at[tup.Index,'domain'] = temp.domain.strip('-') + '.' + temp.suffix
        else:
            check.append(tup.Index)
    
    data = data.drop(check)
    data = data.drop_duplicates()
    data = data.reset_index(drop=True)
    
    complete_data = pandas.concat([complete_data, data])
    complete_data = complete_data.reset_index(drop=True)

CPU times: user 18.6 s, sys: 669 ms, total: 19.2 s
Wall time: 49.9 s


## Drop unnecessary columns

In [7]:
%%time
complete_data = complete_data.drop(columns=['category_description', 'date_added', 'notes', 'name',
                                            'date_published', 'data_format_version', 'authority'],
                                   inplace=False)

CPU times: user 8.54 ms, sys: 0 ns, total: 8.54 ms
Wall time: 13.1 ms


In [8]:
display(complete_data)

Unnamed: 0,url,category_code,source,domain
0,https://4genderjustice.org/,HUMR,citizenlab,4genderjustice.org
1,http://666games.net/,GAME,citizenlab,666games.net
2,http://abpr2.railfan.net/,MMED,citizenlab,railfan.net
3,https://addons.mozilla.org/,FILE,citizenlab,mozilla.org
4,https://adium.im/,GRP,citizenlab,adium.im
...,...,...,...,...
41180,https://outrightinternational.org/sites/defaul...,LGBT,Netalitica,outrightinternational.org
41181,https://www.globalgayz.com/gay-zimbabwe/158/,LGBT,Netalitica,globalgayz.com
41182,https://www.topup.co.zw/,COMM,test-lists.ooni.org contribution,topup.co.zw
41183,https://www.paynow.co.zw/,COMM,test-lists.ooni.org contribution,paynow.co.zw


In [9]:
display(complete_data.nunique())

url              33842
category_code       34
source             158
domain           27793
dtype: int64

## Drop rows without classification

In [10]:
display(complete_data['domain'].isna().any())

False

In [11]:
display(complete_data['category_code'].isna().any())

True

In [12]:
%%time
for row in complete_data.itertuples():
    if type(row.category_code) != type(str()):
        print(row)

Pandas(Index=4490, url='www.dolomitihomes.it', category_code=nan, source='censura.bofh.it', domain='dolomitihomes.it')
CPU times: user 101 ms, sys: 4.05 ms, total: 106 ms
Wall time: 113 ms


In [13]:
%%time
complete_data = complete_data.dropna(subset=['category_code'], ignore_index=True)

CPU times: user 14.5 ms, sys: 101 µs, total: 14.6 ms
Wall time: 21.9 ms


## Update all entries to new classification

As per: https://github.com/citizenlab/test-lists/blob/ebd0ee8d41977b381972b2f6c471af5437d8d015/lists/00-LEGEND-new_category_codes.csv

In [14]:
display(complete_data[['category_code','domain']].groupby('category_code').indices.keys())

dict_keys(['ALDR', 'ANON', 'COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'DEV', 'ECON', 'ENV', 'FEXP', 'FILE', 'GAME', 'GMB', 'GOVT', 'GRP', 'HACK', 'HATE', 'HOST', 'HUMR', 'IGO', 'LGBT', 'MILX', 'MISC', 'MMED', 'NEWS', 'P2P', 'POLR', 'PORN', 'PROV', 'PUBH', 'REL', 'SRCH', 'XED'])

In [15]:
%%time
for row in complete_data.itertuples():
    if row.category_code == 'DEV':
        complete_data.at[row.Index,'category_code'] = 'ECON'
    elif row.category_code == 'FEXP':
        complete_data.at[row.Index,'category_code'] = 'NEWS'
    elif row.category_code == 'P2P':
        complete_data.at[row.Index,'category_code'] = 'FILE'

CPU times: user 96.9 ms, sys: 0 ns, total: 96.9 ms
Wall time: 99.2 ms


In [16]:
display(complete_data[['category_code','domain']].groupby('category_code').indices.keys())

dict_keys(['ALDR', 'ANON', 'COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ECON', 'ENV', 'FILE', 'GAME', 'GMB', 'GOVT', 'GRP', 'HACK', 'HATE', 'HOST', 'HUMR', 'IGO', 'LGBT', 'MILX', 'MISC', 'MMED', 'NEWS', 'POLR', 'PORN', 'PROV', 'PUBH', 'REL', 'SRCH', 'XED'])

In [17]:
display(complete_data)

Unnamed: 0,url,category_code,source,domain
0,https://4genderjustice.org/,HUMR,citizenlab,4genderjustice.org
1,http://666games.net/,GAME,citizenlab,666games.net
2,http://abpr2.railfan.net/,MMED,citizenlab,railfan.net
3,https://addons.mozilla.org/,FILE,citizenlab,mozilla.org
4,https://adium.im/,GRP,citizenlab,adium.im
...,...,...,...,...
41179,https://outrightinternational.org/sites/defaul...,LGBT,Netalitica,outrightinternational.org
41180,https://www.globalgayz.com/gay-zimbabwe/158/,LGBT,Netalitica,globalgayz.com
41181,https://www.topup.co.zw/,COMM,test-lists.ooni.org contribution,topup.co.zw
41182,https://www.paynow.co.zw/,COMM,test-lists.ooni.org contribution,paynow.co.zw


In [18]:
display(complete_data.nunique())

url              33841
category_code       31
source             158
domain           27792
dtype: int64

## Merge classifications by domain

In [19]:
%%time
topics = {}
for domain, rows in complete_data[['domain','category_code']].groupby('domain').groups.items():
    temp = []
    for row in rows:
        temp.append(complete_data.iloc[row]['category_code'])
    topics[domain] = str(set(temp)).strip('{}')

CPU times: user 5.48 s, sys: 15.7 ms, total: 5.49 s
Wall time: 5.49 s


In [20]:
display(topics)

{'000webhost.com': "'HOST'",
 '0039bet.com': "'GMB'",
 '0039casino.com': "'GMB'",
 '0039casino.eu': "'GMB'",
 '003ms.ru': "'PUBH'",
 '007bets.com': "'GMB'",
 '007casinogames.com': "'GMB'",
 '007sportsbetting.com': "'GMB'",
 '00roulette.com': "'GMB'",
 '015.by': "'SRCH'",
 '01net.com': "'HACK', 'HOST'",
 '022.md': "'HOST'",
 '027pxw.com': "'COMM'",
 '03.ru': "'PUBH'",
 '037hdmovie.com': "'CULTR'",
 '03portal.kz': "'COMM'",
 '06237.info': "'NEWS'",
 '06239.com.ua': "'NEWS'",
 '06252.com.ua': "'NEWS'",
 '06274.com.ua': "'NEWS'",
 '07sports.com': "'GMB'",
 '0dt.net': "'FILE'",
 '0eb.net': "'FILE'",
 '0ubada.net': "'FILE'",
 '1-0-bet.com': "'GMB'",
 '1-0-bet.it': "'GMB'",
 '1-x-bet.com': "'GMB'",
 '10-bet.it': "'GMB'",
 '10000euroalmese.info': "'GMB'",
 '1000noticias.com.py': "'NEWS'",
 '1000poker.net': "'GMB'",
 '1001casino.com': "'GMB'",
 '1001roteirinhos.com.br': "'CULTR'",
 '100basket.com': "'COMM'",
 '100kcasino.com': "'GMB'",
 '100kwt.com': "'NEWS'",
 '100mega.ml': "'POLR'",
 '100noti

In [21]:
data = pandas.DataFrame.from_dict(topics, orient='index', columns=['topics']).reset_index(names=['domain'])

In [22]:
display(data)

Unnamed: 0,domain,topics
0,000webhost.com,'HOST'
1,0039bet.com,'GMB'
2,0039casino.com,'GMB'
3,0039casino.eu,'GMB'
4,003ms.ru,'PUBH'
...,...,...
27787,zwnews.com,'NEWS'
27788,zwwebdirectory.com,'NEWS'
27789,zyzoom.net,'ANON'
27790,zzgays.com,'PORN'


## Save to file

In [23]:
data.to_csv('Citizen-Lab-Classification.csv', index=True)