<a href="https://colab.research.google.com/github/mycaule/dd-assessment/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas
import numpy
import datetime
import urllib.request

In [0]:
#@title Top views from Wikimedia universe

year = "2020"  # @param ["2015", "2016", "2017", "2018", "2019", "2020"]
month = "03"   # @param ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
day = "12"     # @param ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
hour = "00"    # @param ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"]

blacklist_base_url = "https://s3.amazonaws.com/dd-interview-data/data_engineer/wikipedia"
pageviews_base_url = "https://dumps.wikimedia.org/other/pageviews"

### Downloading the files

In [0]:
blacklist_file = "blacklist_domains_and_pages"
pageviews_file = f"pageviews-{year}{month}{day}-{hour}0000.gz"

urllib.request.urlretrieve(f'{blacklist_base_url}/{blacklist_file}', blacklist_file)
urllib.request.urlretrieve(f'{pageviews_base_url}/{year}/{year}-{month}/{pageviews_file}', pageviews_file)

('pageviews-20200312-000000.gz', <http.client.HTTPMessage at 0x7f0fdbb19c50>)

In [0]:
!ls -lh

total 49M
-rw-r--r-- 1 root root 2.5M Jun 10 07:40 blacklist_domains_and_pages
-rw-r--r-- 1 root root  46M Jun 10 07:40 pageviews-20200312-000000.gz
drwxr-xr-x 1 root root 4.0K May 29 18:19 sample_data


### Loading the datasets using pandas

In [0]:
df1 = pandas.read_csv(blacklist_file, sep=' ', usecols=[0, 1], names=['domain', 'title'])
df1['title'] = df1['title'].apply(lambda x: x)
df1

Unnamed: 0,domain,title
0,ab,%D0%91%D0%BE%D1%80%D0%B8%D1%81_%D0%93%D1%80%D1...
1,ace,Beureukaih:Nuvola_apps_important.svg
2,ace,Japan
3,ace,Kusuih:Hubong_gisa/Ureu%C3%ABng_Ngui:Hoo_User_...
4,ace,Kusuih:Neuubah_meuhubong/Seunaleu%C3%ABk:Flag
...,...,...
57109,zh.s,File:Tbt-43.JPG
57110,zh.s,Template:Annotate
57111,zh.s,User:Klangtao
57112,zh.voy,File:The_stage_in_Xue_Fucheng_Residence.jpg


In [0]:
df2 = pandas.read_csv(pageviews_file, compression='gzip', sep=' ', usecols=[0, 1, 2], names=['domain', 'title', 'views'])
df2

Unnamed: 0,domain,title,views
0,aa,Main_Page,4
1,aa,Special:ListFiles,1
2,aa,Special:Search,1
3,aa,Special:Statistics,1
4,aa.b,Main_Page,3
...,...,...,...
5678422,zu.m,Ukuvuvukala_kwesibindi_A,1
5678423,zu.m,User:Shriheeran/2,5
5678424,zu.m,Yua_Mikami,3
5678425,zu.m.d,Ikhasi_Elikhulu,2


In [0]:
nbBlacklisted = len(df1.index)
nbPageviews = len(df2.index)
print(f"{nbPageviews} rows to process with {nbBlacklisted} blacklisted items")

5678427 rows to process with 57114 blacklisted items


### Domains with the most rows

In [29]:
print(df1.groupby('domain').title.count().sort_values(ascending=False))
print(df2.groupby('domain').title.count().sort_values(ascending=False))

groups = df2.domain.unique()

groups

domain
en        14345
en.m       4530
ru         2506
de         2266
fr         2069
          ...  
la.q          1
lad           1
lb.d          1
ab            1
bn.m.v        0
Name: title, Length: 756, dtype: int64
domain
en        1181392
en.m      1033041
es.m       285353
es         206807
ja.m       198124
           ...   
nn.m.q          1
no.m.n          1
no.n            1
gcr.m           1
mt.m.d          1
Name: title, Length: 1536, dtype: int64


array(['aa', 'aa.b', 'aa.d', ..., 'zu.d', 'zu.m', 'zu.m.d'], dtype=object)

### Top K elements

In [0]:
domains_study = ['en.m', 'es.m', 'ja.m', 'ru', 'de', 'fr'] # @param

In [30]:
print(f"Analysis of {pageviews_file}")

for domain in groups:
  df3 = df2.loc[(df2.domain == domain) & (~df2.title.isin(df3))].nlargest(25, ['views'])

  print(f"Domain name: {domain}")
  print(len(df3[['title', 'views']].index))

Analysis of pageviews-20200312-000000.gz
Domain name: aa
4
Domain name: aa.b
3
Domain name: aa.d
3
Domain name: ab
25
Domain name: ab.m
7
Domain name: ace
25
Domain name: ace.m
22
Domain name: advisory.m
1
Domain name: ady
7
Domain name: ady.m
2
Domain name: af
25
Domain name: af.b
1
Domain name: af.d
23
Domain name: af.m
25
Domain name: af.m.d
25
Domain name: af.q
2
Domain name: ak
13
Domain name: ak.m
6
Domain name: als
25
Domain name: als.m
25
Domain name: am
25
Domain name: am.d
2
Domain name: am.m
25
Domain name: am.m.d
2
Domain name: am.m.q
1
Domain name: am.q
1
Domain name: an
25
Domain name: an.d
12
Domain name: an.m
25
Domain name: an.m.d
3
Domain name: ang
25
Domain name: ang.b
4
Domain name: ang.d
7
Domain name: ang.m
14
Domain name: ang.q
1
Domain name: ar
25
Domain name: ar.b
19
Domain name: ar.d
25
Domain name: ar.m
25
Domain name: ar.m.b
25
Domain name: ar.m.d
25
Domain name: ar.m.n
25
Domain name: ar.m.q
25
Domain name: ar.m.s
25
Domain name: ar.m.v
25
Domain name: ar.n