In [19]:
import wmfdata as wmf

In [26]:
# Per-website summary of Wikipedia Preview usage, computed in Presto.
#
# Reporting window: 2022-08-10 through 2022-09-06 inclusive, i.e. 28 days.
# That is why the final SELECT divides the preview total by 4 to get a weekly
# average; if the date filter changes, the divisor has to change with it.
#
# Steps:
#   1. wikipediapreview_stats_filtered: drop excluded referrers (the NOT IN /
#      NOT LIKE filters below), fold the listed sites' hosts into a single
#      name each, and strip a leading "www.". The normalised host is computed
#      once here so SELECT and GROUP BY cannot drift apart.
#   2. wikipediapreview_stats_cleaned: total and touch-device previews per
#      (website, region).
#   3. Final SELECT: one row per website with its weekly previews, the share of
#      previews from touch devices, and the region with the most previews.
#
# The query is a raw string so that the regex '^www\.' reaches Presto unchanged
# without relying on Python tolerating the invalid "\." escape sequence.
wikipediapreview_sites = wmf.presto.run(r"""
WITH wikipediapreview_stats_filtered AS (
    SELECT
        REGEXP_REPLACE(
            CASE
                WHEN referer_host LIKE '%beed.world' THEN 'beed.world'
                WHEN referer_host LIKE '%bibleandplaces.com' THEN 'bibleandplaces.com'
                WHEN referer_host LIKE '%wikinames.net' THEN 'wikinames.net'
                WHEN referer_host LIKE '%preobrazhenskiy.art' THEN 'preobrazhenskiy.art'
                WHEN referer_host LIKE '%jurnalis.top' THEN 'jurnalis.top'
                ELSE referer_host
            END,
            '^www\.'
        ) AS website,
        wmf_region_class AS region,
        device_type,
        previews
    FROM wmf_product.wikipediapreview_stats
    LEFT JOIN ntsako.country_meta_data
    ON country_code = iso2_country_code
    WHERE
        year = 2022
        AND (
            month = 8 AND day >= 10
            OR month = 9 AND day < 7
        )
        AND referer_host NOT IN (
            '0.0.0.0',
            '127.0.0.1',
            'blog-wikimedia-org-develop.go-vip.net',
            'cdpn.io',
            'diff.wikimedia.org',
            'lumion.imaggo-work.pl',
            'localhost',
            'wikimediadiff.test',
            'wikimediafoundation.org',
            'wikimediafoundation-org-develop.go-vip.co',
            'wikimedia.github.io',
            'www.wixwikipediapreviewtest.com',
            '-'
        )
        AND referer_host NOT LIKE '%.local'
        AND referer_host NOT LIKE '%.ngrok.io'
        AND referer_host NOT LIKE '192.%'
        AND referer_host NOT LIKE '%.wikipedia.org'
        AND referer_host NOT LIKE '%facebook.com'
        AND referer_host NOT LIKE '%google.com'
        AND referer_host NOT LIKE '%.test'
        AND referer_host IS NOT NULL
),
wikipediapreview_stats_cleaned AS (
    SELECT
        website,
        region,
        SUM(previews) AS previews,
        SUM(IF(device_type = 'touch', previews, 0)) AS touch_previews
    FROM wikipediapreview_stats_filtered
    GROUP BY
        website,
        region
)
SELECT
    website,
    CAST(SUM(previews) AS REAL) / 4 AS weekly_previews,
    CAST(SUM(touch_previews) AS REAL) / CAST(SUM(previews) AS REAL) AS touch_previews_share,
    MAX_BY(region, previews) AS top_user_region
FROM wikipediapreview_stats_cleaned
GROUP BY website
ORDER BY weekly_previews DESC
""")

In [27]:
# Display the per-website summary returned by the query above as the cell output.
wikipediapreview_sites

Unnamed: 0,website,weekly_previews,touch_previews_share,top_user_region
0,stehn-online.de,527.25,0.003319,Northern & Western Europe
1,framablog.org,282.25,0.025686,Northern & Western Europe
2,keynerd.it,269.50,0.000000,Northern & Western Europe
3,xpressenglish.com,191.00,0.073298,North America
4,lumion.pl,146.00,0.047945,Central & Eastern Europe & Central Asia
...,...,...,...,...
162,singletonabbey.co.uk,0.25,1.000000,Northern & Western Europe
163,blogs.gcpawards.com,0.25,0.000000,South Asia
164,lucidcreative.co.il,0.25,0.000000,Middle East & North Africa
165,liam.rs,0.25,0.000000,North America


In [28]:
# Save the per-website summary as a CSV file in the home directory;
# index=False keeps the row index out of the file.
wikipediapreview_sites.to_csv('~/wikipediapreview_sites.csv', index=False)