In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.subplots as ps
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as pgo
import wmfdata as wmf

import nshahquinn as nsq

In [2]:
# Apply the Plotly defaults provided by the `nshahquinn` helper package
# (the exact settings are defined in that package, not in this notebook).
nsq.set_plotly_defaults()

## Data collection

This data covers an 89-day period, from 2022-09-08 to 2022-12-05, inclusive.

In [None]:
# Query window: the 89 complete days ending yesterday.
# `normalize()` truncates to midnight. `round(freq="D")` rounds to the *nearest*
# midnight, so running this cell after noon would make `today` tomorrow and pull
# the current, still-incomplete day into the range.
today = pd.Timestamp.now().normalize()
yesterday = today - pd.Timedelta(1, unit="day")
start_date = today - pd.Timedelta(89, unit="day")
query_range = pd.date_range(start=start_date, end=yesterday, freq="D")

wmf.spark.create_session(type="yarn-large")
fragments = []

# Run one query per day so that each one only scans a single day's partitions
# and progress can be logged as the loop advances.
for day in query_range:
    day_str = day.strftime("%Y-%m-%d")

    # Page views that carry either the TikTok in-app browser user agent or a
    # TikTok referrer, counted per country, access method and referrer class.
    data = wmf.spark.run(
        f"""
        SELECT
            '{day_str}' AS time,
            geocoded_data['country_code'] AS country,
            access_method,
            referer_class,
            user_agent LIKE '%BytedanceWebview%' AS tiktok_user_agent,
            referer LIKE '%tiktok.com%' AS tiktok_referrer,
            COUNT(*) AS views
        FROM wmf.webrequest
        WHERE
            (user_agent LIKE "%BytedanceWebview%" OR referer LIKE '%tiktok.com%')
            AND webrequest_source = "text"
            AND is_pageview
            AND year = {day.year}
            AND month = {day.month}
            AND day = {day.day}
        GROUP BY
            geocoded_data['country_code'],
            access_method,
            referer_class,
            user_agent LIKE '%BytedanceWebview%',
            referer LIKE '%tiktok.com%'
        """
    )

    fragments.append(data)

    # Append a timestamped line to a log file so a long run can be monitored.
    now = pd.Timestamp.now().round(freq="S")
    with Path("tiktok_query_progress.txt").open("a") as f:
        f.write(f"[{now}] Finished query for {day_str}.\n")

# Every fragment carries its own 0..n index; ignore it so the combined frame
# has unique row labels (the same shape it has after the Parquet round trip,
# which is written without the index).
tiktok_traffic = pd.concat(fragments, ignore_index=True)

tiktok_traffic.to_parquet("tiktok_traffic", index=False)

In [None]:
# Load the query results from the "tiktok_traffic" Parquet dataset on disk.
tiktok_traffic = pd.read_parquet("tiktok_traffic")

# Bare last expression: show the frame as the cell output.
tiktok_traffic

## Traffic by referrer type and user agent

In [16]:
# Total views for every observed combination of referrer class, TikTok user
# agent flag and TikTok referrer flag.
breakdown_keys = ["referer_class", "tiktok_user_agent", "tiktok_referrer"]

# Selecting with a list keeps the result a one-column DataFrame.
tiktok_traffic.groupby(breakdown_keys)[["views"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,views
referer_class,tiktok_user_agent,tiktok_referrer,Unnamed: 3_level_1
external,False,True,11529
external,True,False,10314
external,True,True,909438
external (search engine),False,True,179
external (search engine),True,False,10860
internal,False,True,1800
internal,True,False,124366
none,True,False,20092
unknown,False,True,3
unknown,True,False,3


The internal-referred traffic without the TikTok user agent is most likely people with a referrer like `en.wikipedia.org/wiki/tiktok.com`, so it doesn't belong in our analysis. The search engine–referred traffic without the TikTok user agent is probably something similar; most external traffic now only sends the referring domain, but there may be enough exceptions (non-HTTPS traffic?) to account for those 179 views. I will drop both groups, as well as the random "unknown" referrer class.

Then, I will simplify the referrer classification into three groups: TikTok, other external, and internal.

In [34]:
# Flag the rows that do not belong in the analysis:
#   * internal or search-engine referrals without the TikTok user agent
#     (the referrer merely contains "tiktok.com"), and
#   * anything in the "unknown" referrer class.
spurious = tiktok_traffic.eval(
    "(not tiktok_user_agent and referer_class in ('internal', 'external (search engine)'))"
    "or referer_class == 'unknown'"
)

# Filter with a boolean mask rather than dropping by index label: label-based
# `drop` removes *every* row sharing a label, which silently discards valid rows
# if the frame's index is not unique (as it is straight after concatenating the
# per-day fragments). `.copy()` gives later cells an independent frame to add
# columns to.
tiktok_traffic = tiktok_traffic.loc[~spurious].copy()

In [35]:
# Ordered classification rules. `np.select` assigns the label of the first
# condition that holds for a row, so a TikTok referrer takes precedence over the
# internal/external split.
conditions = [
    tiktok_traffic["tiktok_referrer"],
    tiktok_traffic["referer_class"] != "internal",
    tiktok_traffic["referer_class"] == "internal"
]

values = [
    "TikTok",
    "other external",
    "internal"
]

# The last two conditions are complementary, so the fallback is never used, but
# `np.select` still has to combine it with the labels. Its implicit default is
# the integer 0, which recent NumPy releases refuse to promote to a common dtype
# with strings (TypeError), so pass a string default explicitly.
tiktok_traffic["referrer_type"] = np.select(conditions, values, default="unclassified")

(
    tiktok_traffic
    .groupby(["tiktok_user_agent", "referrer_type"])
    ["views"]
    .sum()
    # Convert to daily average (the data covers 89 days)
    .div(89)
    .to_frame()
    .style.format(formatter="{:,.0f}")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,views
tiktok_user_agent,referrer_type,Unnamed: 2_level_1
False,TikTok,130
True,TikTok,10218
True,internal,1397
True,other external,464


Looking at traffic we recorded from TikTok's in-app browser, we see the following daily averages:
* 10,000 page views with TikTok referrers
* 500 page views with other external referrers
* 1,400 page views with internal referrers

Very roughly, we can interpret this to mean that every day TikTok generates about 10,000 visits to Wikipedia, and about 10% of these visits include following an internal link to a second page.

We also get about 100 pageviews per day with a TikTok referrer but from a different browser. These likely come from the web version of TikTok.

## Country-specific traffic

In [None]:
# Fetch the full `canonical_data.countries` table and key it by ISO code so
# that rows can be looked up by country code.
countries_sql = """
SELECT *
FROM canonical_data.countries
"""

cdc = wmf.spark.run(countries_sql).set_index("iso_code")

def get_country_name(iso_code, countries=None):
    """Return the display name for a country code.

    Parameters
    ----------
    iso_code
        Country code to look up in the index of the lookup table.
    countries : pandas.DataFrame, optional
        Lookup table indexed by country code with a ``name`` column.
        Defaults to the notebook-level ``cdc`` table.

    Returns
    -------
    The value of the ``name`` column for ``iso_code``, or ``iso_code`` itself
    when the code is not present in the lookup table.
    """
    if countries is None:
        countries = cdc

    try:
        return countries.loc[iso_code, "name"]
    except KeyError:
        # Codes missing from the lookup table (for example a placeholder for an
        # unknown location) would otherwise abort a whole `.map()` call; show
        # the raw code instead.
        return iso_code

The top 20 countries for TikTok-referred traffic, by average daily page views:

In [None]:
(
    tiktok_traffic
    .query("referrer_type == 'TikTok'")
    .groupby("country")
    ["views"]
    .sum()
    # Convert to daily average (the data covers 89 days). The values stay as
    # floats and are rounded by the formatter below, like the other daily-average
    # table, instead of being truncated with `astype(int)`.
    .div(89)
    .sort_values(ascending=False)
    # Cut down to the top 20 *before* resolving names, so only the countries
    # that are actually displayed need to be looked up.
    .head(20)
    .to_frame()
    .reset_index()
    .assign(country=lambda df: df["country"].map(get_country_name))
    .set_index("country")
    .style.format(formatter="{:,.0f}")
)

In [51]:
# ISO codes of the countries that get their own time series.
countries = ("BR", "GB", "ID", "JP", "MX", "PH", "RU", "US", "VN")
country_data = {}

# Restrict to TikTok-referred rows once, then split by country.
tiktok_referred = tiktok_traffic[tiktok_traffic["tiktok_referrer"]]

for country in countries:
    in_country = tiktok_referred["country"] == country

    # One row per day: total TikTok-referred views in this country.
    country_data[country] = (
        tiktok_referred[in_country]
        .groupby("time", as_index=False)
        ["views"]
        .sum()
    )

In [54]:
# Settings for the image-export button of the JupyterLab renderer. The export
# height is 300 px per country, matching the figure height used for the
# per-country subplot figure.
pio.renderers["jupyterlab"].config["toImageButtonOptions"] = dict(
    format="png",  # one of png, svg, jpeg, webp
    filename="plotly_graph",
    height=300 * len(country_data),
    width=800,
    scale=3,  # multiply title/legend/axis/canvas sizes by this factor
)

In [None]:
# One stacked subplot per country, titled with the country's name.
panel_count = len(country_data)

fig = ps.make_subplots(
    rows=panel_count,
    subplot_titles=[get_country_name(code) for code in country_data],
    vertical_spacing=0.03,
)

# Subplot rows are 1-based, so number the countries starting from 1.
for i, (country, data) in enumerate(country_data.items(), start=1):
    daily_views = pgo.Scatter(
        x=data["time"],
        y=data["views"],
        line_color="blue",
        line_width=6,
    )
    fig.add_trace(daily_views, row=i, col=1)
    fig.update_yaxes(title="pageviews per day", row=i, col=1)

# 300 px per country, a centred title, and tight top/bottom margins.
fig.update_layout(
    showlegend=False,
    height=300 * panel_count,
    title_text="TikTok-referred pageviews by country",
    title_x=0.5,
    margin={"t": 75, "b": 25},
)

# Anchor every y-axis at zero.
fig.update_yaxes(rangemode="tozero")

fig.show()

## Comparable overall traffic

Since the overall level of TikTok-referred traffic seems much lower than in my previous analysis, I wanted to exactly replicate the daily TikTok-referred traffic graph produced in [my previous analysis](https://github.com/nshahquinn/misc-wikimedia-analysis/blob/master/2021/2021-11_TikTok_Jumps_traffic.ipynb).

Unlike the analysis above, this covers 90 days of traffic, from 2022-09-07 to 2022-12-05, and only looks at mobile web traffic.

In [None]:
# Query window: the 90 complete days ending yesterday.
# `normalize()` truncates to midnight. `round(freq="D")` rounds to the *nearest*
# midnight, so running this cell after noon would make `today` tomorrow and pull
# the current, still-incomplete day into the range.
today = pd.Timestamp.now().normalize()
yesterday = today - pd.Timedelta(1, unit="day")
query_range = pd.date_range(end=yesterday, periods=90, freq="D")

fragments = []

# Run one query per day so that each one only scans a single day's partitions.
for day in query_range:
    day_str = day.strftime("%Y-%m-%d")

    # Per access method, count the page views whose referrer contains
    # "tiktok.com" (the LIKE result is cast to 0/1 and summed).
    data = wmf.spark.run(
        f"""
        SELECT
            '{day_str}' AS time,
            access_method,
            SUM(CAST(referer LIKE '%tiktok.com%' AS INT)) AS tiktok_referrer_pageviews
        FROM wmf.webrequest
        WHERE
            webrequest_source = "text"
            AND is_pageview
            AND year = {day.year}
            AND month = {day.month}
            AND day = {day.day}
        GROUP BY
            access_method
        """
    )

    fragments.append(data)

    # Append a timestamped line to a log file so a long run can be monitored.
    now = pd.Timestamp.now().round(freq="S")
    with Path("tiktok_query_progress.txt").open("a") as f:
        f.write(f"[{now}] Finished query for {day_str}.\n")

# Every fragment carries its own 0..n index; ignore it so the combined frame
# has unique row labels.
replicated_tiktok_traffic = pd.concat(fragments, ignore_index=True)



In [None]:
# Keep only the mobile web rows; the access method column is then constant and
# can be dropped. `time` holds date strings, so parse it into real timestamps.
is_mobile_web = replicated_tiktok_traffic["access_method"] == "mobile web"

mw_rtt = (
    replicated_tiktok_traffic
    .loc[is_mobile_web]
    .drop(columns="access_method")
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
)

Total page views:

In [74]:
# Sum of the `tiktok_referrer_pageviews` column over all rows of `mw_rtt`.
mw_rtt["tiktok_referrer_pageviews"].sum()

923913

Daily average page views (compared to 45,975 previously):

In [26]:
# Compute the average from the data rather than from a total pasted out of an
# earlier output, so it stays correct when the query is re-run.
# 90 is the number of days covered by the replicated query.
int(mw_rtt["tiktok_referrer_pageviews"].sum() / 90)

10265

In [None]:
# Daily TikTok-referred mobile web page views as a single line chart.
fig = px.line(mw_rtt, x="time", y="tiktok_referrer_pageviews")

fig.update_traces(line_color="#1f77b4", line_width=3)

fig.update_layout(
    font_family="Arial",
    title_text="TikTok-referred mobile web traffic",
    title_x=0.5,
    height=500,
    plot_bgcolor="white",
)

# Both axes share the same look: a black frame line mirrored on the opposite
# side, and grey grid lines.
axis_frame = dict(showline=True, linewidth=1, linecolor='black', gridcolor='#bbbbbb', mirror=True)

# The x-axis needs no title; the y-axis starts at zero and uses thousands separators.
fig.update_xaxes(title=None, **axis_frame)
fig.update_yaxes(title="pageviews per day", rangemode="tozero", tickformat=",.0f", **axis_frame)

fig.show()

![](2022-12_daily_TikTok_traffic.png)

## Most viewed articles

This data covers the 89-day period from 2022-09-16 to 2022-12-13, inclusive.

In [None]:
# Query window: the 89 complete days ending yesterday.
# `normalize()` truncates to midnight. `round(freq="D")` rounds to the *nearest*
# midnight, so running this cell after noon would make `today` tomorrow and pull
# the current, still-incomplete day into the range.
today = pd.Timestamp.now().normalize()
yesterday = today - pd.Timedelta(1, unit="day")
start_date = today - pd.Timedelta(89, unit="day")
query_range = pd.date_range(start=start_date, end=yesterday, freq="D")

wmf.spark.create_session(type="yarn-large")
fragments = []

# Run one query per day so that each one only scans a single day's partitions
# and progress can be logged as the loop advances.
for day in query_range:
    day_str = day.strftime("%Y-%m-%d")

    # Page views per project and page title whose referrer starts with
    # https://www.tiktok.com. The result gets its own name (`data`) instead of
    # `f`, which the log-file handle below also uses.
    data = wmf.spark.run(
        f"""
        SELECT
            '{day_str}' as time,
            pageview_info['project'] AS project,
            pageview_info['page_title'] AS page_title,
            COUNT(*) AS views
        FROM wmf.webrequest
        WHERE
            referer LIKE 'https://www.tiktok.com%'
            AND webrequest_source = "text"
            AND is_pageview
            AND year = {day.year}
            AND month = {day.month}
            AND day = {day.day}
        GROUP BY
            pageview_info['project'],
            pageview_info['page_title']
        """
    )

    fragments.append(data)

    # Append a timestamped line to a log file so a long run can be monitored.
    now = pd.Timestamp.now().round(freq="S")
    with Path("tiktok_query_progress.txt").open("a") as f:
        f.write(f"[{now}] Finished queries for {day_str}.\n")

traffic_by_page = pd.concat(fragments)

traffic_by_page.to_parquet("tiktok_traffic_by_page.parquet", index=False)

22/12/14 01:11:52 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/12/14 01:11:52 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
22/12/14 01:11:52 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
22/12/14 01:11:52 WARN Utils: Service 'sparkDriver' could not bind on port 12002. Attempting port 12003.
22/12/14 01:11:52 WARN Utils: Service 'sparkDriver' could not bind on port 12003. Attempting port 12004.
22/12/14 01:11:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/12/14 01:11:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/12/14 01:11:53 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/12/14 01:11:53 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attemptin

In [28]:
# Collapse the per-day counts into one total per page, ranked by views.
traffic_by_page = (
    pd.read_parquet("tiktok_traffic_by_page.parquet")
    .groupby(["project", "page_title"])
    ["views"]
    .sum()
    .sort_values(ascending=False)
    # Fresh 0..n-1 index in descending order of views; used for the rank below.
    .reset_index()
    .assign(
        # `assign` evaluates its keyword arguments in order, so `url` is built
        # from the project name that already has ".org" appended.
        project=lambda df: df["project"] + ".org",
        url=lambda df: "https://" + df["project"] + "/wiki/" + df["page_title"],
        rank=lambda df: df.index + 1
    )
    # Only these columns are kept. The earlier underscore-to-space rewrite of
    # `page_title` was discarded here, so it has been removed.
    [["rank", "url", "views"]]
)

In [None]:
# Pages with at least 1,000 TikTok-referred views over the whole period.
popular_pages = traffic_by_page[traffic_by_page["views"] >= 1000]

# Format the view counts with thousands separators, then hand the styled table
# to wmfdata's `pd_display_all` helper (presumably shows it without row
# truncation; confirm against the wmfdata documentation).
wmf.utils.pd_display_all(
    popular_pages.style.format(formatter={"views": "{:,.0f}"})
)