In [4]:
import requests
#import pandas as pd
import matplotlib.pyplot as plt
import polars as pl

In [5]:
# Convert edit_counts to a Polars DataFrame
def generate_dataframe(edit_counts):
    """Build a Polars DataFrame from result records and parse its timestamps.

    Args:
        edit_counts: Records accepted by ``pl.DataFrame``. Each record must
            carry a string ``"timestamp"`` field matching
            ``%Y-%m-%dT%H:%M:%S%.fZ`` (e.g. ``2014-01-01T00:00:00.000Z``).

    Returns:
        The DataFrame with ``"timestamp"`` converted to ``pl.Datetime``.
        The frame is also printed as a side effect.
    """
    df = pl.DataFrame(edit_counts)
    # ``%.f`` consumes the decimal point together with the fractional digits.
    # The earlier ``.%f`` spelling makes chrono read the digits as a count of
    # nanoseconds (".500" would become 500 ns, not 500 ms) and causes Polars
    # to warn about the format string.
    df = df.with_columns(
        pl.col("timestamp").str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.fZ")
    )
    print(df)
    return df

In [6]:
def find_peaks_rolling_3_years(df, threshold_percentage=0.30, window_years=3):
    """Flag rows whose edit count exceeds a trailing-window mean by a margin.

    For every row, the mean of ``"edits"`` is taken over all rows whose
    ``"timestamp"`` lies within the ``365 * window_years`` days up to and
    including that row's own timestamp (so the row itself is always part of
    its window). A row is a peak when its ``"edits"`` value is strictly
    greater than ``mean * (1 + threshold_percentage / 100)``.

    Args:
        df: Polars DataFrame with a ``"timestamp"`` column and a numeric
            ``"edits"`` column.
        threshold_percentage: Margin above the mean, in percent units
            (``30`` means 30 %). NOTE(review): the default of ``0.30`` is
            therefore 0.30 %, not 30 %; it is kept unchanged for backward
            compatibility -- confirm whether ``30`` was intended.
        window_years: Length of the trailing window, counted in 365-day
            years (int). Defaults to 3.

    Returns:
        A ``(peaks, df)`` tuple: the list of row indices flagged as peaks,
        and ``df`` with ``"rolling_mean"``, ``"threshold"`` and
        ``"difference"`` columns added. Rows for which a value cannot be
        computed (null timestamp, null edits, or a window with no non-null
        edits) get nulls in the affected columns and are never peaks.
    """
    # Loop-invariant pieces are built once instead of once per row.
    window = pl.duration(days=365 * window_years)
    multiplier = 1 + threshold_percentage / 100
    timestamps = df["timestamp"]
    edits = df["edits"]

    rolling_means = []
    thresholds = []
    differences = []
    peaks = []

    for i in range(df.height):
        current_timestamp = timestamps[i]
        current_edits = edits[i]

        # Rows inside the closed interval [current - window, current].
        window_mean = df.filter(
            (pl.col("timestamp") <= current_timestamp)
            & (pl.col("timestamp") >= current_timestamp - window)
        )["edits"].mean()

        # ``mean()`` yields None when the window holds no non-null edits;
        # propagate the null rather than failing on ``None * float``.
        if window_mean is None:
            threshold_value = None
        else:
            threshold_value = window_mean * multiplier

        if threshold_value is None or current_edits is None:
            difference = None
        else:
            difference = current_edits - threshold_value
            if current_edits > threshold_value:
                peaks.append(i)

        rolling_means.append(window_mean)
        thresholds.append(threshold_value)
        differences.append(difference)

    # An explicit dtype keeps the schema stable for empty or all-null input.
    df = df.with_columns(
        pl.Series("rolling_mean", rolling_means, dtype=pl.Float64),
        pl.Series("threshold", thresholds, dtype=pl.Float64),
        pl.Series("difference", differences, dtype=pl.Float64),
    )

    return peaks, df


In [7]:
def log_peaks(peaks, df):
    """Print a short report for each detected peak.

    Args:
        peaks: Iterable of row indices to report.
        df: Object indexable first by column name and then by row index,
            providing the ``timestamp``, ``edits``, ``rolling_mean``,
            ``threshold`` and ``difference`` columns.

    Each report is followed by one blank line. Nothing is printed when
    ``peaks`` is empty.
    """
    for idx in peaks:
        def cell(column):
            # Value of ``column`` at the row currently being reported.
            return df[column][idx]

        print(f"Peak Index: {idx}")
        print(f"Timestamp: {cell('timestamp')}")
        print(f"Edits: {cell('edits')}")
        print(f"Rolling Mean: {cell('rolling_mean'):.2f}")
        print(f"Threshold: {cell('threshold'):.2f}")
        print(f"Difference: {cell('difference'):.2f}\n")


In [8]:
# Request parameters for the Wikimedia "edits/aggregate" metrics endpoint.
base_url = "https://wikimedia.org/api/rest_v1/metrics/edits/aggregate"

project = "uz.wikipedia.org"
editor_type = "all-editor-types"
page_type = "all-page-types"
granularity = "monthly"
start = "20140101"
end = "20240101"

# NOTE(review): rolling_window is not referenced anywhere in the visible code.
rolling_window = 3
# Set the threshold percentage (30%)
threshold_percentage = 30

url = f"{base_url}/{project}/{editor_type}/{page_type}/{granularity}/{start}/{end}"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Fail fast: a non-200 reply used to be printed only, after which the cell
# fell through to ``data["items"]`` -- a NameError on a fresh kernel, or a
# silent reuse of a stale ``data`` left over from an earlier run.
if response.status_code != 200:
    raise RuntimeError(f"Error: {response.status_code} - {response.text}")

data = response.json()

edit_counts = data["items"][0]["results"]

df = generate_dataframe(edit_counts)
peaks, df = find_peaks_rolling_3_years(df, threshold_percentage)
log_peaks(peaks, df)

shape: (120, 2)
┌─────────────────────┬───────┐
│ timestamp           ┆ edits │
│ ---                 ┆ ---   │
│ datetime[μs]        ┆ i64   │
╞═════════════════════╪═══════╡
│ 2014-01-01 00:00:00 ┆ 2308  │
│ 2014-02-01 00:00:00 ┆ 2505  │
│ 2014-03-01 00:00:00 ┆ 2216  │
│ 2014-04-01 00:00:00 ┆ 6288  │
│ 2014-05-01 00:00:00 ┆ 5368  │
│ …                   ┆ …     │
│ 2023-08-01 00:00:00 ┆ 33798 │
│ 2023-09-01 00:00:00 ┆ 40689 │
│ 2023-10-01 00:00:00 ┆ 23974 │
│ 2023-11-01 00:00:00 ┆ 40802 │
│ 2023-12-01 00:00:00 ┆ 20211 │
└─────────────────────┴───────┘
Peak Index: 3
Timestamp: 2014-04-01 00:00:00
Edits: 6288
Rolling Mean: 3329.25
Threshold: 4328.03
Difference: 1959.97

Peak Index: 4
Timestamp: 2014-05-01 00:00:00
Edits: 5368
Rolling Mean: 3737.00
Threshold: 4858.10
Difference: 509.90

Peak Index: 11
Timestamp: 2014-12-01 00:00:00
Edits: 8661
Rolling Mean: 3445.00
Threshold: 4478.50
Difference: 4182.50

Peak Index: 12
Timestamp: 2015-01-01 00:00:00
Edits: 13979
Rolling Mean: 4255.31
Th

  df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%fZ"))
