In [1]:
#standard deviation
#t-test
#seperate functions to find rolling means and calculating peaks
#significant peaks to be excluded from calculating means but should be shown as peaks in the logs

In [42]:
import requests
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import duckdb

In [3]:
def generate_dataframe(edit_counts):
    # Convert data to a pandas DataFrame
    df = pd.DataFrame(edit_counts)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df

In [34]:
# Function to identify peaks based on the rolling mean of the past 3 years
def find_peaks_rolling_3_years(df, threshold_percentage=0.50):
    peaks = []
    
    # Iterate over each timestamp
    for i in range(len(df)):  # Start from the first element
        # Get data within the last 3 years
        past_3_years_data = df[df['timestamp'] <= df['timestamp'][i]]
        past_3_years_data = past_3_years_data[past_3_years_data['timestamp'] >= (df['timestamp'][i] - pd.DateOffset(years=3))]

        # Calculate the rolling mean of the last 3 years (average of 'edits' in the past 3 years)
        rolling_mean = past_3_years_data['edits'].mean()

        # Calculate the threshold (mean + 40%)
        threshold = rolling_mean * (1 + threshold_percentage)
        
        #calculate percenrage difference
        percentage_difference = (df["edits"][i]-rolling_mean)*100/rolling_mean
        
        # Check if the current value is above the threshold
        if df['edits'][i] >= threshold:
            peaks.append((df['timestamp'][i], df['edits'][i], rolling_mean, threshold, percentage_difference))

    return peaks

In [37]:
# Log peaks (timestamp, edits, rolling mean, threshold)
def log_peaks(peaks):
    for peak in peaks:
        print(f"Peak: {peak[0].strftime('%Y-%m-%d')}, Edits: {peak[1]}, Rolling Mean: {peak[2]:.2f}, Threshold: {peak[3]:.2f}, percentage difference : {peak[4]: .2f}")


In [38]:
base_url = "https://wikimedia.org/api/rest_v1/metrics/edits/aggregate"

project = "uz.wikipedia.org"      
editor_type = "all-editor-types"  
page_type = "content"      
granularity = "monthly"           
start = "20200101"                
end = "20240101"  

rolling_window = 3
# Set the threshold percentage (30%)
threshold_percentage = 0.30


url = f"{base_url}/{project}/{editor_type}/{page_type}/{granularity}/{start}/{end}"

response = requests.get(url)


if response.status_code == 200:
    data = response.json()
    #print("Number of edits data:")
    #print(data)
else:
    print(f"Error: {response.status_code} - {response.text}")
    
    
edit_counts=data["items"][0]["results"]

df = generate_dataframe(edit_counts)
peaks = find_peaks_rolling_3_years(df, threshold_percentage)
log_peaks(peaks)

Peak: 2020-03-01, Edits: 5272, Rolling Mean: 3354.67, Threshold: 4361.07, percentage difference :  57.15
Peak: 2020-04-01, Edits: 5760, Rolling Mean: 3956.00, Threshold: 5142.80, percentage difference :  45.60
Peak: 2020-12-01, Edits: 5525, Rolling Mean: 4100.00, Threshold: 5330.00, percentage difference :  34.76
Peak: 2021-01-01, Edits: 12253, Rolling Mean: 4727.15, Threshold: 6145.30, percentage difference :  159.20
Peak: 2021-07-01, Edits: 8814, Rolling Mean: 4579.53, Threshold: 5953.38, percentage difference :  92.47
Peak: 2021-08-01, Edits: 14835, Rolling Mean: 5092.30, Threshold: 6619.99, percentage difference :  191.32
Peak: 2021-09-01, Edits: 65605, Rolling Mean: 7973.86, Threshold: 10366.01, percentage difference :  722.75
Peak: 2021-10-01, Edits: 59519, Rolling Mean: 10316.82, Threshold: 13411.86, percentage difference :  476.91
Peak: 2021-11-01, Edits: 31284, Rolling Mean: 11228.43, Threshold: 14596.97, percentage difference :  178.61
Peak: 2021-12-01, Edits: 51113, Rolling 

In [39]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['timestamp'], y=df['edits'], mode='lines+markers', name='Edits', line=dict(color='blue')))

peak_timestamps = [peak[0] for peak in peaks]
peak_values = [peak[1] for peak in peaks]
fig.add_trace(go.Scatter(x=peak_timestamps, y=peak_values, mode='markers', name='Peaks Above Threshold', 
                         marker=dict(color='red', size=10, symbol='circle')))

fig.update_layout(
    title="Edits count over time with peaks over a threshold(30%) with rolling mean of 3 years",
    xaxis_title="Timestamp",
    yaxis_title="Count(Edits)",
    xaxis=dict(tickformat="%Y-%m-%d", tickangle=45),
    showlegend=True
)

# Display the plot
fig.show()


In [40]:
my_df=pd.DataFrame(peaks,columns=[["Timestamp","Edit_count","Rolling_mean","Threshold","Percentage_Difference"]])
my_df

Unnamed: 0,Timestamp,Edit_count,Rolling_mean,Threshold,Percentage_Difference
0,2020-03-01 00:00:00+00:00,5272,3354.666667,4361.066667,57.154213
1,2020-04-01 00:00:00+00:00,5760,3956.0,5142.8,45.601618
2,2020-12-01 00:00:00+00:00,5525,4100.0,5330.0,34.756098
3,2021-01-01 00:00:00+00:00,12253,4727.153846,6145.3,159.204595
4,2021-07-01 00:00:00+00:00,8814,4579.526316,5953.384211,92.46532
5,2021-08-01 00:00:00+00:00,14835,5092.3,6619.99,191.322192
6,2021-09-01 00:00:00+00:00,65605,7973.857143,10366.014286,722.751133
7,2021-10-01 00:00:00+00:00,59519,10316.818182,13411.863636,476.912367
8,2021-11-01 00:00:00+00:00,31284,11228.434783,14596.965217,178.614078
9,2021-12-01 00:00:00+00:00,51113,12890.291667,16757.379167,296.523223


In [43]:

duckdb.sql("CREATE TABLE community_alert_logs_table AS SELECT * FROM my_df")

duckdb.sql("INSERT INTO community_alert_logs_table SELECT * FROM my_df")

In [49]:

duckdb.sql("SELECT * FROM community_alert_logs_table LIMIT 10")

┌───────────────────────────┬─────────────────┬────────────────────┬────────────────────┬────────────────────────────┐
│      ('Timestamp',)       │ ('Edit_count',) │ ('Rolling_mean',)  │   ('Threshold',)   │ ('Percentage_Difference',) │
│ timestamp with time zone  │      int64      │       double       │       double       │           double           │
├───────────────────────────┼─────────────────┼────────────────────┼────────────────────┼────────────────────────────┤
│ 2020-03-01 05:30:00+05:30 │            5272 │ 3354.6666666666665 │  4361.066666666667 │         57.154213036565984 │
│ 2020-04-01 05:30:00+05:30 │            5760 │             3956.0 │             5142.8 │         45.601617795753285 │
│ 2020-12-01 05:30:00+05:30 │            5525 │             4100.0 │             5330.0 │          34.75609756097561 │
│ 2021-01-01 05:30:00+05:30 │           12253 │  4727.153846153846 │  6145.299999999999 │         159.20459538183653 │
│ 2021-07-01 05:30:00+05:30 │            8814 │ 