In [1]:
pip install mwclient

Note: you may need to restart the kernel to use updated packages.


In [2]:
import mwclient
import time

site = mwclient.Site('en.wikipedia.org')
page = site.pages['Binance_coin'] #Binance_coin page

In [3]:
revs = list(page.revisions()) #page revision , big list of edits

In [4]:
revs[0]

OrderedDict([('revid', 1112780811),
             ('parentid', 1112780768),
             ('user', 'Bovlb'),
             ('timestamp',
              time.struct_time(tm_year=2022, tm_mon=9, tm_mday=28, tm_hour=2, tm_min=58, tm_sec=2, tm_wday=2, tm_yday=271, tm_isdst=-1)),
             ('comment',
              'Undid revision 1112780768 by [[Special:Contributions/Bovlb|Bovlb]] ([[User talk:Bovlb|talk]])')])

In [5]:
revs = sorted(revs, key=lambda rev: rev["timestamp"]) 

In [6]:
revs[0] #first entry

OrderedDict([('revid', 1027518566),
             ('parentid', 0),
             ('user', 'Skakkle'),
             ('timestamp',
              time.struct_time(tm_year=2021, tm_mon=6, tm_mday=8, tm_hour=11, tm_min=34, tm_sec=46, tm_wday=1, tm_yday=159, tm_isdst=-1)),
             ('comment',
              '[[WP:AES|←]]Redirected page to [[Binance#Cryptocurrencies]]')])

In [7]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [8]:
import torch
print(torch.__version__)

2.2.0


Finding the sentiment of page edits

In [9]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"] #strengt of the sentiment
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

  torch.utils._pytree._register_pytree_node(
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [10]:
edits = {}

for rev in revs:        
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
    
    edits[date]["edit_count"] += 1
    
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))

In [11]:
from statistics import mean

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
    
    del edits[key]["sentiments"]

In [12]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [13]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2021-06-08,1,-0.995536,1.0
2021-06-23,1,-0.984262,1.0
2022-09-28,3,-0.946034,1.0


In [14]:
edits_df.index = pd.to_datetime(edits_df.index)

In [15]:
from datetime import datetime

dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [16]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [17]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,0,0.0,0.0
2009-03-09,0,0.0,0.0
2009-03-10,0,0.0,0.0
2009-03-11,0,0.0,0.0
2009-03-12,0,0.0,0.0
...,...,...,...
2024-04-11,0,0.0,0.0
2024-04-12,0,0.0,0.0
2024-04-13,0,0.0,0.0
2024-04-14,0,0.0,0.0


In [18]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [19]:
rolling_edits = rolling_edits.dropna()

In [20]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.0,0.0,0.0
2009-04-07,0.0,0.0,0.0
2009-04-08,0.0,0.0,0.0
2009-04-09,0.0,0.0,0.0
2009-04-10,0.0,0.0,0.0
...,...,...,...
2024-04-11,0.0,0.0,0.0
2024-04-12,0.0,0.0,0.0
2024-04-13,0.0,0.0,0.0
2024-04-14,0.0,0.0,0.0


In [21]:
rolling_edits.to_csv("wikipedia_edits_bnc.csv")