In [1]:
%load_ext autoreload
%autoreload 2

In [54]:
import re
import os
import traceback
from tqdm import tqdm

In [122]:
from config import START_DATE, END_DATE, META_DATA_FOLDER_PATH

from data_utils import load_data, get_unique_articles, extract_main_domain, extract_page_components, get_article_domain_tuples

from meta_data_utils import fetch_revision_data, aggregate_daily_data

In [120]:
# Bind the result of data_utils.get_unique_articles().
# NOTE(review): presumably the de-duplicated article titles - confirm against
# data_utils. `articles` is not referenced by any other cell visible here.
articles = get_unique_articles()

In [123]:
# Bind the (article, domain) pairs to a name instead of discarding them:
# later cells iterate over `article_domain_tuples`, which no other visible
# cell assigns, so a fresh kernel would otherwise hit a NameError there.
article_domain_tuples = get_article_domain_tuples()
len(article_domain_tuples)

39802

In [64]:
# Display the date window that is later passed to fetch_revision_data.
# NOTE(review): no visible cell assigns `start_date` / `end_date`. config
# exports START_DATE / END_DATE, so presumably these were derived from them in
# a cell that no longer exists - confirm and assign them explicitly so the
# notebook survives Restart & Run All.
start_date, end_date

(datetime.datetime(2015, 1, 1, 0, 0), datetime.datetime(2023, 12, 31, 0, 0))

In [90]:
# Text file listing the (article, domain) pairs whose processing raised.
ERROR_LOG = 'error_article_domains.txt'
# Text file listing the pairs for which fetch_revision_data returned nothing.
NO_REVISIONS_LOG = 'no_revisions_article_domains.txt'

def sanitize_filename(filename):
    """Return ``filename`` with each run of path-unsafe characters replaced by ``_``.

    The characters treated as unsafe are backslash, slash, colon, double
    quote, asterisk, question mark, angle brackets and the pipe. Consecutive
    unsafe characters collapse into a single underscore.
    """
    unsafe_run = re.compile(r'[\\/:"*?<>|]+')
    return unsafe_run.sub('_', filename)

def save_daily_agg_meta_data(article, domain, daily_data, output_folder):
    """Write ``daily_data`` to ``<output_folder>/<sanitized article>_<domain>.csv``.

    Nothing is written when ``daily_data`` is empty. The frame's index is
    not included in the CSV.
    """
    if daily_data.empty:
        return

    csv_name = f"{sanitize_filename(article)}_{domain}.csv"
    daily_data.to_csv(os.path.join(output_folder, csv_name), index=False)

# Make sure the output folder exists; exist_ok replaces the separate
# check-then-create step.
os.makedirs(META_DATA_FOLDER_PATH, exist_ok=True)

error_articles = []
no_revisions_articles = []

# NOTE(review): `article_domain_tuples`, `start_date` and `end_date` are not
# assigned in this cell; they have to be defined by cells run earlier.
for article, domain in tqdm(article_domain_tuples, total=len(article_domain_tuples)):
    try:
        sanitized_article = sanitize_filename(article)
        target_path = os.path.join(META_DATA_FOLDER_PATH, f"{sanitized_article}_{domain}.csv")

        # Resumable: a pair whose CSV already exists is not fetched again.
        if os.path.exists(target_path):
            continue

        # A bare "<article>.csv" is adopted as the English-Wikipedia file
        # (presumably output of an earlier naming scheme - confirm).
        existing_file_path = os.path.join(META_DATA_FOLDER_PATH, f"{sanitized_article}.csv")
        if os.path.exists(existing_file_path) and domain == 'en.wikipedia.org':
            os.rename(existing_file_path, target_path)
            continue

        revisions = fetch_revision_data(article, domain, start_date, end_date)
        if not revisions:
            no_revisions_articles.append((article, domain))
            continue

        daily_data = aggregate_daily_data(revisions)
        save_daily_agg_meta_data(article, domain, daily_data, META_DATA_FOLDER_PATH)
    except Exception:
        # Best effort: record the failure and carry on with the next pair.
        error_articles.append((article, domain))
        print(f"Error processing article: {article} {domain}")
        print(traceback.format_exc())

# Titles can be non-ASCII (e.g. Cyrillic), so the logs are written as UTF-8
# rather than in the platform's default encoding, which may not be able to
# represent them.
for logged_pairs, log_path, label in (
    (error_articles, ERROR_LOG, "errors"),
    (no_revisions_articles, NO_REVISIONS_LOG, "no revisions"),
):
    if not logged_pairs:
        continue
    with open(log_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{article} {domain}\n" for article, domain in logged_pairs)
    print(f"Saved list of articles with {label} to {log_path}")

 32%|█████████████████████████████████████████████████████████████████▌                                                                                                                                           | 12718/39802 [00:30<01:20, 336.53it/s]

Error processing article: Шеремет,_Павел_Григорьевич ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="ag

 32%|██████████████████████████████████████████████████████████████████▊                                                                                                                                           | 12905/39802 [00:33<05:00, 89.42it/s]

Error processing article: Путин,_Владимир_Владимирович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="

 34%|█████████████████████████████████████████████████████████████████████▍                                                                                                                                       | 13489/39802 [00:34<01:16, 345.18it/s]

Error processing article: Шойгу,_Сергей_Кужугетович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="agg

 34%|██████████████████████████████████████████████████████████████████████                                                                                                                                       | 13615/39802 [00:35<01:23, 313.63it/s]

Error processing article: Шувалов,_Игорь_Иванович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="agg")

 35%|██████████████████████████████████████████████████████████████████████▉                                                                                                                                      | 13784/39802 [00:35<01:19, 326.01it/s]

Error processing article: С-300 ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="agg")
  File "/Users/aj

 35%|███████████████████████████████████████████████████████████████████████▊                                                                                                                                     | 13947/39802 [00:36<01:27, 294.28it/s]

Error processing article: Кадыров,_Рамзан_Ахматович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="agg

 35%|████████████████████████████████████████████████████████████████████████                                                                                                                                     | 13987/39802 [00:37<03:05, 139.16it/s]

Error processing article: Донецкая_Народная_Республика ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="

 35%|████████████████████████████████████████████████████████████████████████▊                                                                                                                                     | 14072/39802 [00:41<09:21, 45.80it/s]

Error processing article: Россия ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="agg")
  File "/Users/a

 36%|██████████████████████████████████████████████████████████████████████████▎                                                                                                                                  | 14433/39802 [00:42<02:28, 171.27it/s]

Error processing article: Гуцериев,_Михаил_Сафарбекович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name=

 38%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                                               | 15040/39802 [00:44<01:34, 262.29it/s]

Error processing article: Захарченко,_Александр_Владимирович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_

 38%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                                               | 15102/39802 [00:45<02:47, 147.57it/s]

Error processing article: Зеленский,_Владимир_Александрович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_n

 38%|██████████████████████████████████████████████████████████████████████████████                                                                                                                               | 15147/39802 [00:46<03:20, 123.06it/s]

Error processing article: Медведев,_Дмитрий_Анатольевич ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name=

 39%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                                            | 15665/39802 [00:47<01:28, 273.59it/s]

Error processing article: Лукашенко,_Александр_Григорьевич ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_na

  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 36158/39802 [56:22<27:04,  2.24it/s]

Error processing article: Стрелков,_Игорь_Иванович ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="agg"

 91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 36307/39802 [56:59<22:37,  2.57it/s]

Error processing article: Луганская_Народная_Республика ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name=

 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 36382/39802 [57:25<23:31,  2.42it/s]

Error processing article: Катастрофа_Boeing_777_в_Донецкой_области ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_li

 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍               | 36763/39802 [1:00:25<10:15,  4.94it/s]

Error processing article: Авдеевка ru.wikipedia.org
Traceback (most recent call last):
  File "/var/folders/1k/l9m7dlqd1knbl3m543_55s4h0000gn/T/ipykernel_14673/3224756989.py", line 37, in <module>
    daily_data = aggregate_daily_data(revisions)
  File "/Users/ajaykarthicksenthilkumar/dev/personal/wiki-forcast/ajay/meta_data_utils.py", line 66, in aggregate_daily_data
    daily_aggregation = df.groupby('date').agg(
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 1432, in aggregate
    result = op.agg()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 190, in agg
    return self.agg_dict_like()
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/web_traffic_forecast/lib/python3.9/site-packages/pandas/core/apply.py", line 423, in agg_dict_like
    return self.agg_or_apply_dict_like(op_name="agg")
  File "/Users

  df['timestamp'] = pd.to_datetime(df['timestamp'])
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39802/39802 [1:07:14<00:00,  9.87it/s]

Saved list of articles with errors to error_article_domains.txt
Saved list of articles with no revisions to no_revisions_article_domains.txt





In [115]:
import requests
import pandas as pd
from tqdm import tqdm
from urllib.parse import quote


# Failures of the summary fetch get their own file. This rebinds the ERROR_LOG
# name that the metadata cell set to 'error_article_domains.txt'.
ERROR_LOG = 'error_articles_summary.txt'

def encode_article_title(title):
    """Percent-encode the URL-significant characters of an article title.

    Only ``% / ? ' " & = #`` are encoded; every other character (including
    non-ASCII letters) is returned unchanged.

    Args:
        title: Article title to embed as a single URL path segment.

    Returns:
        The title with each listed symbol replaced by its ``%XX`` escape.
    """
    symbols_to_encode = ['%', '/', '?', "'", '"', '&', '=', '#']
    # safe='' is required: quote() treats '/' as safe by default, so
    # quote('/') returns '/' and slashes in titles were never encoded.
    # A single translate() pass also guarantees that the '%' of an inserted
    # escape is never encoded a second time.
    escapes = {ord(symbol): quote(symbol, safe='') for symbol in symbols_to_encode}
    return title.translate(escapes)

def fetch_wikipedia_summary(title, domain, timeout=10):
    """Return the REST "page summary" extract of an article, or None on failure.

    English Wikipedia is tried first, then the article's own ``domain``; the
    first request that succeeds wins.

    Args:
        title: Article title (encoded with ``encode_article_title``).
        domain: Wikipedia host of the article, e.g. ``ru.wikipedia.org``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The ``extract`` field of the first successful response (``''`` when
        the response has no such field), or ``None`` if every request failed.
    """
    encoded_title = encode_article_title(title)
    # dict.fromkeys keeps the order but drops the duplicate host when
    # ``domain`` is en.wikipedia.org, so the same URL is not requested twice.
    hosts = dict.fromkeys(["en.wikipedia.org", domain])

    for host in hosts:
        url = f"https://{host}/api/rest_v1/page/summary/{encoded_title}"
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.json().get('extract', '')
        except (requests.RequestException, ValueError):
            # Best effort: network/HTTP errors and undecodable bodies simply
            # fall through to the next host.
            continue

    return None

# Self-contained, resumable summary fetch.
# The original cell relied on `summary_df`, `summaries` and `error_articles`
# already living in the kernel (their initialisation was commented out), so it
# raised NameError on a fresh kernel and appended summary failures to whatever
# `error_articles` list an earlier cell had left behind.
SUMMARY_CSV = 'article_summaries.csv'  # the file the following cell writes
SUMMARY_COLUMNS = ['article', 'domain', 'summary']

try:
    # keep_default_na=False keeps titles/extracts such as "NaN" or "" as
    # strings instead of turning them into missing values.
    summary_df = pd.read_csv(SUMMARY_CSV, keep_default_na=False)[SUMMARY_COLUMNS]
except (FileNotFoundError, pd.errors.EmptyDataError, KeyError):
    # No usable cache yet: start from an empty frame.
    summary_df = pd.DataFrame(columns=SUMMARY_COLUMNS)

# Start from the cached rows so the result holds old and new summaries alike.
summaries = summary_df.to_dict('records')
error_articles = []

# One set lookup per pair instead of a full DataFrame scan per pair.
already_fetched = set(zip(summary_df['article'], summary_df['domain']))

# NOTE(review): `article_domain_tuples` has to be defined by a cell run earlier.
for article, domain in tqdm(article_domain_tuples, total=len(article_domain_tuples)):
    if (article, domain) in already_fetched:
        continue  # Skip if summary already exists

    summary = fetch_wikipedia_summary(article, domain)
    if summary is None:
        error_articles.append((article, domain))
    else:
        summaries.append({'article': article, 'domain': domain, 'summary': summary})

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39802/39802 [07:12<00:00, 92.09it/s]


In [None]:
# Explicit columns keep the header stable even when nothing was fetched, so
# the CSV can always be read back.
summary_df = pd.DataFrame(summaries, columns=['article', 'domain', 'summary'])
summary_df.to_csv('article_summaries.csv', index=False)

if error_articles:
    # Titles can be non-ASCII (e.g. Cyrillic), so write UTF-8 rather than the
    # platform's default encoding. Titles may also contain commas themselves;
    # the domain is always the last comma-separated field of a line.
    with open(ERROR_LOG, 'w', encoding='utf-8') as f:
        f.writelines(f"{article},{domain}\n" for article, domain in error_articles)
    print(f"Saved list of articles with errors to {ERROR_LOG}")


In [None]:
import requests
import json
from tqdm import tqdm

def fetch_categories(article_title, timeout=10):
    """Return the category titles of an article on English Wikipedia.

    Uses the MediaWiki Action API (``action=query&prop=categories``) on
    en.wikipedia.org and collects the ``title`` of every category reported
    for the first page in the response, following API continuation until the
    list is complete.

    Args:
        article_title: Title passed as the ``titles`` query parameter.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of category titles. An empty list is returned when the page
        has no categories or when any error occurs (the error is printed).
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": article_title,
        "prop": "categories",
        # The API returns only 10 categories per request by default; ask for
        # the maximum and follow continuation so none are silently dropped.
        "cllimit": "max",
    }
    categories = []
    try:
        while True:
            response = requests.get(url, params=params, timeout=timeout)
            # Without this call the HTTPError handler below can never fire.
            response.raise_for_status()
            data = response.json()
            pages = data['query']['pages']
            page_id = next(iter(pages))
            categories.extend(cat['title'] for cat in pages[page_id].get('categories', []))
            if 'continue' not in data:
                return categories
            # Feed the continuation tokens back into the next request.
            params.update(data['continue'])

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} - {article_title}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err} - {article_title}")
    except Exception as err:
        print(f"An error occurred: {err} - {article_title}")
    return []

# Fetch the categories of every distinct article title and persist them as JSON.
# NOTE(review): `page_info_df` is not assigned in any cell visible here; it has
# to be defined by a cell run earlier.
article_column = page_info_df['article']
category_dict = {}

progress = tqdm(article_column.unique(), total=article_column.nunique())
for article_title in progress:
    categories = fetch_categories(article_title)
    category_dict[article_title] = categories

with open('category_data.json', 'w') as out:
    json.dump(category_dict, out, indent=4)

print("Category data has been written to 'category_data.json'.")

In [None]:
from collections import defaultdict

# Invert the article -> categories mapping into category -> [articles].
category_to_articles = defaultdict(list)
for article in category_dict:
    categories = category_dict[article]
    for category in categories:
        category_to_articles[category].append(article)

# Number of distinct categories seen across all articles.
len(category_to_articles)