In [6]:
# Fallback list of categories, used when scraping the arXiv site does not work.
# Maps a category code (e.g. 'math.AC') to a dict with 'name' and 'description'.
# Only math.* categories are listed here.
TAXONOMY_R = {'math.AC': {'name': 'Commutative Algebra',
  'description': 'Commutative rings, modules, ideals, homological algebra, computational aspects, invariant theory, connections to algebraic geometry and combinatorics'},
 'math.AG': {'name': 'Algebraic Geometry',
  'description': 'Algebraic varieties, stacks, sheaves, schemes, moduli spaces, complex geometry, quantum cohomology'},
 'math.AP': {'name': 'Analysis of PDEs',
  'description': "Existence and uniqueness, boundary conditions, linear and non-linear operators, stability, soliton theory, integrable PDE's, conservation laws, qualitative dynamics"},
 'math.AT': {'name': 'Algebraic Topology',
  'description': 'Homotopy theory, homological algebra, algebraic treatments of manifolds'},
 'math.CA': {'name': 'Classical Analysis and ODEs',
  'description': "Special functions, orthogonal polynomials, harmonic analysis, ODE's, differential relations, calculus of variations, approximations, expansions, asymptotics"},
 'math.CO': {'name': 'Combinatorics',
  'description': 'Discrete mathematics, graph theory, enumeration, combinatorial optimization, Ramsey theory, combinatorial game theory'},
 'math.CT': {'name': 'Category Theory',
  'description': 'Enriched categories, topoi, abelian categories, monoidal categories, homological algebra'},
 'math.CV': {'name': 'Complex Variables',
  'description': 'Holomorphic functions, automorphic group actions and forms, pseudoconvexity, complex geometry, analytic spaces, analytic sheaves'},
 'math.DG': {'name': 'Differential Geometry',
  'description': 'Complex, contact, Riemannian, pseudo-Riemannian and Finsler geometry, relativity, gauge theory, global analysis'},
 'math.DS': {'name': 'Dynamical Systems',
  'description': 'Dynamics of differential equations and flows, mechanics, classical few-body problems, iterations, complex dynamics, delayed differential equations'},
 'math.FA': {'name': 'Functional Analysis',
  'description': 'Banach spaces, function spaces, real functions, integral transforms, theory of distributions, measure theory'},
 'math.GM': {'name': 'General Mathematics',
  'description': 'Mathematical material of general interest, topics not covered elsewhere'},
 'math.GN': {'name': 'General Topology',
  'description': 'Continuum theory, point-set topology, spaces with algebraic structure, foundations, dimension theory, local and global properties'},
 'math.GR': {'name': 'Group Theory',
  'description': 'Finite groups, topological groups, representation theory, cohomology, classification and structure'},
 'math.GT': {'name': 'Geometric Topology',
  'description': 'Manifolds, orbifolds, polyhedra, cell complexes, foliations, geometric structures'},
 'math.HO': {'name': 'History and Overview',
  'description': 'Biographies, philosophy of mathematics, mathematics education, recreational mathematics, communication of mathematics, ethics in mathematics'},
 'math.IT': {'name': 'Information Theory',
  'description': 'math.IT is an alias for cs.IT. Covers theoretical and experimental aspects of information theory and coding.'},
 'math.KT': {'name': 'K-Theory and Homology',
  'description': 'Algebraic and topological K-theory, relations with topology, commutative algebra, and operator algebras'},
 'math.LO': {'name': 'Logic',
  'description': 'Logic, set theory, point-set topology, formal mathematics'},
 'math.MG': {'name': 'Metric Geometry',
  'description': 'Euclidean, hyperbolic, discrete, convex, coarse geometry, comparisons in Riemannian geometry, symmetric spaces'},
 'math.MP': {'name': 'Mathematical Physics',
  'description': 'math.MP is an alias for math-ph. Articles in this category focus on areas of research that illustrate the application of mathematics to problems in physics, develop mathematical methods for such applications, or provide mathematically rigorous formulations of existing physical theories. Submissions to math-ph should be of interest to both physically oriented mathematicians and mathematically oriented physicists; submissions which are primarily of interest to theoretical physicists or to mathematicians should probably be directed to the respective physics/math categories'},
 'math.NA': {'name': 'Numerical Analysis',
  'description': 'Numerical algorithms for problems in analysis and algebra, scientific computation'},
 'math.NT': {'name': 'Number Theory',
  'description': 'Prime numbers, diophantine equations, analytic number theory, algebraic number theory, arithmetic geometry, Galois theory'},
 'math.OA': {'name': 'Operator Algebras',
  'description': 'Algebras of operators on Hilbert space, C^*-algebras, von Neumann algebras, non-commutative geometry'},
 'math.OC': {'name': 'Optimization and Control',
  'description': 'Operations research, linear programming, control theory, systems theory, optimal control, game theory'},
 'math.PR': {'name': 'Probability',
  'description': 'Theory and applications of probability and stochastic processes: e.g. central limit theorems, large deviations, stochastic differential equations, models from statistical mechanics, queuing theory'},
 'math.QA': {'name': 'Quantum Algebra',
  'description': 'Quantum groups, skein theories, operadic and diagrammatic algebra, quantum field theory'},
 'math.RA': {'name': 'Rings and Algebras',
  'description': 'Non-commutative rings and algebras, non-associative algebras, universal algebra and lattice theory, linear algebra, semigroups'},
 'math.RT': {'name': 'Representation Theory',
  'description': 'Linear representations of algebras and groups, Lie theory, associative algebras, multilinear algebra'},
 'math.SG': {'name': 'Symplectic Geometry',
  'description': 'Hamiltonian systems, symplectic flows, classical integrable systems'},
 'math.SP': {'name': 'Spectral Theory',
  'description': 'Schrodinger operators, operators on manifolds, general differential operators, numerical studies, integral operators, discrete models, resonances, non-self-adjoint operators, random operators/matrices'},
 'math.ST': {'name': 'Statistics Theory',
  'description': 'Applied, computational and theoretical statistics: e.g. statistical inference, regression, time series, multivariate analysis, data analysis, Markov chain Monte Carlo, design of experiments, case studies'}
               }

In [7]:
import arxiv
import pandas as pd
import json
import requests
import re
from bs4 import BeautifulSoup
from google.cloud import bigquery
from google.oauth2 import service_account
from datetime import datetime, timedelta
import json
import os

# ==========================================
# CONFIGURATION
# ==========================================

# Path to the Google service-account key file used for BigQuery access
KEY_PATH = "/home/nkrishelie/Python/Store/AllsoftEcom.json"
PROJECT_ID = "burnished-yeti-250015"
DATASET_ID = "arXiv"
TABLE_ID = "articles"
# Fully qualified "<project>.<dataset>.<table>" reference used in SQL and load jobs
FULL_TABLE_REF = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"

# Fetch settings
# Size, in days, of the incremental download window.
# NOTE(review): the original comment said "last week" while the value is 1 day — confirm the intended window.
DAYS_TO_FETCH = 1
ANALYSIS_PERIOD_DAYS = 365  # Analytics look at one year of data
TOP_LIMIT_PER_CAT = 15  # Max number of articles kept per primary category

# Initialise the BigQuery client (service-account credentials) and the arXiv API client
credentials = service_account.Credentials.from_service_account_file(KEY_PATH)
bq_client = bigquery.Client(project=PROJECT_ID, credentials=credentials)
arxiv_client = arxiv.Client()



In [8]:
# ==========================================
# STEP 1: UNIVERSAL TAXONOMY WITH CACHING
# ==========================================

TAXONOMY_CACHE_FILE = "arxiv_taxonomy_cache.json"

# (substring of the accordion section title, short group code); checked in
# this order, the first match wins, anything else falls into "other".
_SECTION_GROUPS = (
    ("Mathematics", "math"),
    ("Computer Science", "cs"),
    ("Physics", "physics"),
    ("Biology", "bio"),
    ("Finance", "fin"),
    ("Statistics", "stat"),
    ("Economics", "econ"),
    ("Electrical", "eess"),
)

# Matches "<archive>.<subject> (<human readable name>)" in an <h4> heading.
# NOTE(review): codes without a dot are not matched by this pattern; confirm
# whether dot-less arXiv categories should be included as well.
_CATEGORY_RE = re.compile(r'([a-z\-]+\.[A-Z\-a-z]{2,})\s*\((.+)\)')


def _parse_taxonomy_html(html):
    """Extract {code: {'name', 'description', 'group'}} from the taxonomy page HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    taxonomy = {}

    for section in soup.find_all('h2', class_='accordion-head'):
        section_name = section.get_text(strip=True)
        group_code = next(
            (code for marker, code in _SECTION_GROUPS if marker in section_name),
            "other",
        )

        content_block = section.find_next_sibling('div', class_='accordion-body')
        if content_block is None:
            continue

        for h4 in content_block.find_all('h4'):
            match = _CATEGORY_RE.search(h4.get_text(separator=' ', strip=True))
            if match is None:
                continue

            # The description is the first <p> of the column that follows
            # the column containing the heading.
            desc = ""
            parent = h4.find_parent('div', class_='column')
            if parent is not None:
                desc_col = parent.find_next_sibling('div', class_='column')
                paragraph = desc_col.find('p') if desc_col is not None else None
                if paragraph is not None:
                    desc = paragraph.get_text(strip=True)

            taxonomy[match.group(1).strip()] = {
                'name': match.group(2).strip(),
                'description': desc,
                'group': group_code,
            }

    return taxonomy


def _load_cached_taxonomy():
    """Return the taxonomy stored in TAXONOMY_CACHE_FILE, or {} if missing/unusable."""
    if not os.path.exists(TAXONOMY_CACHE_FILE):
        return {}
    try:
        with open(TAXONOMY_CACHE_FILE, 'r', encoding='utf-8') as f:
            cached = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"Error reading cache: {e}")
        return {}
    if not isinstance(cached, dict):
        # A cache that is valid JSON but not a mapping cannot be a taxonomy.
        print("Error reading cache: unexpected format, ignoring cache file.")
        return {}
    print(f"Found cached taxonomy with {len(cached)} entries.")
    return cached


def get_universal_taxonomy():
    """
    Load the arXiv category taxonomy, preferring the live site over the cache.

    1. Scrape https://arxiv.org/category_taxonomy.
    2. If the scrape yields at least as many categories as the cache (and
       more than zero), store it in TAXONOMY_CACHE_FILE and return it.
    3. If the scrape fails or yields fewer categories than the cache,
       return the cached version.

    Returns:
        dict mapping category code -> {'name', 'description', 'group'};
        an empty dict when neither the site nor the cache is usable.

    Never raises for network, parsing or cache I/O problems; they are
    reported on stdout and the next fallback is used.
    """
    url = "https://arxiv.org/category_taxonomy"
    fetched_taxonomy = {}

    # --- 1. Download attempt ---
    print("Attempting to fetch taxonomy from arXiv...")
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Parsed into a fresh dict: if parsing raises midway, the partial
            # result is discarded instead of being persisted to the cache.
            fetched_taxonomy = _parse_taxonomy_html(response.content)
        else:
            print(f"Error fetching from web: HTTP status {response.status_code}")
    except Exception as e:
        # Best-effort boundary: any failure here falls back to the cache.
        print(f"Error fetching from web: {e}")

    # --- 2. Cache ---
    cached_taxonomy = _load_cached_taxonomy()

    # --- 3. Decision (validation & fallback) ---

    # Fewer categories than the cache (or none at all) -> roll back to the cache.
    if len(fetched_taxonomy) < len(cached_taxonomy):
        print(f"Warning: Fetched only {len(fetched_taxonomy)} categories, but cache has {len(cached_taxonomy)}.")
        print("Using CACHED version to ensure data integrity.")
        return cached_taxonomy

    if fetched_taxonomy:
        print(f"Success: Fetched {len(fetched_taxonomy)} categories (Cache had {len(cached_taxonomy)}).")
        print("Updating cache file...")
        try:
            with open(TAXONOMY_CACHE_FILE, 'w', encoding='utf-8') as f:
                json.dump(fetched_taxonomy, f, ensure_ascii=False, indent=2)
        except OSError as e:
            # A read-only or full disk must not discard a good download.
            print(f"Error writing cache: {e}")
        return fetched_taxonomy

    # Neither the web fetch nor the cache produced anything.
    print("Critical Warning: Web fetch failed and no cache found.")
    return {}

# Run: build the module-level taxonomy from the site or the cache file
TAXONOMY = get_universal_taxonomy()

# --- 4. Hard-coded last resort (neither the site nor the cache file worked) ---
# TAXONOMY_R entries carry only 'name' and 'description' (no 'group' key).
if not TAXONOMY:
    print("Using hardcoded fallback.")
    TAXONOMY = TAXONOMY_R

print(f"Final Taxonomy loaded: {len(TAXONOMY)} categories.")

Attempting to fetch taxonomy from arXiv...
Success: Fetched 146 categories (Cache had 0).
Updating cache file...
Final Taxonomy loaded: 146 categories.


In [9]:
# ==========================================
# 2. EXTRACT & LOAD (ArXiv -> BigQuery)
# ==========================================
from datetime import timezone

print(f"Fetching articles for the last {DAYS_TO_FETCH} days...")

# One search across all math categories, newest submissions first.
search = arxiv.Search(
    query="cat:math.*",
    max_results=2000,  # generous upper bound for the fetch window
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# Publication times are compared after dropping tzinfo, so the cutoff must be
# expressed in the same clock. NOTE(review): assumes r.published is UTC —
# confirm against the arxiv client documentation. Using the local clock here
# shifted the window by the machine's UTC offset.
now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
cutoff_date = now_utc - timedelta(days=DAYS_TO_FETCH)
# Naive ISO string in UTC: same format as before, but no longer local time
# written into a TIMESTAMP column.
loaded_at = now_utc.isoformat()

new_rows = []
seen_ids = set()

for r in arxiv_client.results(search):
    pub_date = r.published.replace(tzinfo=None)
    if pub_date < cutoff_date:
        break  # results are sorted by submission date: everything after this is older

    # Keep only categories known to the taxonomy; skip articles with none.
    cats = [c for c in r.categories if c in TAXONOMY]
    if not cats:
        continue

    # A repeated id in the staging table can make the MERGE below fail
    # (several source rows matching one target row), so keep the first
    # occurrence only.
    article_id = r.entry_id.split('/')[-1]
    if article_id in seen_ids:
        continue
    seen_ids.add(article_id)

    new_rows.append({
        "id": article_id,
        "title": r.title.replace('\n', ' '),
        "abstract": r.summary.replace('\n', ' '),
        "authors": [a.name for a in r.authors],
        "categories": cats,
        "primary_category": r.primary_category,
        "published_date": pub_date.strftime("%Y-%m-%d"),
        "updated_at": loaded_at,
        "url": r.entry_id
    })

if new_rows:
    print(f"Uploading {len(new_rows)} new articles to BigQuery...")

    # Stage the rows in a temporary table so the MERGE is a single statement.
    temp_table_id = f"{PROJECT_ID}.{DATASET_ID}.temp_upload"

    # The full schema is given explicitly so the staging table always matches
    # the row structure above.
    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("id", "STRING"),
            bigquery.SchemaField("title", "STRING"),
            bigquery.SchemaField("abstract", "STRING"),
            bigquery.SchemaField("authors", "STRING", mode="REPEATED"),
            bigquery.SchemaField("categories", "STRING", mode="REPEATED"),
            bigquery.SchemaField("primary_category", "STRING"),
            bigquery.SchemaField("published_date", "DATE"),
            bigquery.SchemaField("updated_at", "TIMESTAMP"),
            bigquery.SchemaField("url", "STRING"),
        ],
        write_disposition="WRITE_TRUNCATE",  # staging table is overwritten on every run
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    )

    # Bound before the try so the error handler never touches an undefined
    # (or stale, from an earlier cell run) job object.
    job = None
    try:
        job = bq_client.load_table_from_json(new_rows, temp_table_id, job_config=job_config)
        job.result()  # wait for the load to finish
        print("Temp table loaded.")

        # MERGE: refresh title/categories/updated_at of known ids, insert new ids.
        merge_query = f"""
        MERGE `{FULL_TABLE_REF}` T
        USING `{temp_table_id}` S
        ON T.id = S.id
        WHEN MATCHED THEN
          UPDATE SET
            T.title = S.title,
            T.categories = S.categories,
            T.updated_at = S.updated_at
        WHEN NOT MATCHED THEN
          INSERT (id, title, abstract, authors, categories, primary_category, published_date, updated_at, url)
          VALUES (id, title, abstract, authors, categories, primary_category, S.published_date, S.updated_at, url)
        """
        bq_client.query(merge_query).result()
        print("Merge complete.")

    except Exception as e:
        print(f"BigQuery Error: {e}")
        # Row-level validation details, when the load job got far enough to have any.
        job_errors = getattr(job, 'errors', None)
        if job_errors:
            print("Detailed errors:", job_errors)
else:
    print("No new articles found.")



Fetching articles for the last 30 days...
Uploading 2000 new articles to BigQuery...
Temp table loaded.
Merge complete.


In [None]:
# ==========================================
# 3. TRANSFORM (SQL Analytics)
# ==========================================

print("Running SQL analytics...")

# A. Link weights (cross-category co-occurrence)
# Counts, over the analysis period, how many articles list both categories
# of each unordered pair; pairs with a count of 2 or less are dropped.
links_query = f"""
WITH exploded AS (
    SELECT id, cat 
    FROM `{FULL_TABLE_REF}`, UNNEST(categories) cat
    WHERE published_date >= DATE_SUB(CURRENT_DATE(), INTERVAL {ANALYSIS_PERIOD_DAYS} DAY)
)
SELECT 
    t1.cat as source, 
    t2.cat as target, 
    COUNT(*) as weight
FROM exploded t1
JOIN exploded t2 ON t1.id = t2.id
WHERE t1.cat < t2.cat -- Избегаем дублей (A-B и B-A) и петель (A-A)
GROUP BY 1, 2
HAVING weight > 2 -- Фильтр шума
ORDER BY weight DESC
"""
links_df = bq_client.query(links_query).to_dataframe()

# B. Top articles to use as nodes
# Ranking as implemented below: score = number of categories * 100, ties
# broken by the most recent published_date; the top TOP_LIMIT_PER_CAT rows
# per primary_category are kept.
# NOTE(review): the original comment also described a recency term in the
# score itself; the SQL has no such term.
nodes_query = f"""
WITH scored_articles AS (
    SELECT 
        *,
        (ARRAY_LENGTH(categories) * 100) AS score
    FROM `{FULL_TABLE_REF}`
    WHERE published_date >= DATE_SUB(CURRENT_DATE(), INTERVAL {ANALYSIS_PERIOD_DAYS} DAY)
),
ranked AS (
    SELECT 
        *,
        ROW_NUMBER() OVER(PARTITION BY primary_category ORDER BY score DESC, published_date DESC) as rank
    FROM scored_articles
)
SELECT * FROM ranked WHERE rank <= {TOP_LIMIT_PER_CAT}
"""
nodes_df = bq_client.query(nodes_query).to_dataframe()


# ==========================================
# 4. JSON GENERATION (Graph Construction)
# ==========================================
from itertools import combinations

final_nodes = []
final_links = []
seen_nodes = set()

# 4.1. Discipline nodes: one per taxonomy entry.
for code, info in TAXONOMY.items():
    final_nodes.append({
        "id": code,
        "label": info['name'],
        "type": "discipline",
        "description": info['description'],
        "cluster": code,
        "val": 30
    })
    seen_nodes.add(code)

# 4.2. Article nodes and their CONTAINS links.
article_authors = []  # (article id, set of author names) for step 4.4
for _, row in nodes_df.iterrows():
    article_id = row['id']
    # A repeated id would otherwise duplicate every CONTAINS link of the article.
    if article_id in seen_nodes:
        continue
    seen_nodes.add(article_id)

    # Array columns are converted to plain lists so they are JSON-serialisable.
    cats = list(row['categories'])
    auths = list(row['authors'])

    final_nodes.append({
        "id": article_id,
        "label": row['title'],
        "type": "article",
        "description": row['abstract'],
        "authors": auths,
        "cluster": row['primary_category'],
        "val": 5,
        "url": row['url']
    })
    article_authors.append((article_id, set(auths)))

    # CONTAINS links (discipline -> article), only for known disciplines.
    final_links.extend(
        {"source": c, "target": article_id, "type": "CONTAINS", "val": 1}
        for c in cats
        if c in TAXONOMY
    )

# 4.3. RELATED links (discipline <-> discipline) from the SQL co-occurrence table.
# Link strength is the weight scaled to 0..10 relative to the heaviest pair.
max_w = links_df['weight'].max() if not links_df.empty else 1
for _, row in links_df.iterrows():
    if row['source'] in TAXONOMY and row['target'] in TAXONOMY:
        weight = int(row['weight'])  # native int keeps json.dump independent of NumPy scalars
        final_links.append({
            "source": row['source'],
            "target": row['target'],
            "type": "RELATED",
            "label": f"{weight} shared articles",
            "val": float(weight / max_w) * 10
        })

# 4.4. DEPENDS links (article <-> article) for articles sharing an author.
# Done in Python on the selected top articles only. Author sets are built
# once above instead of once per pair.
for (id1, authors1), (id2, authors2) in combinations(article_authors, 2):
    common = sorted(authors1 & authors2)  # sorted -> identical label on every run
    if common:
        final_links.append({
            "source": id1,
            "target": id2,
            "type": "DEPENDS",
            "label": "Authors: " + ", ".join(common),
            "val": len(common) * 2
        })

# Save the graph.
output_data = {"nodes": final_nodes, "links": final_links}
with open("graph_data.json", 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"Done. Nodes: {len(final_nodes)}, Links: {len(final_links)}")

In [None]:
#####  Code for loading the article history for a whole year (2025)
import calendar
from datetime import date, timedelta
import time

# ==========================================
# SPECIAL SCRIPT: HISTORICAL LOAD (BACKFILL)
# ==========================================

# Backfill period (all of 2025); both the start and the end month are included
START_YEAR = 2025
START_MONTH = 1
END_YEAR = 2025
END_MONTH = 12

print(f"Starting historical backfill from {START_YEAR}-{START_MONTH} to {END_YEAR}-{END_MONTH}...")

# Generator of (year, month) pairs covering an inclusive month range
def get_month_range(start_year, start_month, end_year, end_month):
    """Yield (year, month) from the start month through the end month, inclusive.

    Yields nothing when the start lies after the end.
    """
    year, month = start_year, start_month
    # Tuple comparison orders by year first, then by month.
    while (year, month) <= (end_year, end_month):
        yield year, month
        month += 1
        if month == 13:
            # Rolled past December: continue with January of the next year.
            year, month = year + 1, 1

# Walk through every month of the backfill period: fetch from arXiv, stage
# the rows in BigQuery, MERGE them into the main table.
from datetime import timezone

# Loop-invariant BigQuery settings, built once.
temp_table_id = f"{PROJECT_ID}.{DATASET_ID}.temp_upload"

# Full explicit schema so the staging table always matches the row structure.
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("title", "STRING"),
        bigquery.SchemaField("abstract", "STRING"),
        bigquery.SchemaField("authors", "STRING", mode="REPEATED"),
        bigquery.SchemaField("categories", "STRING", mode="REPEATED"),
        bigquery.SchemaField("primary_category", "STRING"),
        bigquery.SchemaField("published_date", "DATE"),
        bigquery.SchemaField("updated_at", "TIMESTAMP"),
        bigquery.SchemaField("url", "STRING"),
    ],
    write_disposition="WRITE_TRUNCATE",
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
)

# MERGE: refresh title/categories/updated_at of known ids, insert new ids.
merge_query = f"""
MERGE `{FULL_TABLE_REF}` T
USING `{temp_table_id}` S
ON T.id = S.id
WHEN MATCHED THEN
  UPDATE SET
    T.title = S.title,
    T.categories = S.categories,
    T.updated_at = S.updated_at
WHEN NOT MATCHED THEN
  INSERT (id, title, abstract, authors, categories, primary_category, published_date, updated_at, url)
  VALUES (id, title, abstract, authors, categories, primary_category, S.published_date, S.updated_at, url)
"""

failed_months = []  # "YYYY-MM" labels of months that could not be fully processed

for year, month in get_month_range(START_YEAR, START_MONTH, END_YEAR, END_MONTH):
    month_label = f"{year}-{month:02d}"

    # 1. First and last day of the month (monthrange returns the day count).
    _, last_day = calendar.monthrange(year, month)

    # arXiv API date format: YYYYMMDDHHMM,
    # e.g. submittedDate:[202501010000 TO 202501312359]
    date_query = f"submittedDate:[{year}{month:02d}010000 TO {year}{month:02d}{last_day:02d}2359]"

    print(f"\nProcessing: {month_label} (Query: {date_query})")

    # 2. Build the search.
    search = arxiv.Search(
        query=f"cat:math.* AND {date_query}",
        max_results=10000,  # headroom; the months seen so far return roughly 3.6-4.2k articles
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    # 3. Download the articles.
    batch_rows = []
    fetch_ok = True
    try:
        results = list(arxiv_client.results(search))
        print(f"   Found {len(results)} articles.")

        # Naive ISO string in UTC: same format as before, but not local time.
        loaded_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        seen_ids = set()

        for r in results:
            # Keep only categories known to the taxonomy.
            cats = [c for c in r.categories if c in TAXONOMY]
            if not cats:
                continue

            # A repeated id in the staging table can make the MERGE fail
            # (several source rows matching one target row).
            article_id = r.entry_id.split('/')[-1]
            if article_id in seen_ids:
                continue
            seen_ids.add(article_id)

            batch_rows.append({
                "id": article_id,
                "title": r.title.replace('\n', ' '),
                "abstract": r.summary.replace('\n', ' '),
                "authors": [a.name for a in r.authors],
                "categories": cats,
                "primary_category": r.primary_category,
                "published_date": r.published.strftime("%Y-%m-%d"),
                "updated_at": loaded_at,
                "url": r.entry_id
            })

    except Exception as e:
        print(f"   Error fetching from arXiv: {e}")
        failed_months.append(month_label)
        fetch_ok = False

    # 4. Load into BigQuery (only after a successful fetch with rows to load).
    if not fetch_ok:
        pass  # nothing trustworthy to upload; fall through to the pause below
    elif batch_rows:
        # Bound before the try so the handler never reports a job left over
        # from a previous iteration.
        load_job = None
        try:
            load_job = bq_client.load_table_from_json(batch_rows, temp_table_id, job_config=job_config)
            load_job.result()

            bq_client.query(merge_query).result()
            print(f"   Uploaded {len(batch_rows)} articles to BigQuery.")

        except Exception as e:
            print(f"   BigQuery Error: {e}")
            failed_months.append(month_label)
            job_errors = getattr(load_job, 'errors', None)
            if job_errors:
                print(job_errors)
    else:
        print("   No articles to upload.")

    # 5. Pause between months so arXiv does not ban the IP. This also runs
    #    after a failed fetch, so errors do not turn into rapid-fire retries.
    print("   Sleeping 2 seconds...")
    time.sleep(2)

if failed_months:
    print(f"\nBackfill finished with errors in: {', '.join(failed_months)}")
else:
    print("\nFull backfill complete!")

Starting historical backfill from 2025-1 to 2025-12...

Processing: 2025-01 (Query: submittedDate:[202501010000 TO 202501312359])
   Found 3668 articles.
   Uploaded 3668 articles to BigQuery.
   Sleeping 2 seconds...

Processing: 2025-02 (Query: submittedDate:[202502010000 TO 202502282359])
   Found 3667 articles.
   Uploaded 3667 articles to BigQuery.
   Sleeping 2 seconds...

Processing: 2025-03 (Query: submittedDate:[202503010000 TO 202503312359])
   Found 4185 articles.
   Uploaded 4185 articles to BigQuery.
   Sleeping 2 seconds...

Processing: 2025-04 (Query: submittedDate:[202504010000 TO 202504302359])
   Found 4009 articles.
   Uploaded 4009 articles to BigQuery.
   Sleeping 2 seconds...

Processing: 2025-05 (Query: submittedDate:[202505010000 TO 202505312359])
   Found 4075 articles.
   Uploaded 4075 articles to BigQuery.
   Sleeping 2 seconds...

Processing: 2025-06 (Query: submittedDate:[202506010000 TO 202506302359])
