In [32]:
# Fallback category list in case parsing the arXiv site does not work.
# Maps each arXiv math category code (e.g. 'math.AG') to a dict holding its
# human-readable 'name' and a free-text 'description'.
TAXONOMY_R = {'math.AC': {'name': 'Commutative Algebra',
  'description': 'Commutative rings, modules, ideals, homological algebra, computational aspects, invariant theory, connections to algebraic geometry and combinatorics'},
 'math.AG': {'name': 'Algebraic Geometry',
  'description': 'Algebraic varieties, stacks, sheaves, schemes, moduli spaces, complex geometry, quantum cohomology'},
 'math.AP': {'name': 'Analysis of PDEs',
  'description': "Existence and uniqueness, boundary conditions, linear and non-linear operators, stability, soliton theory, integrable PDE's, conservation laws, qualitative dynamics"},
 'math.AT': {'name': 'Algebraic Topology',
  'description': 'Homotopy theory, homological algebra, algebraic treatments of manifolds'},
 'math.CA': {'name': 'Classical Analysis and ODEs',
  'description': "Special functions, orthogonal polynomials, harmonic analysis, ODE's, differential relations, calculus of variations, approximations, expansions, asymptotics"},
 'math.CO': {'name': 'Combinatorics',
  'description': 'Discrete mathematics, graph theory, enumeration, combinatorial optimization, Ramsey theory, combinatorial game theory'},
 'math.CT': {'name': 'Category Theory',
  'description': 'Enriched categories, topoi, abelian categories, monoidal categories, homological algebra'},
 'math.CV': {'name': 'Complex Variables',
  'description': 'Holomorphic functions, automorphic group actions and forms, pseudoconvexity, complex geometry, analytic spaces, analytic sheaves'},
 'math.DG': {'name': 'Differential Geometry',
  'description': 'Complex, contact, Riemannian, pseudo-Riemannian and Finsler geometry, relativity, gauge theory, global analysis'},
 'math.DS': {'name': 'Dynamical Systems',
  'description': 'Dynamics of differential equations and flows, mechanics, classical few-body problems, iterations, complex dynamics, delayed differential equations'},
 'math.FA': {'name': 'Functional Analysis',
  'description': 'Banach spaces, function spaces, real functions, integral transforms, theory of distributions, measure theory'},
 'math.GM': {'name': 'General Mathematics',
  'description': 'Mathematical material of general interest, topics not covered elsewhere'},
 'math.GN': {'name': 'General Topology',
  'description': 'Continuum theory, point-set topology, spaces with algebraic structure, foundations, dimension theory, local and global properties'},
 'math.GR': {'name': 'Group Theory',
  'description': 'Finite groups, topological groups, representation theory, cohomology, classification and structure'},
 'math.GT': {'name': 'Geometric Topology',
  'description': 'Manifolds, orbifolds, polyhedra, cell complexes, foliations, geometric structures'},
 'math.HO': {'name': 'History and Overview',
  'description': 'Biographies, philosophy of mathematics, mathematics education, recreational mathematics, communication of mathematics, ethics in mathematics'},
 'math.IT': {'name': 'Information Theory',
  'description': 'math.IT is an alias for cs.IT. Covers theoretical and experimental aspects of information theory and coding.'},
 'math.KT': {'name': 'K-Theory and Homology',
  'description': 'Algebraic and topological K-theory, relations with topology, commutative algebra, and operator algebras'},
 'math.LO': {'name': 'Logic',
  'description': 'Logic, set theory, point-set topology, formal mathematics'},
 'math.MG': {'name': 'Metric Geometry',
  'description': 'Euclidean, hyperbolic, discrete, convex, coarse geometry, comparisons in Riemannian geometry, symmetric spaces'},
 'math.MP': {'name': 'Mathematical Physics',
  'description': 'math.MP is an alias for math-ph. Articles in this category focus on areas of research that illustrate the application of mathematics to problems in physics, develop mathematical methods for such applications, or provide mathematically rigorous formulations of existing physical theories. Submissions to math-ph should be of interest to both physically oriented mathematicians and mathematically oriented physicists; submissions which are primarily of interest to theoretical physicists or to mathematicians should probably be directed to the respective physics/math categories'},
 'math.NA': {'name': 'Numerical Analysis',
  'description': 'Numerical algorithms for problems in analysis and algebra, scientific computation'},
 'math.NT': {'name': 'Number Theory',
  'description': 'Prime numbers, diophantine equations, analytic number theory, algebraic number theory, arithmetic geometry, Galois theory'},
 'math.OA': {'name': 'Operator Algebras',
  'description': 'Algebras of operators on Hilbert space, C^*-algebras, von Neumann algebras, non-commutative geometry'},
 'math.OC': {'name': 'Optimization and Control',
  'description': 'Operations research, linear programming, control theory, systems theory, optimal control, game theory'},
 'math.PR': {'name': 'Probability',
  'description': 'Theory and applications of probability and stochastic processes: e.g. central limit theorems, large deviations, stochastic differential equations, models from statistical mechanics, queuing theory'},
 'math.QA': {'name': 'Quantum Algebra',
  'description': 'Quantum groups, skein theories, operadic and diagrammatic algebra, quantum field theory'},
 'math.RA': {'name': 'Rings and Algebras',
  'description': 'Non-commutative rings and algebras, non-associative algebras, universal algebra and lattice theory, linear algebra, semigroups'},
 'math.RT': {'name': 'Representation Theory',
  'description': 'Linear representations of algebras and groups, Lie theory, associative algebras, multilinear algebra'},
 'math.SG': {'name': 'Symplectic Geometry',
  'description': 'Hamiltonian systems, symplectic flows, classical integrable systems'},
 'math.SP': {'name': 'Spectral Theory',
  'description': 'Schrodinger operators, operators on manifolds, general differential operators, numerical studies, integral operators, discrete models, resonances, non-self-adjoint operators, random operators/matrices'},
 'math.ST': {'name': 'Statistics Theory',
  'description': 'Applied, computational and theoretical statistics: e.g. statistical inference, regression, time series, multivariate analysis, data analysis, Markov chain Monte Carlo, design of experiments, case studies'}
                }

In [38]:
import arxiv
import pandas as pd
import json
import time
import requests
import re
from itertools import combinations
from collections import Counter
from bs4 import BeautifulSoup

# ==========================================
# ЧАСТЬ 1: ПОЛУЧЕНИЕ ТАКСОНОМИИ (С ОПИСАНИЯМИ)
# ==========================================

def get_arxiv_taxonomy_with_descriptions(section_filter='math'):
    """Scrape the Mathematics section of the arXiv category taxonomy page.

    Downloads https://arxiv.org/category_taxonomy, finds the ``<h2>`` heading
    matching "Mathematics", and reads every ``<h4>`` inside the following
    ``accordion-body`` block whose text looks like ``math.XX (Name)``. The
    description is taken from the first ``<p>`` of the sibling ``column`` div.

    Args:
        section_filter: Accepted for interface compatibility. It is currently
            not consulted: the function always parses the Mathematics section.

    Returns:
        A dict mapping category code (e.g. ``'math.AG'``) to
        ``{'name': ..., 'description': ...}``. Any error during download or
        parsing is printed and whatever was collected so far (possibly an
        empty dict) is returned, so the caller can fall back to a static list.
    """
    url = "https://arxiv.org/category_taxonomy"
    taxonomy = {}
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page into nothing.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the "Mathematics" heading
        math_header = soup.find('h2', string=re.compile(r'Mathematics', re.IGNORECASE))
        if not math_header:
            return taxonomy

        content_block = math_header.find_next_sibling('div', class_='accordion-body')
        if not content_block:
            return taxonomy

        for h4 in content_block.find_all('h4'):
            # Heading text has the form: math.AG (Algebraic Geometry)
            full_text = h4.get_text(separator=' ', strip=True)
            match = re.search(r'(math\.[A-Z]{2,})\s*\((.+)\)', full_text)
            if not match:
                continue

            code = match.group(1).strip()
            name = match.group(2).strip()

            # The description lives in the column next to the heading's column
            description = ""
            parent_col = h4.find_parent('div', class_='column')
            desc_col = parent_col.find_next_sibling('div', class_='column') if parent_col else None
            p_tag = desc_col.find('p') if desc_col else None
            if p_tag:
                description = p_tag.get_text(strip=True)

            taxonomy[code] = {'name': name, 'description': description}

    except Exception as e:
        # Deliberately best-effort: report the problem and return what we have.
        print(f"Error fetching taxonomy: {e}")

    return taxonomy

# Try the live taxonomy first.
print("1. Fetching taxonomy...")
TAXONOMY = get_arxiv_taxonomy_with_descriptions('math')

# Fallback (in case the site is down): a missing result or one with fewer
# than 5 entries is treated as a failed scrape.
if not (TAXONOMY and len(TAXONOMY) >= 5):
    print("Warning: Using fallback taxonomy.")
    TAXONOMY = TAXONOMY_R

# Every category code of the loaded taxonomy takes part in the analysis.
ALL_CATEGORIES = [*TAXONOMY]
print(f"Taxonomy loaded: {len(ALL_CATEGORIES)} disciplines.")


1. Fetching taxonomy...
Taxonomy loaded: 32 disciplines.


In [None]:
# ==========================================
# PART 2: LARGE-SCALE DATA COLLECTION (SCAN)
# ==========================================
# For every category in ALL_CATEGORIES, query arXiv for up to ANALYSIS_DEPTH
# articles and (1) count how often each pair of taxonomy categories is listed
# on the same article, (2) keep every article as a display candidate.

# Settings
ANALYSIS_DEPTH = 300       # How many articles to download per category for computing WEIGHTS
DISPLAY_LIMIT = 15         # How many articles per category to show in the graph

# Stores
global_pair_counter = Counter()  # (cat_a, cat_b) -> number of articles listing both categories
candidates_per_topic = {topic: [] for topic in ALL_CATEGORIES}  # Display candidates per category
seen_articles = set()  # Article ids already counted, so pairs are not counted twice

client = arxiv.Client()

print(f"2. Scanning data (Depth: {ANALYSIS_DEPTH} articles per category)...")
print("   This determines link weights and finds the best articles.")

for topic in ALL_CATEGORIES:
    # Request a large batch of articles for the statistics
    search = arxiv.Search(
        query=f"cat:{topic}",
        max_results=ANALYSIS_DEPTH,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    # Consume the result generator one item at a time: if the API fails part-way
    # through, the articles already fetched are kept, and a single failing
    # category no longer aborts the scan of all remaining categories.
    results = []
    try:
        for fetched in client.results(search):
            results.append(fetched)
    except (arxiv.ArxivError, requests.exceptions.RequestException) as e:
        print(f"   Warning: fetching {topic} stopped after {len(results)} articles: {e}")

    for r in results:
        article_id = r.entry_id.split('/')[-1]

        # Keep only the categories present in our taxonomy, so that a math
        # graph is not linked to e.g. physics categories.
        relevant_cats = sorted([c for c in r.categories if c in TAXONOMY])

        # 1. Statistics for LINK WEIGHTS (global weights).
        # Each article contributes to each of its category pairs only once,
        # even when it is returned for several category queries.
        if article_id not in seen_articles:
            if len(relevant_cats) > 1:
                for pair in combinations(relevant_cats, 2):
                    global_pair_counter[pair] += 1
            seen_articles.add(article_id)

        # 2. Data for display CANDIDATES.
        # Article score = number of taxonomy categories * 10: the more
        # categories an article connects, the more interesting it is for the graph.
        score = len(relevant_cats) * 10

        article_data = {
            'id': article_id,
            'title': r.title.replace('\n', ' '),
            'abstract': r.summary.replace('\n', ' '),
            'authors': [a.name for a in r.authors],
            'date': r.published.strftime("%Y-%m-%d"),
            'primary_category': r.primary_category,
            'categories': relevant_cats,
            'url': r.entry_id,
            'score': score  # Used for sorting
        }

        candidates_per_topic[topic].append(article_data)

    print(f"   Processed {topic}: found {len(results)} articles.")
    time.sleep(0.2)  # Be gentle with the API


2. Scanning data (Depth: 300 articles per category)...
   This determines link weights and finds the best articles.
   Processed math.AC: found 300 articles.
   Processed math.AG: found 300 articles.
   Processed math.AP: found 300 articles.
   Processed math.AT: found 300 articles.
   Processed math.CA: found 300 articles.
   Processed math.CO: found 300 articles.
   Processed math.CT: found 300 articles.
   Processed math.CV: found 300 articles.
   Processed math.DG: found 300 articles.


In [None]:
# ==========================================
# PART 3: RANKING AND SELECTION (TOP-15)
# ==========================================
# Builds the graph: one node per discipline, the top DISPLAY_LIMIT articles of
# every category as article nodes, CONTAINS links (discipline -> article) and
# RELATED links (discipline <-> discipline) from the co-occurrence counts.

print(f"3. Ranking and selecting Top-{DISPLAY_LIMIT} articles per category...")

final_nodes_dict = {}  # id -> node; keying by id prevents duplicate nodes
links = []

# --- 3.1 Discipline nodes (one per taxonomy entry) ---
for code, info in TAXONOMY.items():
    # A plain-string entry (old-style fallback) serves as the label itself,
    # with the code standing in for the description.
    is_legacy_entry = isinstance(info, str)
    name = info if is_legacy_entry else info.get('name', code)
    desc = code if is_legacy_entry else info.get('description', "")

    final_nodes_dict[code] = {
        "id": code,
        "label": name,
        "type": "discipline",
        "description": desc,
        "cluster": code,
        "val": 30,  # Disciplines are drawn large
    }

# --- 3.2 Pick the best ARTICLES ---
for topic in ALL_CATEGORIES:
    candidates = candidates_per_topic[topic]

    # Highest score first (most taxonomy categories), newest date breaking ties.
    candidates.sort(key=lambda entry: (entry['score'], entry['date']), reverse=True)

    for art in candidates[:DISPLAY_LIMIT]:
        art_id = art['id']
        art_cats = art['categories']

        # Create the article node only the first time the article is selected
        if art_id not in final_nodes_dict:
            # Cluster used for colouring: the primary category when it belongs
            # to the taxonomy, otherwise the article's first taxonomy category,
            # otherwise the category currently being processed.
            primary = art['primary_category']
            if primary in TAXONOMY:
                main_cluster = primary
            elif art_cats:
                main_cluster = art_cats[0]
            else:
                main_cluster = topic

            final_nodes_dict[art_id] = {
                "id": art_id,
                "label": art['title'],
                "type": "article",
                "description": art['abstract'],
                "authors": art['authors'],
                "cluster": main_cluster,
                "val": 5,
                "url": art['url'],
            }

        # CONTAINS links (discipline -> article) for every taxonomy category
        # of the article, not just the one being processed. Repeats are
        # filtered out in the de-duplication pass below.
        links.extend(
            {"source": cat, "target": art_id, "type": "CONTAINS", "val": 1}
            for cat in art_cats
        )

# --- 3.3 DISCIPLINE-DISCIPLINE links (based on the global pair counts) ---
if global_pair_counter:
    max_weight = max(global_pair_counter.values())
    print(f"   Max co-occurrence strength: {max_weight}")

    for pair, count in global_pair_counter.items():
        # Pairs seen fewer than 3 times are considered insignificant
        if count < 3:
            continue
        first_cat, second_cat = pair
        links.append({
            "source": first_cat,
            "target": second_cat,
            "type": "RELATED",
            "label": f"{count} shared articles (annual)",
            # Thickness relative to the strongest pair, which gets 10
            "val": (count / max_weight) * 10,
        })

# Drop duplicate links, keeping the first occurrence of each
# (source, target, type) combination in its original order.
unique_links_set = set()
final_links = []
for link in links:
    # String key identifying the link
    link_key = f"{link['source']}-{link['target']}-{link['type']}"
    if link_key in unique_links_set:
        continue
    unique_links_set.add(link_key)
    final_links.append(link)

nodes = [*final_nodes_dict.values()]



In [None]:
# ==========================================
# PART 4: SAVING
# ==========================================
# Writes the node and link lists to a JSON file and prints a short summary.

OUTPUT_FILE = "graph_data.json"
output_data = {
    "nodes": nodes,
    "links": final_links
}

# ensure_ascii=False keeps non-ASCII characters in titles/abstracts readable
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"DONE! Data saved to {OUTPUT_FILE}")
# The expected number of discipline nodes follows the loaded taxonomy instead
# of a hard-coded 32, so the hint stays correct if the taxonomy size changes.
print(f"Nodes: {len(nodes)} (Should be approx {len(ALL_CATEGORIES) * DISPLAY_LIMIT} + {len(ALL_CATEGORIES)})")
print(f"Links: {len(final_links)}")