# Лабораторная работа №1

Я выбрал для работы научные публикации, связанные с темой **Gambling**.

## Выгрузка данных

In [None]:
import requests
import xml.etree.ElementTree as ET
import json
import os
import time

# Base URL of the arXiv query endpoint; the query string is appended to it.
API = "http://export.arxiv.org/api/query"
# Search term inserted into the ``search_query=all:...`` parameter.
THEME = "gambling"
# Default path of the JSON file the collected rows are written to.
OUTPUT_FILE = "gambling_data.json"
# Target number of rows: the fetch loop keeps requesting pages while fewer
# than this many rows have been collected.
MAX_DATA_SIZE = 500

def get_gambling_data_xml(start=0, batch_size=100, timeout=30):
    """Request one page of search results for ``THEME`` and parse it as XML.

    Args:
        start: Offset of the first result to request (sent as ``start``).
        batch_size: Maximum number of results to request (sent as
            ``max_results``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The root ``xml.etree.ElementTree.Element`` parsed from the response
        body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    query = f"search_query=all:{THEME}&start={start}&max_results={batch_size}"
    url = f"{API}?{query}"

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the whole download would hang.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return ET.fromstring(response.content)

def read_xml(xml_root: ET.Element, output: list):
    """Convert every Atom ``entry`` under ``xml_root`` into a dict row.

    Each row holds the entry's ``title``, ``id``, ``published`` and
    ``summary`` texts plus an ``authors`` list with the text of each
    author's ``name`` element. Rows are appended to ``output`` in document
    order.

    Args:
        xml_root: Root element of a parsed Atom feed.
        output: List that receives the rows; it is modified in place.

    Returns:
        The number of rows appended by this call.
    """
    atom_ns = {"atom": "http://www.w3.org/2005/Atom"}

    def text_of(node, tag):
        # The child is expected to exist; a missing one fails the lookup.
        return node.find(tag, atom_ns).text

    appended = 0
    for appended, item in enumerate(xml_root.findall('atom:entry', atom_ns), start=1):
        author_nodes = item.findall('atom:author', atom_ns)
        output.append({
            "title": text_of(item, 'atom:title'),
            "id": text_of(item, 'atom:id'),
            "published": text_of(item, 'atom:published'),
            "summary": text_of(item, 'atom:summary'),
            "authors": [text_of(node, 'atom:name') for node in author_nodes],
        })
    return appended

def fetch_gambling_data(output_file=OUTPUT_FILE):
    """Download up to ``MAX_DATA_SIZE`` entries page by page and save them as JSON.

    Pages are requested with ``get_gambling_data_xml`` and converted to rows
    with ``read_xml``. Downloading stops when ``MAX_DATA_SIZE`` rows have been
    collected or when a page yields no rows. The collected rows are then
    written to ``output_file`` as an indented JSON array.

    Args:
        output_file: Path of the JSON file to write (overwritten if present).
    """
    batch_size = 100
    total_rows = 0
    data = []

    while total_rows < MAX_DATA_SIZE:
        # Ask only for what is still missing, so a short page earlier in the
        # run cannot push the total past MAX_DATA_SIZE.
        request_size = min(batch_size, MAX_DATA_SIZE - total_rows)
        print(f"Fetching data from {total_rows} to {total_rows + request_size}")
        xml_root = get_gambling_data_xml(total_rows, request_size)
        rows_added = read_xml(xml_root, data)
        if rows_added == 0:
            print("No more data found, exiting")
            break

        total_rows += rows_added
        print(f"Added {rows_added} rows to {output_file}, total rows: {total_rows}")
        # Brief pause between consecutive requests.
        time.sleep(1)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4)

# Run the download, writing to the default output file (OUTPUT_FILE).
fetch_gambling_data()

Fetching data from 0 to 100
Added 100 rows to gambling_data.json, total rows: 100
Fetching data from 100 to 200
Added 100 rows to gambling_data.json, total rows: 200
Fetching data from 200 to 300
Added 100 rows to gambling_data.json, total rows: 300
Fetching data from 300 to 400
Added 100 rows to gambling_data.json, total rows: 400
Fetching data from 400 to 500
Added 61 rows to gambling_data.json, total rows: 461
Fetching data from 461 to 561
