In [1]:
import requests
from bs4 import BeautifulSoup
import json

base_url = "http://www.goso.co.kr/bbs/board.php?bo_table=testDB&page="
start_page = 1
end_page = 35000
step = 20

for batch_start in range(start_page, end_page + 1, step):
    batch_end = min(batch_start + step - 1, end_page)
    result = []

    for page in range(batch_start, batch_end + 1):
        url = f"{base_url}{page}"
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for row in soup.select('tr'):
            category = row.select_one('.cate')
            title = row.select_one('.subject')
            datetime = row.select_one('.datetime')
            company = row.select_one('.company')
            num = row.select_one('.num')
            if category and title and '항공' in category.text:
                result.append(dict(
                    num=num.text.strip(),
                    title=title.text.strip(),
                    category=category.text.strip(),
                    datetime=datetime.text.strip() if datetime else "",
                    company=company.text.strip() if company else ""
                ))

    # Save JSON for the current batch
    if result:
        json_filename = f"prac_goso_{batch_start}_{batch_end}.json"
        with open(json_filename, 'w', encoding='utf-8') as json_file:
            json.dump(result, json_file, ensure_ascii=False)
        print(f"Saved {len(result)} records to {json_filename}")
    else:
        print(f"No data found for pages {batch_start} to {batch_end}")



Saved 11 records to prac_goso_2261_2280.json
Saved 16 records to prac_goso_2281_2300.json
Saved 24 records to prac_goso_2301_2320.json
Saved 10 records to prac_goso_2321_2340.json
Saved 12 records to prac_goso_2341_2360.json
Saved 11 records to prac_goso_2361_2380.json
Saved 22 records to prac_goso_2381_2400.json
Saved 11 records to prac_goso_2401_2420.json
Saved 12 records to prac_goso_2421_2440.json


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [1]:
import os
import json

# JSON 파일들이 저장된 디렉토리 경로
json_dir = './project'  # 현재 디렉토리 기준, 필요에 따라 수정
output_file = 'merged_goso.json'

# 모든 JSON 파일의 데이터를 병합할 리스트
merged_data = []

# 디렉토리 내 JSON 파일 목록 가져오기
json_files = [file for file in os.listdir(json_dir) if file.endswith('.json') and file.startswith('prac_goso_')]

for json_file in json_files:
    file_path = os.path.join(json_dir, json_file)
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
            merged_data.extend(data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from file {json_file}: {e}")

# 병합된 데이터를 하나의 JSON 파일로 저장
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(merged_data, f, ensure_ascii=False, indent=4)

print(f"Merged JSON file created: {output_file}")

Merged JSON file created: merged_goso.json
