In [9]:
%pip install xmltodict bs4

Note: you may need to restart the kernel to use updated packages.


In [11]:
import re
import json
from bs4 import BeautifulSoup

def clean_text(text):
    """Strip markup from *text* and normalise its whitespace.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and only
    its text is kept. Every run of whitespace is then collapsed into a single
    space and leading/trailing whitespace is removed.

    Args:
        text: Markup string to clean, or ``None``.

    Returns:
        The cleaned plain text; an empty string when *text* is ``None``.
    """
    if text is None:
        return ""

    # Parse as HTML and keep only the text content.
    plain = BeautifulSoup(text, 'html.parser').get_text()
    # Collapse whitespace runs first, then trim both ends.
    return re.sub(r'\s+', ' ', plain).strip()

def extract_tag_content(text, tag_name):
    """Return the raw content of the first ``<tag_name>...</tag_name>`` pair.

    This is a plain regular-expression lookup, not an XML parse: the opening
    tag must be written exactly as ``<tag_name>`` (no attributes), the match
    is non-greedy, and the captured text is returned untouched (no entity
    decoding, no stripping).

    Args:
        text: String to search.
        tag_name: Tag name to look for; it is matched literally.

    Returns:
        The text between the first matching open/close tag pair (may span
        several lines), or ``''`` when no such pair exists.
    """
    # Escape the name so regex metacharacters that are legal in XML names
    # (such as ".") match themselves instead of acting as wildcards.
    tag = re.escape(tag_name)
    # DOTALL lets the captured content contain newlines.
    match = re.search(f'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
    return match.group(1) if match else ''

# Output key, source tag inside <item>, and whether the extracted value is
# passed through clean_text() before being stored. Order defines the key order
# of every record in the JSON output.
_BOOK_FIELDS = (
    ('category_code', 'drCode', False),
    ('category_name', 'drCodeName', False),
    ('title', 'recomtitle', False),
    ('author', 'recomauthor', False),
    ('publisher', 'recompublisher', False),
    ('isbn', 'recomisbn', False),
    ('contents', 'recomcontens', True),
    ('table_of_contents', 'recommokcha', True),
    ('publish_year', 'publishYear', False),
    ('recommend_year', 'recomYear', False),
    ('recommend_month', 'recomMonth', False),
)


def process_xml_file(input_path='국립중앙도서관_사서추천도서.xml',
                     output_path='library_books.json'):
    """Convert the ``<item>`` records of an XML file into a JSON list.

    The file is read as UTF-8 text and scanned with regular expressions for
    ``<item>...</item>`` blocks. For each block one dict is built from the
    tags listed in ``_BOOK_FIELDS``; a missing tag yields an empty string.
    All records are then written to *output_path* as pretty-printed UTF-8
    JSON with non-ASCII characters kept as-is.

    Args:
        input_path: Path of the XML file to read. Defaults to the file name
            this function originally had hard-coded.
        output_path: Path of the JSON file to write (overwritten if present).

    Returns:
        The number of records written.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        content = file.read()

    books = []
    # Locate each item block; DOTALL because a block spans multiple lines.
    for item in re.findall(r'<item>(.*?)</item>', content, re.DOTALL):
        try:
            book = {}
            for key, tag, is_markup in _BOOK_FIELDS:
                value = extract_tag_content(item, tag)
                book[key] = clean_text(value) if is_markup else value
        except Exception as e:
            # Best effort: report the bad item and keep converting the rest.
            print(f"Error processing item: {e}")
            continue
        books.append(book)

    # Save as JSON.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(books, f, ensure_ascii=False, indent=2)

    return len(books)

# Run the conversion and report either the number of converted records or the
# error that stopped it.
try:
    count = process_xml_file()
except Exception as e:
    print(f"변환 중 오류 발생: {e}")
else:
    print(f"성공적으로 {count}개의 도서 정보를 변환했습니다.")

성공적으로 1388개의 도서 정보를 변환했습니다.
