In [2]:
import os
import shutil
import xml.etree.ElementTree as ET
import pandas as pd

# NOTE(review): this path has no leading '/', so it is resolved relative to the
# current working directory — confirm that this is intended.
xml_directory = 'Users/ykcmia/cllt_finalproject/NTU/2020_2023/'
output_directory = 'goal_xml'
os.makedirs(output_directory, exist_ok=True)


def _collect_rows(directory, element_tag, text_column, sentence_tag=None):
    """Build a DataFrame with one row per ``element_tag`` element found in the XML files.

    Args:
        directory: Folder whose ``*.xml`` files are parsed.
        element_tag: Tag of the elements that each become one row.
        text_column: Name of the column that receives the joined ``<w>`` text.
        sentence_tag: When given, ``<w>`` elements are collected through the
            ``sentence_tag`` descendants of each element; otherwise they are
            collected from the element directly.

    Returns:
        A DataFrame with the columns ``[text_column, 'Author']``. The columns
        are present even when no element was found.
    """
    rows = []
    # Sorted so the row order is reproducible; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(directory, filename)).getroot()
        for element in root.iter(element_tag):
            if sentence_tag is None:
                containers = [element]
            else:
                containers = element.iter(sentence_tag)
            text = ''.join(
                w.text
                for container in containers
                for w in container.iter('w')
                if w.text
            )
            rows.append({text_column: text,
                         'Author': element.attrib.get('author', '')})
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(rows, columns=[text_column, 'Author'])


# Delete the files produced by the previous run. Sub-directories are skipped
# because os.remove() cannot delete them.
for file_name in os.listdir(output_directory):
    file_path = os.path.join(output_directory, file_name)
    if os.path.isfile(file_path) or os.path.islink(file_path):
        os.remove(file_path)

# Copy every XML file from the source folder into the working folder.
for filename in os.listdir(xml_directory):
    if filename.endswith('.xml'):
        file_path = os.path.join(xml_directory, filename)
        output_path = os.path.join(output_directory, filename)
        shutil.copyfile(file_path, output_path)

# Remove a stale 'result.csv' if one exists (this cell itself never writes it).
previous_result_file = 'result.csv'
if os.path.isfile(previous_result_file):
    os.remove(previous_result_file)

# Export the DataFrame of all comments.
comment_data = _collect_rows(output_directory, 'comment', 'Comment')
comment_data.to_csv('comment.csv', index=False)

# Export the DataFrame of all post bodies.
content_data = _collect_rows(output_directory, 'body', 'Content', sentence_tag='s')
content_data.to_csv('content.csv', index=False)

# Merge the two tables side by side. Each frame is renamed BEFORE the concat:
# a single rename mapping cannot tell the two 'Author' columns apart, and a
# dict literal with a repeated 'Author' key keeps only the last entry.
merged_data = pd.concat(
    [
        content_data.rename(columns={'Content': 'content',
                                     'Author': 'content_author'}),
        comment_data.rename(columns={'Comment': 'comment',
                                     'Author': 'comment_author'}),
    ],
    axis=1,
)

# Save the merged data to a new CSV file.
merged_data.to_csv('merged_data.csv', index=False)

In [3]:
import csv


def merge_page_content(input_file, output_file, encoding='utf-8', column=1):
    """Write one column of a CSV file to a text file, one value per line.

    The first CSV row is treated as a header and skipped. Rows that are too
    short to contain ``column`` (including blank lines, which ``csv.reader``
    yields as empty lists) are ignored.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the text file to write (overwritten).
        encoding: Text encoding used for both files. Explicit so that
            non-ASCII content does not depend on the platform locale.
        column: Zero-based index of the CSV column to extract.
    """
    with open(input_file, 'r', newline='', encoding=encoding) as csv_file:
        reader = csv.reader(csv_file)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        # All values are read before the output is opened, so the input is
        # fully consumed before anything is written.
        merged_content = [row[column] for row in reader if len(row) > column]

    with open(output_file, 'w', encoding=encoding) as txt_file:
        txt_file.writelines(f"{content}\n" for content in merged_content)


In [4]:
# Source CSV file name and destination TXT file name for the export.
input_file, output_file = 'NTU_library.csv', 'NTU_library.txt'

# Extract the page-content column of the CSV into the text file.
merge_page_content(input_file=input_file, output_file=output_file)