In [None]:
import os
import csv
import re
import PyPDF2
from pathlib import Path

def extract_pdf_content(pdf_path):
    """
    Extract the text of every page of a PDF file and return it cleaned.

    The text of each page that yields any is terminated with a newline,
    the pieces are concatenated, and the result is passed through
    ``clean_text`` and stripped.

    Args:
        pdf_path: Location of the PDF file; it is opened in binary mode.

    Returns:
        The cleaned, stripped text. If any exception is raised while
        opening, parsing, or cleaning the document, the error is printed
        and an empty string is returned instead.
    """
    try:
        with open(pdf_path, 'rb') as pdf_stream:
            reader = PyPDF2.PdfReader(pdf_stream)

            # Collect the text of each page while the file is still open.
            page_texts = [page.extract_text() for page in reader.pages]

            # Pages without text are skipped; the rest end with a newline.
            raw_text = "".join(chunk + "\n" for chunk in page_texts if chunk)

            return clean_text(raw_text).strip()

    except Exception as e:
        print(f"讀取PDF檔案 {pdf_path} 時發生錯誤: {e}")
        return ""

def clean_text(text):
    """
    Normalize extracted text into a single, tidy line.

    Two passes are applied, in this order:

    1. Every character that is not a word character, whitespace, or one
       of the allowed ASCII / full-width punctuation marks is deleted.
    2. Each run of whitespace (including newlines) is collapsed to one
       space and leading/trailing whitespace is removed.

    Filtering happens first on purpose: deleting a character that sat
    between two spaces would otherwise leave a double space behind, and
    deleting one at either end would leave stray edge whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with no newlines, no consecutive spaces, and no
        leading or trailing whitespace.
    """
    # Whitelist filter: keep word characters, whitespace, and basic
    # punctuation; drop everything else (symbols, quotes, bullets, ...).
    text = re.sub(r'[^\w\s\u4e00-\u9fff.,;:!?()（），。；：！？\-]', '', text)

    # Collapse whitespace only after filtering so gaps opened up by the
    # removed characters are squeezed as well.
    return re.sub(r'\s+', ' ', text).strip()

def get_title_from_filename(filename):
    """
    Derive a title from a file name by dropping its last extension.

    Args:
        filename: A file name or path.

    Returns:
        The final path component without its last suffix.
    """
    file_path = Path(filename)
    return file_path.stem

def pdf_to_csv(data_pdf_folder, data_csv_folder):
    """
    Convert each ``*.pdf`` file directly inside a folder into its own CSV.

    For every PDF found, a CSV named after the PDF's title is written to
    the output folder. Each CSV holds a ``title``/``content`` header row
    followed by a single data row, with every field quoted and the file
    encoded as UTF-8.

    Args:
        data_pdf_folder: Folder searched (non-recursively) for PDF files.
        data_csv_folder: Folder receiving the CSV files; created if absent.

    Returns:
        None. Progress and write errors are reported with ``print``; a
        failed write does not stop the remaining files from being processed.
    """
    # Create the destination first so it exists even when nothing is found.
    os.makedirs(data_csv_folder, exist_ok=True)

    pdf_files = list(Path(data_pdf_folder).glob("*.pdf"))
    if not pdf_files:
        print(f"在 {data_pdf_folder} 中沒有找到PDF檔案")
        return

    print(f"找到 {len(pdf_files)} 個PDF檔案")

    for pdf_file in pdf_files:
        print(f"正在處理: {pdf_file.name}")

        content = extract_pdf_content(pdf_file)
        title = get_title_from_filename(pdf_file.name)

        # The CSV shares its base name with the source PDF.
        csv_filename = f"{title}.csv"
        destination = Path(data_csv_folder) / csv_filename
        rows = [['title', 'content'], [title, content]]

        try:
            with open(destination, 'w', newline='', encoding='utf-8') as out_file:
                # Header row first, then the single data row.
                csv.writer(out_file, quoting=csv.QUOTE_ALL).writerows(rows)

            print(f"成功轉換: {pdf_file.name} -> {csv_filename}")
        except Exception as e:
            print(f"寫入CSV檔案 {csv_filename} 時發生錯誤: {e}")

def main():
    """
    Run the PDF-to-CSV conversion with the script's fixed folder names.

    Reads PDFs from ``data_pdf_folder`` and writes CSVs to
    ``data_csv_folder``, both relative to the current working directory.
    If the input folder does not exist, a message is printed and nothing
    is converted.
    """
    data_pdf_folder = "data_pdf_folder"
    data_csv_folder = "data_csv_folder"

    if os.path.exists(data_pdf_folder):
        # Input folder is present: convert and report completion.
        pdf_to_csv(data_pdf_folder, data_csv_folder)
        print("轉換完成！")
    else:
        print(f"PDF資料夾 '{data_pdf_folder}' 不存在")

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()