In [46]:
import os
from bs4 import BeautifulSoup
from cltk.lemmatize.lat import LatinBackoffLemmatizer
import stanza
import re
from tqdm import tqdm
import csv
from collections import Counter
import requests
import json

In [47]:
# Set working directory to project root.
# The root can be overridden through the CUSANUS_PROJECT_ROOT environment
# variable so the notebook also runs on machines other than the one it was
# written on; the original absolute path is kept as the default.
project_root = os.environ.get(
    'CUSANUS_PROJECT_ROOT',
    '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling',
)
os.chdir(project_root)
print("Current working directory: ", os.getcwd())

Current working directory:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [48]:
# Input/output locations, relative to the current working directory.
input_dir = 'data/raw'
output_dir = 'data/processed'
statistics_dir = 'data/statistics'

# Output files are derived from the directory names above so the two cannot
# drift apart; the resulting strings are the same as the previous literals.
json_output_path = f'{output_dir}/documents_data.json'
csv_statistics_path = f'{statistics_dir}/statistics_summary.csv'
csv_word_freq_path = f'{statistics_dir}/word_frequency_summary.csv'

# Create the output directories up front: the later cells open files inside
# them for writing and would fail on a fresh checkout where they do not exist.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(statistics_dir, exist_ok=True)

In [49]:
# Load Latin stopwords from URL
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as stopwords.
response = requests.get(stopwords_url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
# One stopword per non-blank line; lines starting with '#' are skipped in case
# the list carries comment/header lines, which are not stopwords.
latin_stopwords = {
    line.strip()
    for line in response.text.splitlines()
    if line.strip() and not line.lstrip().startswith('#')
}

# Add additional stopwords
additional_stopwords = {
    'ego', 'mei', 'mihi', 'me', 'tu', 'tui', 'tibi', 'te',
    'nos', 'noster', 'nobis', 'vos', 'vester',
    'sui', 'sibi', 'se',
    'ab', 'ex', 'ad', 'in', 'de', 'per', 'cum', 'sub', 'pro',
    'ante', 'post', 'supra', 'et', 'ac', 'aut', 'nec', 'sed',
    'ut', 'si', 'atque', 'qui', 'quae', 'quod', 'quis', 'quid', 'non', 'ne'
}
latin_stopwords.update(additional_stopwords)

In [50]:
# CLTK backoff lemmatizer for Latin; process_file() uses it to lemmatize tokens.
lemmatizer = LatinBackoffLemmatizer()

# Stanza: download the Latin ('la') models, then build the Latin pipeline
# that pos_tag_text() calls for POS tagging.
stanza.download(lang='la')
nlp = stanza.Pipeline(lang='la')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 27.6MB/s]                    
2024-11-20 23:57:16 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-20 23:57:16 INFO: Downloading default packages for language: la (Latin) ...
2024-11-20 23:57:16 INFO: File exists: /Users/jessie/stanza_resources/la/default.zip
2024-11-20 23:57:18 INFO: Finished downloading models and saved to /Users/jessie/stanza_resources
2024-11-20 23:57:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 26.9MB/s]                    
2024-11-20 23:57:18 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-20 23:57:18 INFO: Loading these model

In [51]:
# Stopword filtering
def preprocess_text(words):
    """Return the entries of ``words`` that are not Latin stopwords.

    Each word is lowercased before being looked up in ``latin_stopwords``;
    the words that are kept are returned unchanged and in their original order.
    """
    kept = []
    for token in words:
        if token.lower() in latin_stopwords:
            continue
        kept.append(token)
    return kept

In [52]:
def process_file(file_path):
    """Extract lemmatized, stopword-filtered text sections from one XML file.

    A section starts at every ``<fw type="n">`` element and consists of the
    sibling elements that follow it, up to (not including) the next ``<fw>``
    element of any type. Only ``<p>`` siblings contribute text. For each
    ``<w>`` inside such a paragraph the ``rend`` attribute is used when
    present, otherwise the element text.

    Args:
        file_path: Path of the XML file to read (decoded as UTF-8).

    Returns:
        A list with one dict per section:
        ``{'content': <space-joined lemmas>, 'topic': None}``.
        Sections without any ``<p>`` sibling yield an empty ``content``.
    """
    # Parse while the file is open, then release the handle before the slow
    # per-word lemmatization starts.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    paragraphs = []

    # Iterate through all `fw` tags to segment sections
    for fw_tag in soup.find_all('fw', {'type': 'n'}):
        section_content = []
        next_sibling = fw_tag.find_next_sibling()
        while next_sibling and next_sibling.name != 'fw':
            if next_sibling.name == 'p':
                words = []
                for w in next_sibling.find_all('w'):
                    # Use the value of the `rend` attribute if it exists.
                    # Surrounding whitespace is stripped so it cannot leak
                    # into the lemmatizer lookup or the joined output.
                    original_word = w.get('rend', w.get_text()).strip().lower()

                    # Nothing to lemmatize for an empty token.
                    if not original_word:
                        continue

                    # Handle German words: tokens with ä/ö/ü/ß are dropped
                    if re.search(r'[äöüß]', original_word):
                        continue

                    # Handle 'cum' variants with numbers/special symbols
                    if re.match(r'^cum\W*\d*$', original_word):
                        continue

                    # Lemmatize with CLTK; fall back to the token itself when
                    # the lemmatizer returns an empty lemma.
                    lemma = lemmatizer.lemmatize([original_word])[0][1]
                    if not lemma:
                        lemma = original_word

                    words.append(lemma)

                # Filter stopwords and add paragraph to section content
                filtered_words = preprocess_text(words)
                section_content.append(' '.join(filtered_words))
            next_sibling = next_sibling.find_next_sibling()
        paragraphs.append({'content': ' '.join(section_content), 'topic': None})

    # Return sectioned paragraphs
    return paragraphs

In [53]:
# Save sections to JSON format
def save_to_json(output_path, paragraphs, document_id):
    """Write a single document's paragraphs to ``output_path`` as JSON.

    The file contains ``{"documents": [{"document_id": ..., "paragraphs": ...}]}``,
    encoded as UTF-8, indented by four spaces, with non-ASCII characters
    written as-is rather than escaped.
    """
    document_entry = {
        "document_id": document_id,
        "paragraphs": paragraphs,
    }
    payload = {"documents": [document_entry]}
    with open(output_path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=4)


In [54]:
# Perform POS tagging using Stanza

def pos_tag_text(text):
    """Tag ``text`` with the Stanza pipeline ``nlp``.

    Args:
        text: The text to tag.

    Returns:
        A list of ``(word text, UPOS tag)`` tuples, one per word, in
        sentence order. Empty or whitespace-only input returns ``[]``
        without invoking the pipeline.
    """
    # Sections without any paragraph produce empty content; there is nothing
    # to tag, so skip the model call entirely.
    if not text or not text.strip():
        return []
    doc = nlp(text)
    pos_tags = [(word.text, word.upos) for sentence in doc.sentences for word in sentence.words]
    return pos_tags

In [55]:
# Per-document word and POS statistics
def calculate_statistics(paragraphs):
    """Count words, distinct words and POS tags over ``paragraphs``.

    Args:
        paragraphs: Iterable of dicts whose ``'content'`` value is a
            whitespace-separated string.

    Returns:
        A tuple ``(total_words, total_types, word_freq, pos_freq)`` where
        ``total_words`` is the number of whitespace-separated words,
        ``total_types`` the number of distinct words, ``word_freq`` a dict
        mapping word to count, and ``pos_freq`` a dict mapping POS tag to
        count as returned by ``pos_tag_text``.
    """
    word_counter = Counter()
    pos_counter = Counter()
    total_words = 0

    # The progress bar tracks the POS tagging, which dominates the runtime.
    for paragraph in tqdm(paragraphs, desc="POS标注进度"):
        content = paragraph['content']
        tokens = content.split()
        total_words += len(tokens)
        word_counter.update(tokens)
        pos_counter.update(tag for _, tag in pos_tag_text(content))

    # Every distinct word is a key of the frequency table.
    total_types = len(word_counter)

    return total_words, total_types, dict(word_counter), dict(pos_counter)

In [56]:
# Number of input files attempted and number that raised an error.
processing_summary = dict(total_files=0, failed_files=0)

# Corpus-wide accumulators: processed documents, merged word and POS
# frequencies, running word total and the set of distinct words.
all_documents, all_word_freq, all_pos_freq = [], {}, {}
total_words_all, total_types_all = 0, set()


In [58]:
# Reset the corpus-wide accumulators so that re-running this cell starts from
# a clean state instead of adding to the results of a previous run.
processing_summary = {'total_files': 0, 'failed_files': 0}
all_documents = []
all_word_freq = {}
all_pos_freq = {}
total_words_all = 0
total_types_all = set()

# One CSV row per successfully analysed document, captured during the single
# POS-tagging pass below so the export does not have to tag everything again.
document_statistics = []

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# documents in the outputs reproducible.
for file_name in tqdm(sorted(os.listdir(input_dir)), desc="处理文件"):
    if file_name.endswith('.xml') and file_name.startswith('v'):
        input_path = os.path.join(input_dir, file_name)
        processing_summary['total_files'] += 1

        try:
            # Extract the lemmatized sections of this file
            paragraphs = process_file(input_path)

            # Record the document for the combined JSON export
            document_data = {
                "document_id": file_name,
                "paragraphs": paragraphs
            }
            all_documents.append(document_data)

            # Write the processed text to a .txt file; only the extension is
            # swapped, other occurrences of '.xml' in the name are untouched.
            output_file_name = os.path.splitext(file_name)[0] + '.txt'
            output_path = os.path.join(output_dir, output_file_name)
            with open(output_path, 'w', encoding='utf-8') as output_file:
                for idx, paragraph in enumerate(paragraphs, start=1):
                    output_file.write(f"Paragraph {idx}:\n")
                    output_file.write(f"{paragraph['content']}\n\n")

            # Compute this document's statistics (includes POS tagging)
            total_words, total_types, word_freq, pos_freq = calculate_statistics(paragraphs)
            total_words_all += total_words
            total_types_all.update(word_freq.keys())

            # Merge word and POS frequencies into the corpus-wide tables
            for word, freq in word_freq.items():
                all_word_freq[word] = all_word_freq.get(word, 0) + freq
            for pos, freq in pos_freq.items():
                all_pos_freq[pos] = all_pos_freq.get(pos, 0) + freq

            # Keep the per-document row for the statistics CSV; the POS
            # frequencies are stored as a JSON string in a single column.
            document_statistics.append([
                file_name,
                len(paragraphs),
                total_words,
                total_types,
                json.dumps(pos_freq)
            ])

        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"处理文件 {file_name} 时发生错误: {str(e)}")
            processing_summary['failed_files'] += 1

# Save the preprocessing results of all documents to JSON
with open(json_output_path, 'w', encoding='utf-8') as json_file:
    json.dump({
        "processing_summary": processing_summary,
        "documents": all_documents
    }, json_file, ensure_ascii=False, indent=4)

# Save the statistics to a CSV file
with open(csv_statistics_path, 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([
        'Document ID',
        'Total Paragraphs',
        'Total Words',
        'Total Types',
        'POS Frequencies'
    ])

    # Corpus-wide totals
    writer.writerow([
        'ALL DOCUMENTS',
        '',  # No specific paragraph count for all documents
        total_words_all,
        len(total_types_all),
        json.dumps(all_pos_freq)
    ])

    # Per-document rows, reused from the processing loop above
    writer.writerows(document_statistics)

# Save the corpus-wide word frequencies and POS tag frequencies to a separate CSV
with open(csv_word_freq_path, 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Word', 'Frequency'])
    for word, freq in all_word_freq.items():
        writer.writerow([word, freq])
    writer.writerow([])  # Blank row separating the two tables
    writer.writerow(['POS Tag', 'Frequency'])
    for pos, freq in all_pos_freq.items():
        writer.writerow([pos, freq])

POS标注进度: 100%|██████████| 31/31 [00:03<00:00, 10.24it/s]
POS标注进度: 100%|██████████| 15/15 [00:01<00:00, 13.62it/s]
POS标注进度: 100%|██████████| 12/12 [00:00<00:00, 12.42it/s]
POS标注进度: 100%|██████████| 23/23 [00:04<00:00,  5.14it/s]
POS标注进度: 100%|██████████| 50/50 [00:06<00:00,  8.24it/s]
POS标注进度: 100%|██████████| 31/31 [00:02<00:00, 10.37it/s]
POS标注进度: 100%|██████████| 3/3 [00:00<00:00, 24.58it/s]
POS标注进度: 100%|██████████| 5/5 [00:00<00:00, 13.23it/s]
POS标注进度: 100%|██████████| 24/24 [00:02<00:00,  9.34it/s]
POS标注进度: 100%|██████████| 59/59 [00:05<00:00, 11.48it/s]
POS标注进度: 100%|██████████| 25/25 [00:02<00:00, 12.49it/s]
POS标注进度: 100%|██████████| 12/12 [00:01<00:00,  7.08it/s]
POS标注进度: 100%|██████████| 5/5 [00:00<00:00, 15.53it/s]
POS标注进度: 100%|██████████| 15/15 [00:01<00:00, 13.25it/s]
POS标注进度: 100%|██████████| 31/31 [00:02<00:00, 11.22it/s]
POS标注进度: 100%|██████████| 10/10 [00:00<00:00, 12.13it/s]
POS标注进度: 100%|██████████| 5/5 [00:00<00:00, 10.85it/s]
POS标注进度: 100%|██████████| 26/26 [00:02<