In [None]:
import os
import json
import re
import wikipediaapi

In [None]:
def split_into_sentences(text):
    """
    Breaks a block of text into a list of sentences.

    The text is stripped of surrounding whitespace and then cut at every run
    of whitespace that directly follows a '.', '!' or '?'. Empty pieces are
    dropped, so an empty or whitespace-only input yields an empty list.
    This is a lightweight heuristic; nltk.sent_tokenize is a sturdier option.
    """
    trimmed = text.strip()
    # The lookbehind keeps the terminating punctuation attached to its sentence.
    pieces = re.split(r'(?<=[.!?])\s+', trimmed)
    return list(filter(None, pieces))

In [None]:
def clean_filename(filename):
    """
    Returns the given name with filename-unsafe characters stripped out.

    The characters \\ / * ? : " < > | are deleted (not replaced); every other
    character, including spaces, is kept as-is.
    """
    # Character class of symbols rejected by common file systems.
    forbidden = re.compile(r'[\\/*?:"<>|]')
    return forbidden.sub("", filename)

In [None]:
def get_page_text_lines(page):
    """
    Flattens a wikipediaapi Page object into a list of text lines.

    The result is built in this order:
    - the page title;
    - the summary (lead text), one sentence per line, if it is non-empty;
    - for every section, visited depth-first in document order: its title
      (if non-empty), its text split into sentences (if non-empty), and then
      the same for each of its subsections.

    A section whose title matches one of the ignored headings
    (case-insensitive, surrounding whitespace ignored) is skipped together
    with all of its subsections.

    Returns a list of lines.
    """
    # Headings, in lower case, that carry no article content.
    skipped_headings = {"see also", "references", "further reading", "external links", "notes", "citations", "bibliography"}

    def walk(sections):
        # Yield the lines of each section followed by those of its subsections.
        for node in sections:
            heading = node.title
            if heading.strip().lower() in skipped_headings:
                # Skipping here also drops every nested subsection.
                continue
            if heading:
                yield heading
            body = node.text
            if body:
                yield from split_into_sentences(body)
            yield from walk(node.sections)

    collected = [page.title]

    lead = page.summary
    if lead:
        collected.extend(split_into_sentences(lead))

    collected.extend(walk(page.sections))
    return collected

In [None]:
def main(topic="Law", json_filename="Article JSONs/Law_articles.json"):
    """
    Saves the text of every Wikipedia article listed in a JSON file.

    The JSON file is loaded and iterated; each entry is read with
    .get("title"). For every title the English Wikipedia page is fetched,
    converted to lines with get_page_text_lines, and written as UTF-8 to
    "<folder>/<cleaned title>.txt", where <folder> is the topic with spaces
    removed. Progress and problems are reported with print().

    Args:
        topic: Topic name; with spaces removed it becomes the output folder.
        json_filename: Path to the JSON file listing the articles.

    Entries without a title and pages that do not exist are skipped. A
    failure to write one file is reported and does not stop the run.
    Errors opening or parsing the JSON file are not caught here.
    """
    # Create a folder for the topic. Remove spaces to form a valid folder name.
    folder_name = topic.replace(" ", "")
    os.makedirs(folder_name, exist_ok=True)

    # Create a wikipediaapi object with a proper user-agent.
    wiki = wikipediaapi.Wikipedia(language='en', user_agent="UniCourseWikipediaBot (mehmetaltintas@etu.edu.tr)")

    # Load the JSON file with the articles.
    with open(json_filename, "r", encoding="utf-8") as f:
        articles = json.load(f)

    print(f"Found {len(articles)} articles in '{json_filename}'.")

    for article in articles:
        title = article.get("title")
        if not title:
            # A missing/empty title would otherwise reach wiki.page() and
            # clean_filename() as None and abort the whole run.
            print(f"Skipping entry without a title: {article!r}")
            continue
        print(f"Processing page: {title}")

        page = wiki.page(title)
        if not page.exists():
            print(f"Page '{title}' does not exist or could not be fetched.")
            continue

        # Extract content lines from the page.
        content_lines = get_page_text_lines(page)
        content = "\n".join(content_lines)

        # Create a safe filename for the article.
        safe_title = clean_filename(title)
        file_path = os.path.join(folder_name, f"{safe_title}.txt")

        try:
            with open(file_path, "w", encoding="utf-8") as f_out:
                f_out.write(content)
        except (OSError, UnicodeError) as e:
            # Best-effort: report the failed file and carry on with the rest.
            print(f"Error writing file '{file_path}': {e}")
        else:
            print(f"Saved content to '{file_path}'.")

In [None]:
# Entry point: run the article export.
main()