In [None]:
from docx import Document
import re
import pandas as pd

In [None]:
# Load the Word document
doc = Document("your_file.docx")  # Replace with the actual file name

# Extract all non-empty paragraphs (surrounding whitespace is stripped, and
# paragraphs that are empty after stripping are dropped)
paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]

# Print basic information about the document
print(f"Document has {len(paragraphs)} paragraphs")

# Save the first 50 paragraphs to a text file for manual inspection.
# The encoding is pinned to UTF-8: Word text routinely contains characters
# (curly quotes, dashes, ...) that the platform default encoding may be
# unable to write, which would raise UnicodeEncodeError.
with open("first_50_paragraphs.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(paragraphs[:50]))
print("First 50 paragraphs saved to 'first_50_paragraphs.txt'")

# Analyze content of articles
# This cell relies on `articles` and `df` produced by the extraction cell;
# the guard reports a clear message instead of raising NameError when the
# cell is run before they exist.
if 'articles' in globals() and 'df' in globals():
    # Not used below; kept so the `articles_df` name stays available to
    # other cells. NOTE(review): drop it if nothing else references it.
    articles_df = pd.DataFrame(articles)
    if len(articles) > 0:
        # Number of articles
        print(f"\nTotal articles found: {len(articles)}")

        # Years distribution (if 'year' column exists)
        if 'year' in df.columns:
            years = df['year'].value_counts().sort_index()
            print(f"\nArticles by year:\n{years}")
        else:
            print("\nNo 'year' column found in the DataFrame.")

        # Common topics
        from collections import Counter

        # Guard the 'headline' column like every other column in this cell.
        if 'headline' in df.columns:
            # Get all words from headlines. Missing values are dropped and
            # the rest coerced to str first: str.join raises TypeError on
            # NaN (e.g. when df was re-loaded from a CSV with blank cells).
            headlines = df['headline'].dropna().astype(str).str.lower()
            all_words = ' '.join(headlines).split()
            # Remove common stop words
            stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'to', 'for', 'in', 'on', 'with', 'of', 'by', 'as'}
            # Words of one or two characters are discarded as well
            filtered_words = [word for word in all_words if word not in stop_words and len(word) > 2]

            # Count and display common words
            word_counts = Counter(filtered_words)
            most_common = word_counts.most_common(15)
            print("\nMost common words in headlines:")
            for word, count in most_common:
                print(f"  {word}: {count}")
        else:
            print("\nNo 'headline' column found in the DataFrame.")

        # Length statistics (if 'body_length' and 'headline_length' columns exist)
        if 'body_length' in df.columns and 'headline_length' in df.columns:
            print(f"\nAverage body length: {df['body_length'].mean():.1f} characters")
            print(f"Average headline length: {df['headline_length'].mean():.1f} characters")
        else:
            print("\nLength statistics columns not found in the DataFrame.")

        # Article sources (if available); only the five most frequent are shown
        if 'source' in df.columns and df['source'].notna().any():
            sources = df['source'].value_counts().head(5)
            print(f"\nTop sources:\n{sources}")
else:
    print("Variables 'articles' and 'df' are not defined in the current scope.")

In [None]:
# Open the Word document that holds the exported articles
doc = Document("your_file.docx")  # Replace with your actual file path

# Keep the stripped text of every paragraph that is not blank
paragraphs = [
    text
    for text in (para.text.strip() for para in doc.paragraphs)
    if text
]

# Split articles using "End of Document" as a delimiter
def split_articles_by_marker(paragraphs):
    """Group paragraphs into articles delimited by "End of Document".

    Each paragraph is appended to the article currently being built; a
    paragraph containing the text "End of Document" closes that article
    (the marker paragraph itself stays in the article as its last element).
    Paragraphs left over after the final marker form one last article.

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        A list of articles, each a non-empty list of paragraph strings.
    """
    # Always keep one "open" article at the end of the list.
    grouped = [[]]
    for text in paragraphs:
        grouped[-1].append(text)
        if "End of Document" in text:
            grouped.append([])
    # Discard the open article if nothing was added to it.
    if not grouped[-1]:
        grouped.pop()
    return grouped

# Revised function to extract fields from a single article block
def extract_article_fields(article):
    """Extract structured fields from one article's paragraphs.

    Lines are routed as follows, checked in this order:

    * ``"Body"`` / ``"Notes"`` (exact match) switch collection to the body
      or notes section; the marker line itself is discarded.
    * A line containing ``"End of Document"`` is discarded and switches
      collection back to the header.
    * A line starting with ``Load-Date:``, ``Section:``, ``Length:`` or
      ``Byline:`` sets the corresponding field to the text after the label.
      Only the leading label is removed, so a value that happens to repeat
      the label text is preserved. A later line with the same label
      overwrites an earlier one.
    * A line starting with ``Copyright`` is added to the copyright field
      (multiple lines are joined with single spaces).
    * Any other line is collected into the current section.

    Note that labelled and copyright lines are recognised wherever they
    occur, including inside the body or notes.

    The first header line becomes the headline. The following header lines,
    up to the first one containing a date such as "November 18, 2024", are
    joined with spaces into the source; that date line becomes the date.
    Header lines after the date line are ignored.

    Args:
        article: Iterable of paragraph strings for a single article.

    Returns:
        A dict with the keys ``headline``, ``source``, ``date``,
        ``copyright``, ``section``, ``length``, ``byline``, ``body``,
        ``notes`` and ``load_date``. Every value is a string; fields that
        were not found are empty strings.
    """
    # Label prefix -> output field for single-line labelled metadata.
    labelled_fields = (
        ("Load-Date:", "load_date"),
        ("Section:", "section"),
        ("Length:", "length"),
        ("Byline:", "byline"),
    )
    # Matches dates such as "November 18, 2024".
    date_pattern = re.compile(r"[A-Za-z]+\s+\d{1,2},\s*\d{4}")

    fields = {
        "headline": "",
        "source": "",
        "date": "",
        "copyright": "",
        "section": "",
        "length": "",
        "byline": "",
        "body": "",
        "notes": "",
        "load_date": "",
    }

    header_lines = []
    body_lines = []
    note_lines = []
    copyright_lines = []

    # Unlabelled lines are appended to whichever section is current; it
    # starts as the header and is switched by the marker lines.
    current_section = header_lines

    for line in article:
        if line == "Body":
            current_section = body_lines
            continue
        if line == "Notes":
            current_section = note_lines
            continue
        if "End of Document" in line:
            current_section = header_lines
            continue

        for label, key in labelled_fields:
            if line.startswith(label):
                # Slice off the prefix only; str.replace would also delete
                # any later occurrence of the label inside the value.
                fields[key] = line[len(label):].strip()
                break
        else:
            if line.startswith("Copyright"):
                copyright_lines.append(line)
            else:
                current_section.append(line)

    # Joining (rather than appending "line + ' '") avoids a trailing space.
    fields["copyright"] = " ".join(copyright_lines)

    # Process the header: headline first, then source lines up to the date.
    if header_lines:
        fields["headline"] = header_lines[0]

        source_lines = []
        for line in header_lines[1:]:
            if date_pattern.search(line):
                fields["date"] = line
                break
            source_lines.append(line)
        fields["source"] = " ".join(source_lines)

    fields["body"] = "\n".join(body_lines)
    fields["notes"] = "\n".join(note_lines)

    return fields

# Split the paragraphs into per-article blocks, then parse each block
articles = split_articles_by_marker(paragraphs)
structured_data = list(map(extract_article_fields, articles))

# One row per article; write the table to disk without the index column
csv_path = "output_articles_revised.csv"
df = pd.DataFrame(structured_data)
df.to_csv(csv_path, index=False)

print(f"CSV saved to {csv_path}")