In [None]:
import os
import re
from collections import Counter

import pandas as pd
from docx import Document

In [None]:
# Optionally, load one Word document for testing
doc = Document("your_file.docx")  # Replace with the actual file name

# Extract all non-empty paragraphs
paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]

# Print basic information about the document
print(f"Document has {len(paragraphs)} paragraphs")

# Save the first 50 paragraphs to a text file. The encoding is explicit so
# non-ASCII article text is written the same way on every platform instead of
# depending on the locale's default codec.
with open("first_50_paragraphs.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(paragraphs[:50]))
print("First 50 paragraphs saved to 'first_50_paragraphs.txt'")

# Analyze content of articles.
# `articles` and `df` are created by a later cell, so this section only runs
# when that cell has already been executed in the current kernel.
if 'articles' in globals() and 'df' in globals():
    articles_df = pd.DataFrame(articles)
    if len(articles) > 0:
        # Number of articles
        print(f"\nTotal articles found: {len(articles)}")

        # Years distribution (if 'year' column exists)
        if 'year' in df.columns:
            years = df['year'].value_counts().sort_index()
            print(f"\nArticles by year:\n{years}")
        else:
            print("\nNo 'year' column found in the DataFrame.")

        # Common topics: most frequent headline words (if 'headline' column exists)
        if 'headline' in df.columns:
            # Drop missing headlines first; joining NaN values would raise TypeError.
            headlines = df['headline'].dropna().astype(str).str.lower()
            all_words = ' '.join(headlines).split()
            # Remove common stop words and very short tokens
            stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'to', 'for', 'in', 'on', 'with', 'of', 'by', 'as'}
            filtered_words = [word for word in all_words if word not in stop_words and len(word) > 2]

            # Count and display common words
            word_counts = Counter(filtered_words)
            most_common = word_counts.most_common(15)
            print("\nMost common words in headlines:")
            for word, count in most_common:
                print(f"  {word}: {count}")
        else:
            print("\nNo 'headline' column found in the DataFrame.")

        # Length statistics (if 'body_length' and 'headline_length' columns exist)
        if 'body_length' in df.columns and 'headline_length' in df.columns:
            print(f"\nAverage body length: {df['body_length'].mean():.1f} characters")
            print(f"Average headline length: {df['headline_length'].mean():.1f} characters")
        else:
            print("\nLength statistics columns not found in the DataFrame.")

        # Article sources (if available)
        if 'source' in df.columns and df['source'].notna().any():
            sources = df['source'].value_counts().head(5)
            print(f"\nTop sources:\n{sources}")
else:
    print("Variables 'articles' and 'df' are not defined in the current scope.")

In [None]:
# Open the exported Word file whose articles are parsed in this cell.
doc = Document("your_file.docx")  # Replace with your actual file path

# Trim every paragraph and keep only those that still contain text.
paragraphs = list(filter(None, (para.text.strip() for para in doc.paragraphs)))

# Split articles using "End of Document" as a delimiter
def split_articles_by_marker(paragraphs):
    """Group paragraphs into articles, closing a group at each end marker.

    A paragraph containing "End of Document" is kept as the last element of
    its group. Paragraphs after the final marker form one trailing group.

    Args:
        paragraphs: iterable of paragraph strings.

    Returns:
        A list of lists of paragraph strings, one inner list per article.
    """
    grouped = [[]]
    for text in paragraphs:
        grouped[-1].append(text)
        if "End of Document" in text:
            # Marker reached: start collecting the next article.
            grouped.append([])
    # The last group is empty when the input was empty or ended on a marker.
    if not grouped[-1]:
        grouped.pop()
    return grouped

# Revised function to extract fields from a single article block
def extract_article_fields(article):
    """Parse one article's paragraphs into a flat dict of metadata and text.

    Paragraphs are routed in this order of precedence:
      * "Body" / "Notes" (exact match) switch collection to the body / notes;
      * a paragraph containing "End of Document" closes both sections;
      * paragraphs starting with "Load-Date:", "Section:", "Length:" or
        "Byline:" fill the matching field wherever they occur (the last
        occurrence wins);
      * paragraphs starting with "Copyright" are gathered into "copyright";
      * anything else goes to the open section, or to the header when no
        section is open.

    The first header paragraph is the headline. The following header
    paragraphs are joined into "source" until the first one containing a
    date such as "November 18, 2024", which becomes "date"; header paragraphs
    after that one are ignored.

    Args:
        article: iterable of paragraph strings belonging to one article.

    Returns:
        dict with keys headline, source, date, copyright, section, length,
        byline, body, notes and load_date. Every value is a string, "" when
        the article does not provide it.
    """
    # Leading label -> output field for single-paragraph metadata entries.
    labelled_fields = {
        "Load-Date:": "load_date",
        "Section:": "section",
        "Length:": "length",
        "Byline:": "byline",
    }

    fields = {
        "headline": "",
        "source": "",
        "date": "",
        "copyright": "",
        "section": "",
        "length": "",
        "byline": "",
        "body": "",
        "notes": "",
        "load_date": ""
    }

    header_lines = []
    body_lines = []
    note_lines = []
    copyright_lines = []

    # List that currently receives unlabelled paragraphs.
    target = header_lines

    for line in article:
        # Section markers
        if line == "Body":
            target = body_lines
            continue
        if line == "Notes":
            target = note_lines
            continue
        if "End of Document" in line:
            target = header_lines
            continue

        label = next((p for p in labelled_fields if line.startswith(p)), None)
        if label is not None:
            # Slice off only the leading label; str.replace would also delete
            # any later occurrence of the same text inside the value.
            fields[labelled_fields[label]] = line[len(label):].strip()
            continue

        if line.startswith("Copyright"):
            copyright_lines.append(line)
            continue

        target.append(line)

    # Process header_lines: headline first, then source lines up to the date.
    if header_lines:
        fields["headline"] = header_lines[0]
        source_lines = []
        for line in header_lines[1:]:
            # Date pattern, e.g. "November 18, 2024"
            if re.search(r"[A-Za-z]+\s+\d{1,2},\s*\d{4}", line):
                fields["date"] = line
                break
            source_lines.append(line)
        fields["source"] = " ".join(source_lines)

    # Joining (instead of appending "line + ' '") leaves no trailing space.
    fields["copyright"] = " ".join(copyright_lines)
    fields["body"] = "\n".join(body_lines)
    fields["notes"] = "\n".join(note_lines)

    return fields

# Group the paragraphs into articles, then parse each group into one record.
articles = split_articles_by_marker(paragraphs)
structured_data = list(map(extract_article_fields, articles))

# One row per article, written without the index column.
csv_path = "output_articles_revised.csv"
df = pd.DataFrame(structured_data)
df.to_csv(csv_path, index=False)

print(f"CSV saved to {csv_path}")

In [None]:
import base64
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Gmail API scope requested during OAuth: read-only access to messages.
# Passed to both the cached-token loader and the consent flow in authenticate().
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# Folder where attachments will be saved. It is created as soon as this cell
# runs; exist_ok=True keeps re-running the cell from failing when it exists.
ATTACH_DIR = 'gmail_attachments'
os.makedirs(ATTACH_DIR, exist_ok=True)

def authenticate(token_path='token.json', credentials_path='credentials.json'):
    """Authenticate and return the Gmail API service.

    Cached user credentials are loaded from ``token_path`` when that file
    exists. If no valid credentials are available they are refreshed (when
    expired and a refresh token is present) or obtained by running the
    installed-app OAuth flow from ``credentials_path`` on a local server.
    Refreshed or newly obtained credentials are written back to
    ``token_path`` so later runs can reuse them.

    Args:
        token_path: file used to cache the authorized user's credentials.
        credentials_path: OAuth client-secrets file used by the consent flow.

    Returns:
        The object returned by ``build('gmail', 'v1', credentials=creds)``.
    """
    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
            creds = flow.run_local_server(port=0)
        # Persist the credentials; the encoding is pinned so the cache file
        # does not depend on the platform's default codec.
        with open(token_path, 'w', encoding='utf-8') as token:
            token.write(creds.to_json())
    return build('gmail', 'v1', credentials=creds)

def save_attachment(service, msg_id, attach_dir=None):
    """Download and save attachments from a message, with unique filenames.

    The message payload and all of its nested MIME parts are scanned; every
    part that has a filename and an ``attachmentId`` is fetched and written to
    ``attach_dir`` as ``<name>__<msg_id>__<n><ext>``, where ``n`` counts the
    attachments saved from this message.

    Errors are reported with ``print`` and never raised, so one bad message
    does not stop a batch download.

    Args:
        service: Gmail API service object (as returned by ``authenticate``).
        msg_id: id of the message whose attachments are downloaded.
        attach_dir: destination folder; defaults to the module-level
            ``ATTACH_DIR`` when omitted.

    Returns:
        None.
    """
    if attach_dir is None:
        attach_dir = ATTACH_DIR

    def iter_parts(part):
        """Yield ``part`` and then every nested part, depth-first."""
        yield part
        for child in part.get('parts', []):
            yield from iter_parts(child)

    try:
        msg = service.users().messages().get(userId='me', id=msg_id).execute()
        payload = msg.get('payload', {})
        attachment_count = 0

        # Attachments can sit on the payload itself or inside nested
        # multipart containers, not only in the first level of 'parts'.
        for part in iter_parts(payload):
            filename = part.get('filename')
            body = part.get('body', {})
            if not (filename and 'attachmentId' in body):
                continue

            att_id = body['attachmentId']
            attachment = service.users().messages().attachments().get(
                userId='me', messageId=msg_id, id=att_id).execute()
            data = attachment.get('data')
            if not data:
                # Nothing to decode; keep going with the remaining parts.
                print(f"Skipped (no data returned): {filename}")
                continue
            # base64url payloads may arrive without '=' padding, which the
            # decoder requires; re-add exactly the missing amount.
            file_data = base64.urlsafe_b64decode(data + '=' * (-len(data) % 4))

            # The filename is chosen by the sender: keep only its final path
            # component so it cannot point outside attach_dir.
            safe_name = os.path.basename(filename.replace('\\', '/')) or 'attachment'

            # Ensure unique filename
            name, ext = os.path.splitext(safe_name)
            safe_msg_id = msg_id.replace('/', '_')
            unique_name = f"{name}__{safe_msg_id}__{attachment_count}{ext}"
            file_path = os.path.join(attach_dir, unique_name)

            with open(file_path, 'wb') as f:
                f.write(file_data)
            print(f"Downloaded: {unique_name}")
            attachment_count += 1

    except Exception as e:
        # Deliberate best-effort: report and let the caller continue.
        print(f"Error with message {msg_id}: {e}")

def download_all_attachments(service):
    """Download attachments from every Primary-tab message that has one.

    Lists messages matching 'has:attachment category:primary' in pages of up
    to 100, calls save_attachment for each listed message, and follows
    nextPageToken until the listing is exhausted. Prints a per-page progress
    line and a final total.

    Args:
        service: Gmail API service object.
    """
    user_id = 'me'
    handled = 0
    next_token = None
    has_more = True

    while has_more:
        listing = service.users().messages().list(
            userId=user_id,
            q='has:attachment category:primary',
            maxResults=100,
            pageToken=next_token
        ).execute()

        batch = listing.get('messages', [])
        print(f"Processing {len(batch)} messages...")

        for entry in batch:
            save_attachment(service, entry['id'])
            handled += 1

        # A missing or empty token means this was the last page.
        next_token = listing.get('nextPageToken')
        has_more = bool(next_token)

    print(f"\nDone. Processed {handled} messages.")

# Run the script: build the Gmail service via authenticate(), then hand it to
# download_all_attachments() to fetch the attachments.
service = authenticate()
download_all_attachments(service)

In [None]:
# Folder setup
input_dir = "gmail_attachments"  # Your folder with DOCX files
output_dir = "output"  # Destination folder for the combined CSV
# Create the output folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)

# Only files whose name starts with this prefix are processed further below.
target_prefix = "Results list for_50___"

# Split articles
def split_articles_by_marker(paragraphs):
    """Split a paragraph sequence into articles at "End of Document" markers.

    Each marker paragraph stays with the article it terminates. Any
    paragraphs left after the last marker are returned as a final article.

    Args:
        paragraphs: iterable of paragraph strings.

    Returns:
        List of articles, each a list of paragraph strings.
    """
    articles, pending = [], []
    for para in paragraphs:
        pending.append(para)
        if "End of Document" not in para:
            continue
        # Marker seen: the pending paragraphs form a complete article.
        articles.append(pending)
        pending = []
    return (articles + [pending]) if pending else articles

# extract fields from a single article block
def extract_article_fields(article):
    """Parse one article's paragraphs into a flat dict of metadata and text.

    Routing rules, in order of precedence:
      * "Body" / "Notes" (exact match) open the body / notes section;
      * a paragraph containing "End of Document" closes the open section;
      * "Load-Date:", "Section:", "Length:" and "Byline:" prefixed paragraphs
        set the matching field wherever they occur (last occurrence wins);
      * paragraphs starting with "Copyright" are collected into "copyright";
      * every other paragraph goes to the open section, or to the header when
        none is open.

    The first header paragraph becomes the headline. Later header paragraphs
    are joined into "source" until one contains a date like
    "November 18, 2024"; that paragraph becomes "date" and the rest of the
    header is ignored.

    Args:
        article: iterable of paragraph strings belonging to one article.

    Returns:
        dict with keys headline, source, date, copyright, section, length,
        byline, body, notes and load_date; values are strings, "" if absent.
    """
    # (leading label, output field) for single-paragraph metadata entries.
    prefixed_fields = (
        ("Load-Date:", "load_date"),
        ("Section:", "section"),
        ("Length:", "length"),
        ("Byline:", "byline"),
    )
    fields = {
        "headline": "", "source": "", "date": "", "copyright": "",
        "section": "", "length": "", "byline": "",
        "body": "", "notes": "", "load_date": ""
    }
    header_lines, body_lines, note_lines = [], [], []
    copyright_lines = []
    section = header_lines  # list currently receiving unlabelled paragraphs

    for line in article:
        if line == "Body":
            section = body_lines
        elif line == "Notes":
            section = note_lines
        elif "End of Document" in line:
            section = header_lines
        else:
            for prefix, key in prefixed_fields:
                if line.startswith(prefix):
                    # Drop only the leading label; str.replace would also
                    # remove later occurrences inside the value.
                    fields[key] = line[len(prefix):].strip()
                    break
            else:
                # No metadata label matched this paragraph.
                if line.startswith("Copyright"):
                    copyright_lines.append(line)
                else:
                    section.append(line)

    if header_lines:
        fields["headline"] = header_lines[0]
        source_lines = []
        for line in header_lines[1:]:
            if re.search(r"[A-Za-z]+\s+\d{1,2},\s*\d{4}", line):
                fields["date"] = line
                break
            source_lines.append(line)
        fields["source"] = " ".join(source_lines)

    # Joining avoids the trailing space left by appending "line + ' '".
    fields["copyright"] = " ".join(copyright_lines)
    fields["body"] = "\n".join(body_lines)
    fields["notes"] = "\n".join(note_lines)
    return fields

# Process all DOCX files in the input directory
all_data = []
# os.listdir returns names in arbitrary order; sorting makes the row order of
# the combined CSV reproducible between runs and machines.
for filename in sorted(os.listdir(input_dir)):
    if filename.startswith(target_prefix) and filename.lower().endswith(".docx"):
        file_path = os.path.join(input_dir, filename)
        doc = Document(file_path)
        paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
        articles = split_articles_by_marker(paragraphs)
        structured = [extract_article_fields(a) for a in articles]
        # Record which file each article came from.
        for record in structured:
            record["source_file"] = filename
        all_data.extend(structured)

if not all_data:
    # Make an empty result visible instead of silently writing an empty CSV.
    print(f"No articles found: no DOCX file in {input_dir} starts with {target_prefix!r}")

# Save all data to CSV
df = pd.DataFrame(all_data)
csv_path = os.path.join(output_dir, "all_articles_combined.csv")
df.to_csv(csv_path, index=False)
print(f"CSV saved to: {csv_path}")

✅ CSV saved to: output/all_articles_combined.csv


In [None]:
# Optionally, do some basic data quality checks
print(f"Dataset shape: {df.shape}")

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values)

# Check for empty strings
empty_strings = (df == '').sum()
print("\nEmpty strings per column:")
print(empty_strings)

# Check date formats: pull a four-digit year (19xx or 20xx) out of each date.
years = []
for date_str in df['date']:
    # Only strings can be searched. Missing values (for instance NaN when the
    # frame was reloaded from the CSV) are recorded as "no year".
    match = re.search(r'\b(19|20)\d{2}\b', date_str) if isinstance(date_str, str) else None
    years.append(match.group(0) if match else None)

# Count of articles by year (Counter comes from the notebook's import cell)
year_counts = Counter(y for y in years if y)
print("\nArticles by year:")
for year, count in sorted(year_counts.items()):
    print(f"{year}: {count}")

# Check length column - convert to numeric if possible
def extract_word_count(length_str):
    """Return the number that precedes "words" in a length value.

    Args:
        length_str: length text such as "532 words". Non-string values
            (None, NaN) are accepted and treated as having no word count.

    Returns:
        The word count as an int, or None when the value is not a string or
        contains no "<digits> words" pattern.
    """
    if not isinstance(length_str, str):
        # re.search raises TypeError on None/NaN; treat them as "no count".
        return None
    match = re.search(r'(\d+)\s*words', length_str)
    return int(match.group(1)) if match else None

# Parse every length value and keep only those that yielded a number.
word_counts = [
    count
    for count in map(extract_word_count, df['length'])
    if count is not None
]

if len(word_counts) > 0:
    # Emit the summary as one multi-line print.
    print(
        "\nWord count statistics:",
        f"Min: {min(word_counts)}",
        f"Max: {max(word_counts)}",
        f"Average: {sum(word_counts)/len(word_counts):.1f}",
        f"Articles with word count: {len(word_counts)}",
        sep="\n",
    )

# Source distribution: the ten most frequent values of the 'source' column.
source_frequencies = df['source'].value_counts()
top_sources = source_frequencies.head(10)
print("\nTop 10 sources:")
print(top_sources)