# Wikipedia scraping

## List of battles

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.parse import urljoin
import time
import random

# Root of the English Wikipedia; relative hrefs are resolved against it.
BASE_URL = "https://en.wikipedia.org"

# Alphabetical battle index that the scraper starts from.
MAIN_PAGE = urljoin(BASE_URL, "/wiki/List_of_battles_(alphabetical)")

# Headers sent with every request (a desktop-browser User-Agent string).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    )
}

def extract_year(text):
    """Return the first AD year mentioned in ``text``.

    A year is any standalone 3-4 digit number. Numbers that denote a BC/BCE
    date are skipped, both when the era marker follows the number directly
    ("490 BC") and when the number opens a range whose end carries the
    marker ("431-404 BC"). For AD ranges such as "1495-96" the start of the
    range is returned.

    Args:
        text: Text to search. Anything that is not a ``str`` yields ``None``.

    Returns:
        int | None: The year, or ``None`` when the text contains no AD year.
    """
    if not isinstance(text, str):
        return None

    for candidate in re.finditer(r'\b(\d{3,4})\b', text):
        remainder = text[candidate.end():]

        # "490 BC" / "490 BCE": the number itself is a BC year.
        if re.match(r'\s*BC', remainder):
            continue

        # "431-404 BC": the era marker after the range end also applies to
        # the range start, so the start must not be reported as an AD year.
        if re.match(r'\s*[–\-]\s*\d{1,4}\s*BC', remainder):
            continue

        return int(candidate.group(1))

    return None

def clean_description(text):
    """Normalize a scraped description string.

    En/em dashes become ASCII hyphens, named HTML entities become a space,
    a literal backslash followed by a lowercase letter is dropped, and
    whitespace runs are collapsed. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    # U+2013 (en dash) and U+2014 (em dash) both map to '-'.
    result = text.translate({0x2013: '-', 0x2014: '-'})

    # Named entities such as "&amp;" are replaced by a single space.
    result = re.sub(r'&[a-zA-Z]+;', ' ', result)

    # Leftover escape sequences written out literally, e.g. a backslash + "n".
    result = re.sub(r'\\[a-z]', '', result)

    # Collapse whitespace last so the substitutions above leave no gaps.
    return re.sub(r'\s+', ' ', result).strip()

def standardize_conflict_name(conflict_text, year=None):
    """
    Collapse the many spellings of the two World Wars into canonical labels.

    The battle year takes part in the decision: a year in 1939-1945 forces
    "World War II" and a year in 1914-1918 forces "World War I", whatever the
    text says. A WWII match always wins over a WWI match.

    Args:
        conflict_text (str): The original conflict text
        year (int, optional): The year of the battle, used for correction

    Returns:
        str: "World War II", "World War I", or the input unchanged when
        neither applies. Non-string or blank input is returned as-is.
    """
    if not isinstance(conflict_text, str) or not conflict_text.strip():
        return conflict_text

    # Over-long numerals ("World War III" and longer) are treated as a
    # garbled "World War II".
    garbled_numerals = ("World War III", "World War IIII", "World War IIIII")
    if any(numeral in conflict_text for numeral in garbled_numerals):
        return "World War II"

    # Year-based classification; the two periods do not overlap.
    year_known = year is not None
    in_ww2_period = year_known and 1939 <= year <= 1945
    in_ww1_period = year_known and not in_ww2_period and 1914 <= year <= 1918

    lowered = conflict_text.lower()

    ww2_indicators = ("world war 2", "ww2", "wwii", "second world war",
                      "2nd world war", "world war two", "world war ii")
    ww1_indicators = ("world war 1", "ww1", "wwi", "first world war",
                      "1st world war", "world war one", "great war", "world war i")

    # WWII is tested first because several WWI indicators ("wwi",
    # "world war i") are substrings of their WWII counterparts.
    if in_ww2_period or any(tag in lowered for tag in ww2_indicators):
        return "World War II"

    if in_ww1_period or any(tag in lowered for tag in ww1_indicators):
        return "World War I"

    # No world war references found
    return conflict_text

def extract_conflict(desc, year):
    """Derive the conflict/war label from a battle description.

    The preferred source is whatever follows the first occurrence of the
    year in ``desc``. When the year does not appear in the text, the text
    from the first matching conflict marker ("part of", "during", ...)
    onward is used instead. The result is passed through
    ``standardize_conflict_name``.

    Returns ``None`` when ``year`` is falsy, ``desc`` is not a string, or
    nothing usable is found.
    """
    if not (year and isinstance(desc, str)):
        return None

    # Integer form so a float year such as 1944.0 still matches "1944".
    year_token = str(int(year))
    _, found, tail = desc.partition(year_token)
    if found:
        # Drop the separator (dashes / whitespace) between year and conflict.
        tail = re.sub(r'^[-–—\s]+', '', tail.strip())
        tail = standardize_conflict_name(tail, year)
        return tail if tail else None

    # Year not present in the text: fall back to well-known lead-in phrases.
    lowered = desc.lower()
    markers = (
        "part of", "during", "in the", "related to", "world war",
        "civil war", "napoleonic", "crusade", "invasion of",
    )
    for marker in markers:
        position = lowered.find(marker)
        # A marker at position 0 is skipped, as is a missing one (-1).
        if position > 0:
            return standardize_conflict_name(desc[position:].strip(), year)

    return None

def get_alternate_battle_pages(timeout=30):
    """Collect URLs of other Wikipedia pages that list battles.

    Sources, in order: "List_of_battles" links on the main page that contain
    "by_location" or "chronological", links in the "See also" section,
    a handful of probed letter pages, and finally the main page itself.

    Args:
        timeout (float): Seconds to wait for each HTTP response. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        list[tuple[str, str]]: Unique ``(url, label)`` pairs in discovery
        order; empty when the main page cannot be fetched.
    """
    print(f"Checking for battle list pages from: {MAIN_PAGE}")
    try:
        response = requests.get(MAIN_PAGE, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch main page: {MAIN_PAGE} ({e})")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch main page: {MAIN_PAGE}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    pages = []

    # Only include pages that are likely to contain actual battle lists
    # Exclude navigation and edit pages
    excluded_terms = ["action=edit", "action=history", "Special:", "Talk:",
                      "oldid=", "printable=", "toggle_view_mobile", "action=info"]

    # Method 1: Look for links to categorized battle lists
    for a in soup.find_all("a"):
        href = a.get("href", "")
        text = a.text.strip()

        # Only include links that look like battle list pages
        # and exclude navigation/action links
        if (href and "List_of_battles" in href and
            href != "/wiki/List_of_battles_(alphabetical)" and
            ("by_location" in href or "chronological" in href) and
            not any(term in href for term in excluded_terms)):

            url = urljoin(BASE_URL, href)
            pages.append((url, text))
            print(f"Found battle list page: {text} -> {url}")

    # Method 2: Check for links in the See Also section
    # NOTE(review): this relies on <h2><span class="mw-headline"> markup and
    # on the list being a sibling of the <h2>; confirm that still matches the
    # HTML Wikipedia currently serves.
    see_also = None
    for h2 in soup.find_all("h2"):
        span = h2.find("span", class_="mw-headline")
        if span and "See also" in span.text:
            see_also = h2
            break

    if see_also:
        next_elem = see_also.next_sibling
        while next_elem and next_elem.name != "h2":
            if next_elem.name == "ul":
                for li in next_elem.find_all("li"):
                    a = li.find("a")
                    if a and "List_of_battles" in a.get("href", ""):
                        href = a.get("href", "")
                        # Skip navigation/edit links
                        if any(term in href for term in excluded_terms):
                            continue

                        url = urljoin(BASE_URL, href)
                        text = a.text.strip()
                        if (url, text) not in pages:
                            pages.append((url, text))
                            print(f"Found battle list page in 'See also': {text} -> {url}")
            next_elem = next_elem.next_sibling

    # Method 3: Check for letter-specific pages like "List_of_battles_(A)"
    # This rarely exists anymore, but check a few common ones
    letter_ranges = ["A", "B", "C", "S", "A-F", "G-M", "N-Z"]
    for letter in letter_ranges:
        letter_url = urljoin(BASE_URL, f"/wiki/List_of_battles_({letter})")
        try:
            r = requests.head(letter_url, headers=HEADERS, timeout=timeout)
            if r.status_code == 200:
                pages.append((letter_url, f"Battles {letter}"))
                print(f"Found letter-specific page: Battles {letter} -> {letter_url}")
        except requests.RequestException as e:
            print(f"Error checking for letter page {letter}: {e}")
        # Pause after every probe, not only after a hit: misses are requests
        # too and count toward rate limiting.
        time.sleep(random.uniform(0.5, 1.0))

    # Always include the main alphabetical page
    pages.append((MAIN_PAGE, "Main"))

    # Remove duplicates (first occurrence of each URL wins)
    unique_pages = []
    seen_urls = set()
    for url, label in pages:
        if url not in seen_urls and not any(term in url for term in excluded_terms):
            seen_urls.add(url)
            unique_pages.append((url, label))

    print(f"Found {len(unique_pages)} unique battle list pages")
    return unique_pages

def process_list(ul, battles, section):
    """Extract battles from the direct ``<li>`` children of a list element.

    An item is accepted when its first link points at a regular article
    (``/wiki/...`` without a namespace colon or "List_of") and either a
    battle-related term appears in its text or a year can be extracted.
    Accepted items are appended to ``battles`` in place unless an entry with
    the same link, or the same name and year, is already present.

    Args:
        ul: List element exposing BeautifulSoup's ``find_all``.
        battles (list[dict]): Accumulator shared across calls; mutated.
        section: Section label supplied by the caller; currently unused.

    Returns:
        int: Number of battles appended by this call.
    """
    # Skip very small lists that are likely navigation
    if len(ul.find_all("li")) < 2:
        return 0

    battle_terms = ("battle of", "siege of", "campaign", "offensive", "assault",
                    "war of", "invasion of", "operation ", "attack on")

    # Index what is already collected once, so each item is checked in O(1)
    # instead of rescanning the whole accumulator.
    known_links = {battle["wiki_link"] for battle in battles}
    known_name_years = {(battle["battle_name"], battle["year"]) for battle in battles}

    battle_count = 0

    for li in ul.find_all("li", recursive=False):  # Only direct children
        a_tag = li.find("a")
        if not a_tag:
            continue

        battle_name = a_tag.text.strip()
        href = a_tag.get("href", "")

        # Skip non-wiki links, categories, lists, etc.
        if not href.startswith("/wiki/") or ":" in href or "List_of" in href:
            continue

        wiki_link = urljoin(BASE_URL, href)
        full_text = li.get_text(strip=True)

        # Skip very short texts that are likely not battles
        if len(full_text) < 10:
            continue

        clean_text = clean_description(full_text)
        lowered_text = clean_text.lower()
        lowered_name = battle_name.lower()

        year = extract_year(clean_text)

        # A recognizable year is enough on its own; otherwise require a
        # battle-related term in the link text or the item text.
        is_battle = bool(year) or any(
            term in lowered_name or term in lowered_text for term in battle_terms
        )
        if not is_battle:
            continue

        # Duplicate when the link matches, or when name AND year both match.
        if wiki_link in known_links or (battle_name, year) in known_name_years:
            continue

        battles.append({
            "battle_name": battle_name,
            "year": year,
            "description": clean_text,
            "conflict": extract_conflict(clean_text, year),
            "wiki_link": wiki_link
        })
        known_links.add(wiki_link)
        known_name_years.add((battle_name, year))
        battle_count += 1

    return battle_count

def scrape_battles_from_page(url, label, timeout=30):
    """Fetch one Wikipedia page and collect the battles listed on it.

    Three strategies feed a single result list:

    1. lists that follow letter / letter-range section headings,
    2. rows of ``wikitable`` tables whose header row mentions
       battle/year/date/war/conflict,
    3. a fallback over every ``<ul>`` in the content area, run when the
       first two produced fewer than 200 battles or fewer than 20 sections.

    Args:
        url (str): Page to scrape.
        label (str): Human-readable page label; not used by the scraper.
        timeout (float): Seconds to wait for the HTTP response. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        list[dict]: One dict per battle with the keys ``battle_name``,
        ``year``, ``description``, ``conflict`` and ``wiki_link``; empty
        when the page cannot be fetched or has no content container.
    """
    print(f"Scraping battles from: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch page: {url} ({e})")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page: {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    content_div = soup.find("div", {"class": "mw-parser-output"})

    if not content_div:
        print(f"Could not find content on page: {url}")
        return []

    battles = []
    current_section = None

    # Keep track of section counts for debugging
    section_counts = {}

    # STRATEGY 1: Find all section headings (most likely the letter headings A, B, C, etc.)
    # NOTE(review): headings are recognized only through a
    # <span class="mw-headline"> child. The run recorded with this notebook
    # printed no "Found section" lines, which suggests the live page no longer
    # emits that span - confirm against the current Wikipedia markup.
    for element in content_div.find_all(['h2', 'h3', 'div']):
        span = None

        # Plain headings and "mw-heading" wrapper divs are handled alike.
        is_heading = element.name in ['h2', 'h3'] or (
            element.name == 'div' and element.get('class') and 'mw-heading' in element.get('class')
        )
        if is_heading:
            span = element.find("span", {"class": "mw-headline"})

        if span:
            section_text = span.text.strip()

            # Check if it looks like a letter or letter range section
            if len(section_text) == 1 and section_text.isalpha():
                current_section = section_text
                print(f"Found section: {current_section}")
                section_counts[current_section] = 0
            elif re.match(r'^[A-Z][^a-z]*$', section_text) and len(section_text) < 10:
                # Matches letter ranges like "A-F" or similar
                current_section = section_text
                print(f"Found section: {current_section}")
                section_counts[current_section] = 0

            # Process lists that follow this heading
            if current_section:
                # Find all lists that follow this header until the next header
                next_elem = element.next_sibling

                # Skip text nodes and comments
                while next_elem and (not hasattr(next_elem, 'name') or not next_elem.name):
                    next_elem = next_elem.next_sibling

                # Look for all lists before the next heading
                while next_elem and not (hasattr(next_elem, 'name') and next_elem.name in ['h1', 'h2', 'h3']):
                    if hasattr(next_elem, 'name'):
                        # Direct UL element
                        if next_elem.name == 'ul':
                            battle_count = process_list(next_elem, battles, current_section)
                            section_counts[current_section] = section_counts.get(current_section, 0) + battle_count

                        # List might be inside another container (div, p, etc.)
                        else:
                            for ul in next_elem.find_all('ul', recursive=True):
                                battle_count = process_list(ul, battles, current_section)
                                section_counts[current_section] = section_counts.get(current_section, 0) + battle_count

                    next_elem = next_elem.next_sibling

    # STRATEGY 2: Look for battles in tables
    tables = content_div.find_all("table", {"class": "wikitable"})
    for table in tables:
        # Skip tables that are clearly navigation or metadata
        if any(cls in table.get("class", []) for cls in ["infobox", "navbox", "vertical-navbox"]):
            continue

        # Check column structure to determine if it's a battle table
        headers = []
        header_row = table.find("tr")
        if header_row:
            headers = [th.get_text(strip=True).lower() for th in header_row.find_all(["th"])]

        is_battle_table = False
        if headers and any(keyword in " ".join(headers) for keyword in ["battle", "year", "date", "war", "conflict"]):
            is_battle_table = True

        if is_battle_table:
            print(f"Found battle table with columns: {headers}")

            # Process table rows
            for row in table.find_all("tr")[1:]:  # Skip header row
                cells = row.find_all(["td"])
                if len(cells) < 2:
                    continue

                # First cell usually has the battle name
                a_tag = cells[0].find("a")
                if not a_tag:
                    continue  # Skip if no link found
                battle_name = a_tag.text.strip()
                wiki_link = urljoin(BASE_URL, a_tag.get("href", ""))

                # Second column is assumed to hold the date (at least two
                # cells are guaranteed by the length check above).
                year_text = cells[1].get_text(strip=True)

                # Third column, when present, is assumed to hold the conflict
                conflict_text = cells[2].get_text(strip=True) if len(cells) > 2 else None

                # Combine all cells for description
                full_text = " ".join(cell.get_text(strip=True) for cell in cells)
                clean_text = clean_description(full_text)

                # Extract year if not found in specific column
                year = extract_year(year_text) if year_text else extract_year(clean_text)

                # Extract conflict if not found in specific column
                if conflict_text:
                    # Clean and standardize the conflict text
                    conflict = clean_description(conflict_text)
                    conflict = standardize_conflict_name(conflict, year)
                else:
                    conflict = extract_conflict(clean_text, year)

                # Add to battles list
                if "/wiki/" in wiki_link and "List_of" not in wiki_link:
                    # Duplicate when the link matches, or when name AND year
                    # both match an entry that was already collected.
                    is_duplicate = any(
                        battle["wiki_link"] == wiki_link
                        or (battle["battle_name"] == battle_name and battle["year"] == year)
                        for battle in battles
                    )

                    if not is_duplicate:
                        battles.append({
                            "battle_name": battle_name,
                            "year": year,
                            "description": clean_text,
                            "conflict": conflict,
                            "wiki_link": wiki_link
                        })

    # Print section counts (for debugging)
    if section_counts:
        print("Battles found by section:")
        total_section_battles = 0
        for section, count in sorted(section_counts.items()):
            print(f"  {section}: {count} battles")
            total_section_battles += count
        print(f"  Total from sections: {total_section_battles}")

    # STRATEGY 3: Fallback - look for any lists with battle-like entries
    # Only run this if we found very few battles with normal methods
    # or if we're missing battles from some sections
    if len(battles) < 200 or len(section_counts) < 20:
        print("Using fallback approach to find additional battles...")

        # Process all UL elements anywhere in the page; process_list skips
        # entries that are already present, so reprocessing is harmless.
        battle_count_before = len(battles)
        for ul in content_div.find_all("ul"):
            process_list(ul, battles, "fallback")

        print(f"Fallback method found {len(battles) - battle_count_before} additional battles")

    print(f"Total battles found on this page: {len(battles)}")
    return battles

def main():
    """Scrape the battle lists, clean the result and write ``battles_list.csv``.

    Steps: scrape the main alphabetical page; if it yields fewer than 500
    battles, scrape the alternate list pages as well; de-duplicate;
    standardize World War labels; store years as integers; add a 1-based
    ``battle_id``; save the CSV and print a short sample.

    Any exception is caught here, reported with a traceback and not
    re-raised, since this is the top-level entry point.
    """
    try:
        all_battles = []

        # We'll just focus on the main page since that contains everything
        print(f"Scraping battles from main page: {MAIN_PAGE}")
        all_battles.extend(scrape_battles_from_page(MAIN_PAGE, "Main"))

        # Check some alternative pages only if we found few battles
        if len(all_battles) < 500:
            print("Found relatively few battles on main page, checking alternate pages...")
            for url, label in get_alternate_battle_pages():
                # The alternate-page list also contains the main page, which
                # was scraped above; a second fetch would only add duplicates.
                if url == MAIN_PAGE:
                    continue
                all_battles.extend(scrape_battles_from_page(url, label))
                # Random delay to avoid rate limiting
                time.sleep(random.uniform(1, 2))

        # If we still found nothing, print a debug message
        if not all_battles:
            print("\n⚠️ No battles found! The page structure might have changed.")
            print("Here's some information about the main page structure:")

            # Timeout so the diagnostic request cannot hang indefinitely.
            response = requests.get(MAIN_PAGE, headers=HEADERS, timeout=30)
            soup = BeautifulSoup(response.content, "html.parser")

            print(f"Page title: {soup.title.text if soup.title else 'No title'}")
            print("Headings found:")
            for i, h in enumerate(soup.find_all(['h1', 'h2', 'h3'])[:10]):
                print(f"{i+1}. {h.name}: {h.get_text(strip=True)}")

            return

        print(f"\nTotal battles found across all pages: {len(all_battles)}")

        # Create a DataFrame and handle duplicates
        df = pd.DataFrame(all_battles)
        print(f"Initial data: {len(df)} battles")

        # Count before deduplication (rows without a year are not counted,
        # because groupby drops missing keys by default)
        battle_counts = df.groupby(['battle_name', 'year']).size().reset_index(name='count')
        duplicate_battles = battle_counts[battle_counts['count'] > 1]

        if len(duplicate_battles) > 0:
            print(f"{len(duplicate_battles)} battle name+year combinations appear multiple times")
            print("Examples: ")
            for i, row in duplicate_battles.head(5).iterrows():
                print(f"  - {row['battle_name']} ({row['year']}): {row['count']} occurrences")

        # Remove rows that agree on name, year AND link (same-named battles in
        # different years, or on different articles, are kept)
        df_cleaned = df.drop_duplicates(subset=["battle_name", "year", "wiki_link"]).copy()
        print(f"After deduplication: {len(df_cleaned)} unique battles")

        # Post-processing: Standardize World War references in all conflicts
        print("Standardizing World War references in conflicts...")

        # "World War III" is a prefix of the longer garbled forms too.
        garbled_pattern = 'World War III|World War IIII|World War IIIII'

        print("Checking for problematic World War references...")
        problematic_refs = df_cleaned['conflict'].str.contains(
            garbled_pattern, regex=True, case=True, na=False)
        if problematic_refs.sum() > 0:
            print(f"Found {problematic_refs.sum()} problematic 'World War III+' references that need fixing")

        # Apply year-based correction to ensure battles are properly classified
        def apply_standardization(row):
            if pd.isna(row['conflict']):
                return ''
            return standardize_conflict_name(row['conflict'], row['year'])

        df_cleaned['conflict'] = df_cleaned.apply(apply_standardization, axis=1)

        # Count standardized World War references
        ww1_count = df_cleaned['conflict'].str.contains('^World War I$', regex=True, case=True, na=False).sum()
        ww2_count = df_cleaned['conflict'].str.contains('^World War II$', regex=True, case=True, na=False).sum()
        print(f"Found {ww1_count} battles from World War I and {ww2_count} battles from World War II")

        # Final check for any remaining issues
        remaining_issues = df_cleaned['conflict'].str.contains(
            garbled_pattern, regex=True, case=True, na=False)
        if remaining_issues.sum() > 0:
            print(f"Warning: Still found {remaining_issues.sum()} problematic World War references")
            print("Applying final correction...")

            # Longest form first so a shorter replacement cannot leave stray
            # "I"s behind. regex=False pins literal matching, whose default
            # differs between pandas versions.
            for garbled in ('World War IIIII', 'World War IIII', 'World War III'):
                df_cleaned['conflict'] = df_cleaned['conflict'].str.replace(
                    garbled, 'World War II', regex=False)

        # Check for any potential misclassifications between WWI and WWII
        # (missing years compare as False and are left alone)
        in_ww2_period = df_cleaned['year'].between(1939, 1945)
        mislabeled = in_ww2_period & (df_cleaned['conflict'] == 'World War I')

        if mislabeled.sum() > 0:
            print(f"Warning: Found {mislabeled.sum()} battles during WWII period (1939-1945) still labeled as World War I")
            print("These will be corrected...")

            # Force correct WWII period battles
            df_cleaned.loc[mislabeled, 'conflict'] = 'World War II'

        # Check and fix year data
        year_count = df_cleaned['year'].notna().sum()
        print(f"Battles with year information: {year_count} ({year_count/len(df_cleaned)*100:.1f}%)")

        # Missing years turn the column into floats ("1944.0"). Store years as
        # nullable integers so the CSV and the sample below show "1944". This
        # is done only after the comparisons above, which expect NaN.
        df_cleaned['year'] = pd.to_numeric(df_cleaned['year'], errors='coerce').astype('Int64')

        # Reorder columns to a logical order and add battle_id as primary key
        df_cleaned.reset_index(drop=True, inplace=True)
        df_cleaned['battle_id'] = df_cleaned.index + 1  # Start IDs from 1
        column_order = ["battle_id", "battle_name", "year", "description", "conflict", "wiki_link"]
        df_cleaned = df_cleaned[column_order]

        # Save to CSV
        df_cleaned.to_csv("battles_list.csv", index=False)

        print(f"\n✅ Scraped {len(df_cleaned)} unique battles. Saved to 'battles_list.csv'.")

        # Show some sample data
        print("\nSample of scraped battles:")
        for i, row in df_cleaned.head(5).iterrows():
            print(f"{i+1}. {row['battle_name']} ({row['year'] if pd.notna(row['year']) else 'Year unknown'})")
            if row['conflict'] and pd.notna(row['conflict']):
                print(f"   Conflict: {row['conflict']}")
            print(f"   Link: {row['wiki_link']}")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Entry point: run the scraper when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

Scraping battles from main page: https://en.wikipedia.org/wiki/List_of_battles_(alphabetical)
Scraping battles from: https://en.wikipedia.org/wiki/List_of_battles_(alphabetical)
Using fallback approach to find additional battles...
Fallback method found 962 additional battles
Total battles found on this page: 962

Total battles found across all pages: 962
Initial data: 962 battles
After deduplication: 962 unique battles
Standardizing World War references in conflicts...
Checking for problematic World War references...
Found 58 battles from World War I and 87 battles from World War II
Battles with year information: 883 (91.8%)

✅ Scraped 962 unique battles. Saved to 'battles_list.csv'.

Sample of scraped battles:
1. Battle of Aachen (1944.0)
   Conflict: World War II
   Link: https://en.wikipedia.org/wiki/Battle_of_Aachen
2. Battle of Abensberg (1809.0)
   Conflict: Napoleonic Wars
   Link: https://en.wikipedia.org/wiki/Battle_of_Abensberg
3. Battle of Abbeville (1940.0)
   Conflict: Wo

## Battle info

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
import os.path
from urllib.parse import urljoin

# Constants
# Root of the English Wikipedia; relative hrefs are resolved against it.
BASE_URL = "https://en.wikipedia.org"

# Headers sent with every request (a desktop-browser User-Agent string).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    )
}

def load_battles_csv(filename="battles_list.csv"):
    """Read the battles CSV into a DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The file's contents, one row per battle.

    Raises:
        FileNotFoundError: If ``filename`` is not an existing regular file.
    """
    if os.path.isfile(filename):
        frame = pd.read_csv(filename)
        print(f"Loaded {len(frame)} battles from {filename}")
        return frame

    raise FileNotFoundError(f"Battles CSV file not found: {filename}")

def extract_number(text, keywords=None):
    """
    Pull a single integer (typically a troop count) out of free text.

    Heuristics are tried in this order and the first hit wins:
      1. a number within 50 characters before, or 20 characters after, the
         first occurrence of a keyword (keywords are tried in order),
      2. the midpoint of the first numeric range ("10,000-15,000"),
      3. the first comma-formatted thousands value ("10,000"),
      4. the first run of three or more digits,
      5. the first remaining number greater than 50.

    Args:
        text: Text to search; non-strings yield ``None``.
        keywords: One keyword or an iterable of keywords. Defaults to
            'men', 'troops', 'soldiers', 'personnel'. Matching is a plain
            substring test against the lower-cased text.

    Returns:
        int | None: The extracted number, or ``None`` if nothing qualifies.
    """
    if not isinstance(text, str):
        return None

    def to_int(token, via_float=False):
        """Convert a matched token to int; ``None`` when conversion fails."""
        try:
            digits = token.replace(',', '')
            return int(float(digits)) if via_float else int(digits)
        except ValueError:
            return None

    text = text.replace('\xa0', ' ')  # Handle non-breaking spaces

    if keywords is None:
        keywords = ['men', 'troops', 'soldiers', 'personnel']
    elif isinstance(keywords, str):
        keywords = [keywords]

    number_pattern = r'(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?|\d+)'
    lowered = text.lower()

    # 1. Numbers next to a keyword.
    for keyword in keywords:
        pos = lowered.find(keyword)
        if pos < 0:
            continue

        # Closest number before the keyword is the last match in the window.
        preceding = re.findall(number_pattern, text[max(0, pos - 50):pos])
        if preceding:
            value = to_int(preceding[-1], via_float=True)
            if value is not None:
                return value

        # Closest number after the keyword is the first match in the window.
        window_start = pos + len(keyword)
        window_end = min(len(text), window_start + 20)
        following = re.findall(number_pattern, text[window_start:window_end])
        if following:
            value = to_int(following[0], via_float=True)
            if value is not None:
                return value

    # 2. First numeric range, reported as its integer midpoint.
    span = re.search(r'(\d{1,3}(?:,\d{3})+|\d+)(?:\s*[-–]\s*)(\d{1,3}(?:,\d{3})+|\d+)', text)
    if span:
        low, high = to_int(span.group(1)), to_int(span.group(2))
        if low is not None and high is not None:
            return (low + high) // 2

    # 3. First value formatted as thousands, then 4. first 3+ digit run.
    for pattern in (r'(\d{1,3},\d{3})', r'(\d{3,})'):
        found = re.search(pattern, text)
        if found:
            value = to_int(found.group(1))
            if value is not None:
                return value

    # 5. Last resort: any number, ignoring small ones that are more likely
    # dates or footnote markers than troop counts.
    for token in re.findall(r'(\d+)', text):
        value = to_int(token)
        if value is not None and value > 50:
            return value

    return None

def clean_country_name(text):
    """
    Normalize one belligerent entry into a country name.

    Strips parenthesised and bracketed text (footnote markers, "citation
    needed" tags), collapses whitespace, expands a few common abbreviations,
    and rejects entries that look like battle outcomes, bare numbers or dates.

    Returns the cleaned name, or None when `text` is not a string or the
    entry does not look like a country.
    """
    if not isinstance(text, str):
        return None

    # Remove text in parentheses, then anything in square brackets.
    # The bracket pass already covers "[12]" and "[citation needed]".
    cleaned = re.sub(r'\([^)]*\)', '', text)
    cleaned = re.sub(r'\[[^\]]*\]', '', cleaned)

    # Remove excess whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Expand abbreviations BEFORE the minimum-length filter below: 'US' and
    # 'UK' are only two characters long and would otherwise be discarded
    # before they could ever be replaced.
    replacements = {
        'US': 'United States',
        'U.S.': 'United States',
        'U.S.A.': 'United States',
        'USA': 'United States',
        'UK': 'United Kingdom',
        'U.K.': 'United Kingdom',
        'Soviet Union': 'Russia',
        'USSR': 'Russia',
        'U.S.S.R.': 'Russia'
    }
    if cleaned in replacements:
        return replacements[cleaned]

    # Skip short phrases that are clearly results, not countries
    result_phrases = [
        'victory', 'defeat', 'decisive', 'inconclusive', 'stalemate',
        'draw', 'tactical', 'strategic', 'pyrrhic', 'truce', 'ceasefire'
    ]
    lowered = cleaned.lower()
    if len(cleaned.split()) <= 3 and any(phrase in lowered for phrase in result_phrases):
        return None

    # Remove very short strings, bare numbers, and "<number> <word>" dates
    if len(cleaned) < 3 or cleaned.isdigit() or re.match(r'^\d+\s+\w+$', cleaned):
        return None

    return cleaned

def extract_html_content(element):
    """
    Flatten a parsed HTML element into plain text.

    None yields "", a plain string is returned stripped. For any other
    element the direct children are walked: text nodes are kept, <br> becomes
    a newline, list items become "\\n• " bullets, a fixed set of inline/block
    wrappers is recursed into, and links contribute their text. Children with
    any other tag name are ignored. Pieces are joined with single spaces.
    """
    if element is None:
        return ""
    if isinstance(element, str):
        return element.strip()

    wrapper_tags = ('b', 'strong', 'i', 'em', 'span', 'div', 'p')
    bullet = '\n• '
    pieces = []

    for node in element.children:
        tag = node.name

        # Text nodes have no tag name; keep them unless they are blank.
        if tag is None:
            stripped = node.strip()
            if stripped:
                pieces.append(stripped)
            continue

        if tag == 'br':
            pieces.append('\n')
        elif tag == 'li':
            pieces.append(bullet + extract_html_content(node))
        elif tag in wrapper_tags:
            pieces.append(extract_html_content(node))
        elif tag == 'a':
            pieces.append(node.get_text().strip())
        elif tag in ('ul', 'ol'):
            # Only direct <li> children, so nested lists are not flattened twice.
            pieces.extend(
                bullet + extract_html_content(item)
                for item in node.find_all('li', recursive=False)
            )

    return ' '.join(pieces).strip()

def extract_countries_from_text(content):
    """
    Split free text on commas, newlines, bullets and the word "and", and
    return the distinct cleaned country names longer than three characters,
    in order of first appearance.
    """
    unique_names = []

    for fragment in re.split(r'[,\n•]|\band\b', content):
        name = clean_country_name(fragment)
        # Drop rejected fragments, very short names, and repeats.
        if not name or len(name) <= 3 or name in unique_names:
            continue
        unique_names.append(name)

    return unique_names

def split_content_by_flags(content):
    """
    Break flattened cell text into one candidate entry per piece.

    Text containing bullet markers ('•') is split on the bullets only;
    otherwise it is split on runs of commas and newlines. Pieces are trimmed
    and empty ones dropped. Falsy input yields an empty list.
    """
    if not content:
        return []

    if '•' in content:
        raw_pieces = content.split('•')
    else:
        raw_pieces = re.split(r'[,\n]+', content)

    trimmed = (piece.strip() for piece in raw_pieces)
    return [piece for piece in trimmed if piece]

def determine_battle_result(result_text, side1_countries, side2_countries):
    """
    Classify the infobox "Result" text for each side.

    Args:
        result_text: Flattened text of the result cell.
        side1_countries: Country names extracted for the first column.
        side2_countries: Country names extracted for the second column.

    Returns:
        A (side1, side2) tuple whose values are "winner", "loser", "tied"
        or "unknown".
    """
    if not result_text:
        return "unknown", "unknown"

    result_lower = result_text.lower()

    # Check for ties. "draw" is matched as a whole word ("draw"/"drawn") so
    # that results such as "German withdrawal" are not mistaken for a draw.
    if re.search(r'stalemate|\bdrawn?\b|inconclusive|indecisive|truce|ceasefire', result_lower):
        return "tied", "tied"

    def names_any(countries):
        # True when any of the given country names occurs in the result text.
        return any(country.lower() in result_lower for country in countries)

    mentions_victory = "victory" in result_lower

    # NOTE: the fixed demonym lists below are tied to a column regardless of
    # which side those nations actually fought on in a given battle.
    side1_victory = any(
        term in result_lower
        for term in ["allied victory", "american victory", "british victory", "french victory"]
    ) or (mentions_victory and names_any(side1_countries))

    side2_victory = any(
        term in result_lower
        for term in ["german victory", "axis victory", "japanese victory", "confederate victory"]
    ) or (mentions_victory and names_any(side2_countries))

    # Only decide when exactly one side is credited with the victory
    if side1_victory and not side2_victory:
        return "winner", "loser"
    if side2_victory and not side1_victory:
        return "loser", "winner"

    # A named side next to "defeat" is taken to be the loser
    if "defeat" in result_lower:
        if names_any(side1_countries):
            return "loser", "winner"
        if names_any(side2_countries):
            return "winner", "loser"

    # Default to unknown if we can't determine
    return "unknown", "unknown"

def extract_infobox_data(infobox, key):
    """
    Collect the data cells of the infobox section whose header mentions `key`.

    The match is a case-insensitive substring test against header-cell text.
    Returns a list of <td> elements (empty when nothing matches or when
    `infobox` is falsy).
    """
    if not infobox:
        return []

    needle = key.lower()

    def mentions_key(cell):
        return needle in cell.get_text().strip().lower()

    cells = []
    in_section = False

    # Pass 1: locate the header row, then gather cells until the next
    # header row that does not mention the key.
    for row in infobox.find_all("tr"):
        header = row.find("th")

        if in_section:
            if header and not mentions_key(header):
                break
            cells.extend(row.find_all("td"))
            continue

        if header and mentions_key(header):
            in_section = True
            # Some infoboxes put the value in the same row as the header.
            same_row_cell = row.find("td")
            if same_row_cell:
                cells.append(same_row_cell)

    if cells:
        return cells

    # Pass 2: accept a single row in which any header cell mentions the key
    # and at least two data cells are present.
    for row in infobox.find_all("tr"):
        headers = row.find_all("th")
        data_cells = row.find_all("td")
        if len(data_cells) >= 2 and any(mentions_key(h) for h in headers):
            return data_cells

    return cells

def _countries_from_cell_text(content):
    """Turn the flattened text of one belligerent cell into a de-duplicated list of country names."""
    countries = []
    if not content:
        return countries

    parts = split_content_by_flags(content)
    if parts:
        for part in parts:
            country = clean_country_name(part)
            if country and country not in countries:
                countries.append(country)
    else:
        # Nothing to split on: fall back to the looser text-based extraction.
        countries.extend(extract_countries_from_text(content))
    return countries


def _troops_from_cell_text(content):
    """Extract a troop count from the flattened text of one strength cell (None if absent)."""
    if not content:
        return None
    keywords = ['men', 'troops', 'soldiers', 'personnel', 'combatants', 'infantry', 'army']
    return extract_number(content, keywords)


def _deaths_from_cell_text(content):
    """Extract a death count from one casualty cell, falling back to general casualty figures."""
    if not content:
        return None
    keywords = ['killed', 'dead', 'deaths', 'fatalities', 'kia', 'killed in action']
    deaths = extract_number(content, keywords)
    if deaths is None:
        deaths = extract_number(content, ['casualties', 'wounded', 'injured'])
    return deaths


def _guess_sides_from_infobox_text(infobox_text):
    """Best-effort fallback: guess two opposing sides from well-known country names in the infobox text."""
    # NOTE: this is a plain substring test, so "Union" also matches inside
    # e.g. "Soviet Union".
    common_countries = [
        "United States", "Germany", "France", "United Kingdom", "Great Britain",
        "Soviet Union", "Russia", "Japan", "Italy", "China", "Spain", "Austria",
        "Confederate States", "Union", "Prussia", "Ottoman Empire", "Turkey"
    ]
    found_countries = [country for country in common_countries if country in infobox_text]
    if len(found_countries) < 2:
        return [], []

    # Common historical pairings to guess sides
    opposing_pairs = [
        (["United States", "United Kingdom", "France", "Russia", "Soviet Union"],
         ["Germany", "Japan", "Italy"]),
        (["Union"], ["Confederate States"]),
        (["United Kingdom", "Prussia", "Russia"], ["France"]),
        (["United Kingdom", "France"], ["Spain"]),
        (["Austria", "Prussia", "Russia"], ["Ottoman Empire", "Turkey"])
    ]
    for side1_candidates, side2_candidates in opposing_pairs:
        side1_matches = [c for c in found_countries if c in side1_candidates]
        side2_matches = [c for c in found_countries if c in side2_candidates]
        if side1_matches and side2_matches:
            return side1_matches, side2_matches

    # No known pairing applies: split the found names down the middle.
    middle = len(found_countries) // 2
    return found_countries[:middle], found_countries[middle:]


def scrape_battle_page(battle_id, battle_name, wiki_link, timeout=30):
    """
    Scrape an individual battle page for country participant information.

    Args:
        battle_id: Identifier copied into every returned record.
        battle_name: Name used in progress and error messages.
        wiki_link: Absolute URL, or a path resolved against BASE_URL.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single stalled connection cannot hang the whole run.

    Returns:
        A list of dicts with keys "battle_id", "country", "troops", "deaths"
        and "result", one per country found. Troop and death figures are
        per side, so every country on a side carries the same values. The
        list is empty when the page cannot be fetched, has no infobox, or an
        error occurs (errors are printed, not raised).
    """
    print(f"Scraping battle page: {battle_name} (ID: {battle_id})")

    # Check if the wiki_link is already a full URL
    if not wiki_link.startswith("http"):
        url = urljoin(BASE_URL, wiki_link)
    else:
        url = wiki_link

    participants = []

    try:
        # Fetch the page
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch page: {url} (Status: {response.status_code})")
            return participants

        # Parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the infobox (try multiple classes)
        infobox = soup.find("table", {"class": ["infobox", "vevent", "vcard"]})
        if not infobox:
            print(f"No infobox found for {battle_name} at {url}")
            return participants

        # Extract result first to determine overall battle outcome
        result_text = ""
        result_data = extract_infobox_data(infobox, "result")
        if result_data:
            result_text = extract_html_content(result_data[0])
            print(f"Result: {result_text[:100]}...")

        # Extract belligerents (side-by-side layout: first two cells are the two sides)
        belligerent_data = extract_infobox_data(infobox, "belligerent") or extract_infobox_data(infobox, "combatant")

        side1_countries = []
        side2_countries = []

        if len(belligerent_data) >= 2:
            side1_content = extract_html_content(belligerent_data[0])
            print(f"Side 1 raw content: {side1_content[:100]}...")
            side1_countries = _countries_from_cell_text(side1_content)

            side2_content = extract_html_content(belligerent_data[1])
            print(f"Side 2 raw content: {side2_content[:100]}...")
            side2_countries = _countries_from_cell_text(side2_content)

        # If no countries found, guess from well-known names in the infobox text
        if not side1_countries and not side2_countries:
            side1_countries, side2_countries = _guess_sides_from_infobox_text(infobox.get_text())

        print(f"Side 1 countries: {side1_countries}")
        print(f"Side 2 countries: {side2_countries}")

        # Extract strength information
        strength_data = extract_infobox_data(infobox, "strength")

        side1_troops = None
        side2_troops = None

        if len(strength_data) >= 2:
            side1_content = extract_html_content(strength_data[0])
            print(f"Side 1 strength raw: {side1_content[:100]}...")
            side1_troops = _troops_from_cell_text(side1_content)

            side2_content = extract_html_content(strength_data[1])
            print(f"Side 2 strength raw: {side2_content[:100]}...")
            side2_troops = _troops_from_cell_text(side2_content)

        print(f"Side 1 troops: {side1_troops}")
        print(f"Side 2 troops: {side2_troops}")

        # Extract casualty information
        casualty_data = extract_infobox_data(infobox, "casualt") or extract_infobox_data(infobox, "loss")

        side1_deaths = None
        side2_deaths = None

        if len(casualty_data) >= 2:
            side1_content = extract_html_content(casualty_data[0])
            print(f"Side 1 casualties raw: {side1_content[:100]}...")
            side1_deaths = _deaths_from_cell_text(side1_content)

            side2_content = extract_html_content(casualty_data[1])
            print(f"Side 2 casualties raw: {side2_content[:100]}...")
            side2_deaths = _deaths_from_cell_text(side2_content)

        print(f"Side 1 deaths: {side1_deaths}")
        print(f"Side 2 deaths: {side2_deaths}")

        # Determine the result for each side based on the overall result
        side1_result, side2_result = determine_battle_result(result_text, side1_countries, side2_countries)

        print(f"Side 1 result: {side1_result}")
        print(f"Side 2 result: {side2_result}")

        # Add the participants: one record per country, side 1 first
        sides = (
            (side1_countries, side1_troops, side1_deaths, side1_result),
            (side2_countries, side2_troops, side2_deaths, side2_result),
        )
        for countries, troops, deaths, outcome in sides:
            for country in countries:
                participants.append({
                    "battle_id": battle_id,
                    "country": country,
                    "troops": troops,
                    "deaths": deaths,
                    "result": outcome
                })

        print(f"Found {len(participants)} participants for {battle_name}")

        return participants

    except Exception as e:
        # Best-effort scraping: report the failure and carry on with the next battle.
        print(f"Error scraping {battle_name}: {e}")
        import traceback
        traceback.print_exc()
        return participants

def main():
    """
    Scrape participant data for every battle in the battles CSV and save it.

    Loads the battle list, scrapes each battle page that has a wiki link
    (pausing a random 1-3 seconds between requests), writes the collected
    participants to a CSV file, and prints extraction statistics plus a small
    sample. Any error is printed with a traceback rather than raised.
    """
    # Single source of truth for the output file, so the "Saved to" message
    # always names the file that was actually written.
    output_path = "battle_info.csv"

    try:
        # Load the battles from the CSV file
        battles_df = load_battles_csv()

        # Create a list to store all participants
        all_participants = []

        # Process all battles or limit for testing
        max_battles = len(battles_df)  # Change this to limit the number of battles processed
        num_battles = min(len(battles_df), max_battles)

        print(f"Processing {num_battles} battles...")

        # Count positions explicitly instead of relying on the DataFrame
        # index labels being 0..n-1.
        rows = battles_df.head(num_battles).iterrows()
        for position, (_, battle) in enumerate(rows, start=1):
            battle_id = battle["battle_id"]
            battle_name = battle["battle_name"]
            wiki_link = battle["wiki_link"]

            # Skip if missing wiki link
            if pd.isna(wiki_link) or not wiki_link:
                print(f"Skipping battle {battle_name} (ID: {battle_id}) - Missing wiki link")
                continue

            # Scrape the battle page for country information
            participants = scrape_battle_page(battle_id, battle_name, wiki_link)

            # Add participants to the overall list
            all_participants.extend(participants)

            # Add a random delay to avoid rate limiting
            time.sleep(random.uniform(1.0, 3.0))

            # Print progress every 10 battles
            if position % 10 == 0 or position == num_battles:
                print(f"Progress: {position}/{num_battles} battles processed ({(position/num_battles)*100:.1f}%)")

        # Create a DataFrame from the participants data
        if not all_participants:
            print("No participants found. Check the scraping logic.")
            return

        participants_df = pd.DataFrame(all_participants)

        # Add a unique participant_id as the primary key
        participants_df.reset_index(drop=True, inplace=True)
        participants_df['participant_id'] = participants_df.index + 1  # Start IDs from 1

        # Reorder columns to put participant_id first
        cols = ['participant_id', 'battle_id', 'country', 'troops', 'deaths', 'result']
        participants_df = participants_df[cols]

        # Save to CSV
        participants_df.to_csv(output_path, index=False)

        print(f"\n✅ Scraped {len(participants_df)} participant entries for {num_battles} battles.")
        print(f"Saved to '{output_path}'")

        # Print stats about data extraction
        total = len(participants_df)
        troops_count = participants_df['troops'].notna().sum()
        deaths_count = participants_df['deaths'].notna().sum()
        result_count = participants_df[participants_df['result'] != "unknown"].shape[0]

        print(f"\nExtraction statistics:")
        print(f"- Participants with troop numbers: {troops_count} ({troops_count/total*100:.1f}%)")
        print(f"- Participants with death counts: {deaths_count} ({deaths_count/total*100:.1f}%)")
        print(f"- Participants with known results: {result_count} ({result_count/total*100:.1f}%)")

        # Show some sample data
        print("\nSample of participant data:")
        for number, (_, row) in enumerate(participants_df.head(5).iterrows(), start=1):
            print(f"{number}. Participant ID: {row['participant_id']}, Battle ID: {row['battle_id']}, Country: {row['country']}")
            print(f"   Troops: {row['troops'] if pd.notna(row['troops']) else 'Unknown'}")
            print(f"   Deaths: {row['deaths'] if pd.notna(row['deaths']) else 'Unknown'}")
            print(f"   Result: {row['result'] if pd.notna(row['result']) else 'Unknown'}")

    except Exception as e:
        # Top-level boundary: report the failure with a traceback instead of raising.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Run the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Loaded 962 battles from battles_list.csv
Processing 962 battles...
Scraping battle page: Battle of Aachen (ID: 1)
Result: American victory...
Side 1 raw content: United States...
Side 2 raw content: Germany...
Side 1 countries: ['United States']
Side 2 countries: ['Germany']
Side 1 strength raw: 100,000 soldiers...
Side 2 strength raw: 13,000 soldiers 
 5,000 Volkssturm...
Side 1 troops: 100000
Side 2 troops: 13000
Side 1 casualties raw: 7,000+ casualties 
 including 2,000 killed...
Side 2 casualties raw: 5,000 killed (estimate), 
 5,600 captured...
Side 1 deaths: 2000
Side 2 deaths: 5000
Side 1 result: winner
Side 2 result: loser
Found 2 participants for Battle of Aachen
Scraping battle page: Battle of Abensberg (ID: 2)
Result: French victory...
Side 1 raw content: First French Empire 
  Kingdom of Bavaria 
  Württemberg...
Side 2 raw content: Austrian Empire...
Side 1 countries: ['First French Empire', 'Kingdom of Bavaria', 'Württemberg']
Side 2 countries: ['Austrian Empire']
Side 1 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
import os.path
from urllib.parse import urljoin

# Constants
# Root URL of the English Wikipedia site.
BASE_URL = "https://en.wikipedia.org"
# HTTP header dict carrying a desktop-Chrome-style User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    )
}

def load_battles_csv(filename="battles_list.csv"):
    """
    Read the battles CSV into a DataFrame and report how many rows it holds.

    Raises FileNotFoundError when `filename` is not an existing file.
    """
    if os.path.isfile(filename):
        frame = pd.read_csv(filename)
        print(f"Loaded {len(frame)} battles from {filename}")
        return frame

    raise FileNotFoundError(f"Battles CSV file not found: {filename}")

def _parse_count(token):
    """Convert a matched numeric token such as '12,000' or '1.5' to an int (None if unparseable)."""
    digits = token.replace(',', '')
    try:
        # Only go through float when there is a decimal point, so large
        # integers are not rounded.
        return int(float(digits)) if '.' in digits else int(digits)
    except (ValueError, OverflowError):
        return None


def extract_number(text, keywords=None):
    """
    Extract a count from free text, preferring numbers next to a keyword.

    Args:
        text: Text to search. Non-string input yields None.
        keywords: A keyword or list of keywords (e.g. 'troops', 'killed').
            Defaults to ['men', 'troops', 'soldiers', 'personnel'].

    Returns:
        The first number found by, in order:
          1. for each keyword in turn, the closest number in the 50
             characters before it, else the first number in the 20
             characters after it;
          2. the average of the first numeric range ("10,000-15,000");
          3. the first comma-formatted thousands number;
          4. the first number with three or more digits;
          5. the first number greater than 50;
        or None when nothing qualifies.
    """
    if not isinstance(text, str):
        return None

    text = text.replace('\xa0', ' ')  # Handle non-breaking spaces

    # Default keywords if none provided
    if keywords is None:
        keywords = ['men', 'troops', 'soldiers', 'personnel']
    elif isinstance(keywords, str):
        keywords = [keywords]

    number_pattern = r'(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?|\d+)'

    for keyword in keywords:
        # The keyword must end a word: 'men' matches "men" and "infantrymen"
        # but not the middle of "regiments". Searching `text` itself
        # (case-insensitively) keeps the match offsets valid for slicing.
        hit = re.search(re.escape(keyword) + r'(?!\w)', text, re.IGNORECASE)
        if hit is None:
            continue

        # Closest number before the keyword (within 50 characters)
        before = re.findall(number_pattern, text[max(0, hit.start() - 50):hit.start()])
        if before:
            value = _parse_count(before[-1])
            if value is not None:
                return value

        # Otherwise the first number after the keyword (within 20 characters)
        after = re.findall(number_pattern, text[hit.end():hit.end() + 20])
        if after:
            value = _parse_count(after[0])
            if value is not None:
                return value

    # If no numbers found near keywords, look for number ranges (e.g., "10,000-15,000")
    range_match = re.search(r'(\d{1,3}(?:,\d{3})+|\d+)(?:\s*[-–]\s*)(\d{1,3}(?:,\d{3})+|\d+)', text)
    if range_match:
        low = _parse_count(range_match.group(1))
        high = _parse_count(range_match.group(2))
        if low is not None and high is not None:
            # Use average of the range
            return (low + high) // 2

    # Numbers formatted as thousands (e.g., "10,000") are likely counts
    thousands_match = re.search(r'(\d{1,3},\d{3})', text)
    if thousands_match:
        value = _parse_count(thousands_match.group(1))
        if value is not None:
            return value

    # Look for any large numbers (3+ digits)
    large_match = re.search(r'(\d{3,})', text)
    if large_match:
        value = _parse_count(large_match.group(1))
        if value is not None:
            return value

    # Last resort: any number, skipping small ones that might be dates or footnotes
    for token in re.findall(r'(\d+)', text):
        value = _parse_count(token)
        if value is not None and value > 50:
            return value

    return None

def clean_country_name(text):
    """
    Normalize one belligerent entry into a country name.

    Strips parenthesised and bracketed text (footnote markers, "citation
    needed" tags), collapses whitespace, expands a few common abbreviations,
    and rejects entries that look like battle outcomes, bare numbers or dates.

    Returns the cleaned name, or None when `text` is not a string or the
    entry does not look like a country.
    """
    if not isinstance(text, str):
        return None

    # Remove text in parentheses, then anything in square brackets.
    # The bracket pass already covers "[12]" and "[citation needed]".
    cleaned = re.sub(r'\([^)]*\)', '', text)
    cleaned = re.sub(r'\[[^\]]*\]', '', cleaned)

    # Remove excess whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Expand abbreviations BEFORE the minimum-length filter below: 'US' and
    # 'UK' are only two characters long and would otherwise be discarded
    # before they could ever be replaced.
    replacements = {
        'US': 'United States',
        'U.S.': 'United States',
        'U.S.A.': 'United States',
        'USA': 'United States',
        'UK': 'United Kingdom',
        'U.K.': 'United Kingdom',
        'Soviet Union': 'Russia',
        'USSR': 'Russia',
        'U.S.S.R.': 'Russia'
    }
    if cleaned in replacements:
        return replacements[cleaned]

    # Skip short phrases that are clearly results, not countries
    result_phrases = [
        'victory', 'defeat', 'decisive', 'inconclusive', 'stalemate',
        'draw', 'tactical', 'strategic', 'pyrrhic', 'truce', 'ceasefire'
    ]
    lowered = cleaned.lower()
    if len(cleaned.split()) <= 3 and any(phrase in lowered for phrase in result_phrases):
        return None

    # Remove very short strings, bare numbers, and "<number> <word>" dates
    if len(cleaned) < 3 or cleaned.isdigit() or re.match(r'^\d+\s+\w+$', cleaned):
        return None

    return cleaned

def extract_html_content(element):
    """
    Flatten a parsed HTML element into plain text.

    None yields "", a plain string is returned stripped. For any other
    element the direct children are walked: text nodes are kept, <br> becomes
    a newline, list items become "\\n• " bullets, a fixed set of inline/block
    wrappers is recursed into, and links contribute their text. Children with
    any other tag name are ignored. Pieces are joined with single spaces.
    """
    if element is None:
        return ""
    if isinstance(element, str):
        return element.strip()

    wrapper_tags = ('b', 'strong', 'i', 'em', 'span', 'div', 'p')
    bullet = '\n• '
    pieces = []

    for node in element.children:
        tag = node.name

        # Text nodes have no tag name; keep them unless they are blank.
        if tag is None:
            stripped = node.strip()
            if stripped:
                pieces.append(stripped)
            continue

        if tag == 'br':
            pieces.append('\n')
        elif tag == 'li':
            pieces.append(bullet + extract_html_content(node))
        elif tag in wrapper_tags:
            pieces.append(extract_html_content(node))
        elif tag == 'a':
            pieces.append(node.get_text().strip())
        elif tag in ('ul', 'ol'):
            # Only direct <li> children, so nested lists are not flattened twice.
            pieces.extend(
                bullet + extract_html_content(item)
                for item in node.find_all('li', recursive=False)
            )

    return ' '.join(pieces).strip()

def extract_countries_from_text(content):
    """
    Split free text on commas, newlines, bullets and the word "and", and
    return the distinct cleaned country names longer than three characters,
    in order of first appearance.
    """
    unique_names = []

    for fragment in re.split(r'[,\n•]|\band\b', content):
        name = clean_country_name(fragment)
        # Drop rejected fragments, very short names, and repeats.
        if not name or len(name) <= 3 or name in unique_names:
            continue
        unique_names.append(name)

    return unique_names

def split_content_by_flags(content):
    """
    Break flattened cell text into one candidate entry per piece.

    Text containing bullet markers ('•') is split on the bullets only;
    otherwise it is split on runs of commas and newlines. Pieces are trimmed
    and empty ones dropped. Falsy input yields an empty list.
    """
    if not content:
        return []

    if '•' in content:
        raw_pieces = content.split('•')
    else:
        raw_pieces = re.split(r'[,\n]+', content)

    trimmed = (piece.strip() for piece in raw_pieces)
    return [piece for piece in trimmed if piece]

def determine_battle_result(result_text, side1_countries, side2_countries):
    """
    Classify the infobox "Result" text for each side.

    Args:
        result_text: Flattened text of the result cell.
        side1_countries: Country names extracted for the first column.
        side2_countries: Country names extracted for the second column.

    Returns:
        A (side1, side2) tuple whose values are "winner", "loser", "tied"
        or "unknown".
    """
    if not result_text:
        return "unknown", "unknown"

    result_lower = result_text.lower()

    # Check for ties. "draw" is matched as a whole word ("draw"/"drawn") so
    # that results such as "German withdrawal" are not mistaken for a draw.
    if re.search(r'stalemate|\bdrawn?\b|inconclusive|indecisive|truce|ceasefire', result_lower):
        return "tied", "tied"

    def names_any(countries):
        # True when any of the given country names occurs in the result text.
        return any(country.lower() in result_lower for country in countries)

    mentions_victory = "victory" in result_lower

    # NOTE: the fixed demonym lists below are tied to a column regardless of
    # which side those nations actually fought on in a given battle.
    side1_victory = any(
        term in result_lower
        for term in ["allied victory", "american victory", "british victory", "french victory"]
    ) or (mentions_victory and names_any(side1_countries))

    side2_victory = any(
        term in result_lower
        for term in ["german victory", "axis victory", "japanese victory", "confederate victory"]
    ) or (mentions_victory and names_any(side2_countries))

    # Only decide when exactly one side is credited with the victory
    if side1_victory and not side2_victory:
        return "winner", "loser"
    if side2_victory and not side1_victory:
        return "loser", "winner"

    # A named side next to "defeat" is taken to be the loser
    if "defeat" in result_lower:
        if names_any(side1_countries):
            return "loser", "winner"
        if names_any(side2_countries):
            return "winner", "loser"

    # Default to unknown if we can't determine
    return "unknown", "unknown"

def extract_infobox_data(infobox, key):
    """
    Collect the data cells of the infobox section whose header mentions `key`.

    The header match is a case-insensitive substring test. Cells are gathered
    from the header row (first cell only) and from every following row until
    a header that does not mention `key` is reached. If that yields nothing,
    the first row that has a matching header and at least two data cells is
    used instead. Returns an empty list when nothing matches.
    """
    if not infobox:
        return []

    needle = key.lower()

    def header_matches(header_cell):
        return needle in header_cell.get_text().strip().lower()

    cells = []
    in_section = False

    for tr in infobox.find_all("tr"):
        header_cell = tr.find("th")

        if in_section:
            # A header for a different key ends the section
            if header_cell and not header_matches(header_cell):
                break
            cells.extend(tr.find_all("td"))
            continue

        if header_cell and header_matches(header_cell):
            in_section = True
            # Some infoboxes keep the value in the same row as the header
            inline_cell = tr.find("td")
            if inline_cell:
                cells.append(inline_cell)

    if cells:
        return cells

    # Fallback: a single row holding the header plus two or more data cells
    for tr in infobox.find_all("tr"):
        header_cells = tr.find_all("th")
        value_cells = tr.find_all("td")
        if any(header_matches(h) for h in header_cells) and len(value_cells) >= 2:
            return value_cells

    return cells

def clean_location_text(text):
    """Strip parentheticals, footnotes and filler words from a location string.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, flags) pairs applied in order; every match is deleted
    removals = (
        (r'\([^)]*\)', 0),                 # parenthesised asides
        (r'\[\d+\]', 0),                   # numeric footnote markers
        (r'\[citation needed\]', 0),
        (r'\bnear\b|\bpresent-day\b|\bmodern-day\b|\bvicinity of\b', re.IGNORECASE),
    )

    result = text
    for pattern, flags in removals:
        result = re.sub(pattern, '', result, flags=flags)

    # Collapse whitespace runs, then drop a comma left dangling at the start
    result = re.sub(r'\s+', ' ', result).strip()
    return re.sub(r'^,\s*', '', result)

def parse_location(text):
    """
    Parse a cleaned location string into a (city, country) tuple.

    The text is split on commas and these strategies are tried in order:

    1. Two parts whose second part names a known country: (first, second).
    2. Three or more parts whose last part names a known country: the country
       is the last part; the city is the second-to-last part when it is a
       well-known major city, otherwise the first part.
    3. Any part (searched from the end) naming a known country: that part is
       the country and the city is the first *other* part, or None when there
       is no other part. Previously the same string could be returned as both
       city and country (e.g. "France" -> ("France", "France")).
    4. A single part with no country: (part, None).
    5. Several parts with no country: (first, last).

    "Names a known country" is a substring test against a fixed list.
    Empty or None input returns (None, None).
    """
    if not text:
        return None, None

    # Common countries for reference
    common_countries = (
        "Afghanistan", "Albania", "Algeria", "Argentina", "Australia", "Austria", "Bangladesh",
        "Belgium", "Brazil", "Bulgaria", "Canada", "China", "Croatia", "Czech Republic",
        "Denmark", "Egypt", "Finland", "France", "Germany", "Greece", "Hungary", "India",
        "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Japan", "Korea",
        "Malaysia", "Mexico", "Morocco", "Netherlands", "New Zealand", "Norway", "Pakistan",
        "Philippines", "Poland", "Portugal", "Romania", "Russia", "Saudi Arabia", "Serbia",
        "Singapore", "South Africa", "Spain", "Sweden", "Switzerland", "Syria", "Thailand",
        "Turkey", "Ukraine", "United Arab Emirates", "United Kingdom", "United States",
        "Vietnam", "Yemen", "Zimbabwe", "Soviet Union", "Ottoman Empire", "Prussia",
        "Holy Roman Empire", "Byzantine Empire", "Roman Empire",
    )

    # In cases like "Garibpur, Dhaka, Bangladesh", Dhaka is the better city
    common_major_cities = (
        "Dhaka", "Cairo", "Tokyo", "Moscow", "Berlin", "Paris", "London",
        "Beijing", "Delhi", "Rome", "Madrid", "Athens", "Istanbul", "Tehran",
    )

    def names_country(part):
        # The substring test also covers exact matches
        return any(name in part for name in common_countries)

    parts = [p.strip() for p in text.split(',')]

    # Strategy 1: "City, Country"
    if len(parts) == 2 and names_country(parts[1]):
        return parts[0], parts[1]

    # Strategy 2: "..., City, Country" - the last part is the country
    if len(parts) > 2 and names_country(parts[-1]):
        city = parts[-2] if parts[-2] in common_major_cities else parts[0]
        return city, parts[-1]

    # Strategy 3: any part naming a country, checked from the end because the
    # country usually comes last
    for idx in range(len(parts) - 1, -1, -1):
        if names_country(parts[idx]):
            # Never reuse the country part itself as the city
            city = next((p for i, p in enumerate(parts) if i != idx), None)
            return city, parts[idx]

    # Strategy 4: a lone part is probably a region or landmark
    if len(parts) == 1:
        return parts[0], None

    # Strategy 5: no country recognised - first part as city, last as region.
    # str.split always yields at least one part, so no further fallback is needed.
    return parts[0], parts[-1]

def extract_location_data(infobox):
    """
    Read the battle location from an infobox.

    Looks for a "place" section first and falls back to "location"; the first
    cell found is converted to text, cleaned and parsed.
    Returns a (city, country) tuple, or (None, None) when nothing usable exists.
    """
    if not infobox:
        return None, None

    # Prefer the "place" header; only query "location" when that is empty
    cells = extract_infobox_data(infobox, "place")
    if not cells:
        cells = extract_infobox_data(infobox, "location")
    if not cells:
        return None, None

    # Only the first matching cell is considered
    raw_text = extract_html_content(cells[0])
    if not raw_text:
        return None, None

    print(f"Raw location text: {raw_text}")

    parsed_city, parsed_country = parse_location(clean_location_text(raw_text))
    return parsed_city, parsed_country

# Seconds to wait for Wikipedia before giving up on a page; without a timeout
# a stalled connection would block the whole scraping run indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _side_countries(cell, side_no):
    """Return the unique, cleaned country names listed in one belligerent cell."""
    content = extract_html_content(cell)
    print(f"Side {side_no} raw content: {content[:100]}...")

    countries = []
    if not content:
        return countries

    # Prefer the bullet/comma split; fall back to free-text extraction
    parts = split_content_by_flags(content)
    if parts:
        candidates = [clean_country_name(part) for part in parts]
    else:
        candidates = extract_countries_from_text(content)

    for name in candidates:
        if name and name not in countries:
            countries.append(name)
    return countries


def _guess_sides_from_text(infobox_text):
    """Guess (side1, side2) from well-known country names found in raw infobox text."""
    known_countries = [
        "United States", "Germany", "France", "United Kingdom", "Great Britain",
        "Soviet Union", "Russia", "Japan", "Italy", "China", "Spain", "Austria",
        "Confederate States", "Union", "Prussia", "Ottoman Empire", "Turkey"
    ]
    found = [name for name in known_countries if name in infobox_text]
    if len(found) < 2:
        return [], []

    # Common historical pairings to guess sides
    opposing_pairs = [
        (["United States", "United Kingdom", "France", "Russia", "Soviet Union"],
         ["Germany", "Japan", "Italy"]),
        (["Union"], ["Confederate States"]),
        (["United Kingdom", "Prussia", "Russia"], ["France"]),
        (["United Kingdom", "France"], ["Spain"]),
        (["Austria", "Prussia", "Russia"], ["Ottoman Empire", "Turkey"])
    ]
    for side1_candidates, side2_candidates in opposing_pairs:
        side1_matches = [c for c in found if c in side1_candidates]
        side2_matches = [c for c in found if c in side2_candidates]
        if side1_matches and side2_matches:
            return side1_matches, side2_matches

    # No known pairing applies: split the found names down the middle
    middle = len(found) // 2
    return found[:middle], found[middle:]


def _side_number(cell, side_no, label, keywords, fallback_keywords=None):
    """Extract one side's figure from a strength/casualty cell, or None if absent."""
    content = extract_html_content(cell)
    print(f"Side {side_no} {label} raw: {content[:100]}...")
    if not content:
        return None

    value = extract_number(content, keywords)
    if value is None and fallback_keywords:
        value = extract_number(content, fallback_keywords)
    return value


def scrape_battle_page(battle_id, battle_name, wiki_link):
    """Scrape an individual battle page for country participant information.

    Args:
        battle_id: Identifier copied into every participant record.
        battle_name: Name used in progress and error messages.
        wiki_link: Absolute URL, or a path resolved against BASE_URL.

    Returns:
        A list of dicts, one per participating country, with the keys
        battle_id, country, troops, deaths, result, city_battle_location and
        country_battle_location. The list is empty when the page cannot be
        fetched, has no infobox, or an error occurs (errors are printed with
        a traceback rather than raised).
    """
    print(f"Scraping battle page: {battle_name} (ID: {battle_id})")

    # Check if the wiki_link is already a full URL
    url = wiki_link if wiki_link.startswith("http") else urljoin(BASE_URL, wiki_link)

    participants = []

    try:
        # A timeout makes a stalled request fail (and be reported below)
        # instead of hanging the run
        response = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            print(f"Failed to fetch page: {url} (Status: {response.status_code})")
            return participants

        soup = BeautifulSoup(response.content, "html.parser")

        # Find the infobox (try multiple classes)
        infobox = soup.find("table", {"class": ["infobox", "vevent", "vcard"]})
        if not infobox:
            print(f"No infobox found for {battle_name} at {url}")
            return participants

        # Battle location. These two names are not reused below: the original
        # fallback loop rebound `country`, overwriting the location.
        city, country = extract_location_data(infobox)
        print(f"Battle location - City: {city}, Country: {country}")

        # Overall result text
        result_text = ""
        result_data = extract_infobox_data(infobox, "result")
        if result_data:
            result_text = extract_html_content(result_data[0])
            print(f"Result: {result_text[:100]}...")

        # Belligerents: the usual layout is two side-by-side cells
        belligerent_data = (
            extract_infobox_data(infobox, "belligerent")
            or extract_infobox_data(infobox, "combatant")
        )
        side1_countries = []
        side2_countries = []
        if len(belligerent_data) >= 2:
            side1_countries = _side_countries(belligerent_data[0], 1)
            side2_countries = _side_countries(belligerent_data[1], 2)

        # If no countries found, guess from the raw infobox text
        if not side1_countries and not side2_countries:
            side1_countries, side2_countries = _guess_sides_from_text(infobox.get_text())

        print(f"Side 1 countries: {side1_countries}")
        print(f"Side 2 countries: {side2_countries}")

        # Strength
        troop_keywords = ['men', 'troops', 'soldiers', 'personnel', 'combatants', 'infantry', 'army']
        strength_data = extract_infobox_data(infobox, "strength")
        side1_troops = None
        side2_troops = None
        if len(strength_data) >= 2:
            side1_troops = _side_number(strength_data[0], 1, "strength", troop_keywords)
            side2_troops = _side_number(strength_data[1], 2, "strength", troop_keywords)

        print(f"Side 1 troops: {side1_troops}")
        print(f"Side 2 troops: {side2_troops}")

        # Casualties: death keywords first, then broader casualty keywords
        death_keywords = ['killed', 'dead', 'deaths', 'fatalities', 'kia', 'killed in action']
        other_casualty_keywords = ['casualties', 'wounded', 'injured']
        casualty_data = (
            extract_infobox_data(infobox, "casualt")
            or extract_infobox_data(infobox, "loss")
        )
        side1_deaths = None
        side2_deaths = None
        if len(casualty_data) >= 2:
            side1_deaths = _side_number(
                casualty_data[0], 1, "casualties", death_keywords, other_casualty_keywords)
            side2_deaths = _side_number(
                casualty_data[1], 2, "casualties", death_keywords, other_casualty_keywords)

        print(f"Side 1 deaths: {side1_deaths}")
        print(f"Side 2 deaths: {side2_deaths}")

        # Determine the result for each side based on the overall result
        side1_result, side2_result = determine_battle_result(
            result_text, side1_countries, side2_countries)

        print(f"Side 1 result: {side1_result}")
        print(f"Side 2 result: {side2_result}")

        # One record per country; side-level figures are shared by every
        # country on that side
        sides = (
            (side1_countries, side1_troops, side1_deaths, side1_result),
            (side2_countries, side2_troops, side2_deaths, side2_result),
        )
        for countries, troops, deaths, outcome in sides:
            for country_name in countries:
                participants.append({
                    "battle_id": battle_id,
                    "country": country_name,
                    "troops": troops,
                    "deaths": deaths,
                    "result": outcome,
                    "city_battle_location": city,
                    "country_battle_location": country
                })

        print(f"Found {len(participants)} participants for {battle_name}")

        return participants

    except Exception as e:
        # Best-effort scraping: report the failure and keep the run going
        print(f"Error scraping {battle_name}: {e}")
        import traceback
        traceback.print_exc()
        return participants

def main():
    # Entry point: loads the battles list, scrapes every battle page and
    # assembles the participant records into a DataFrame.
    # NOTE(review): this cell appears truncated in the export - the `try`
    # below has no visible `except`/`finally`, and the block ends on a bare
    # `participants_df` expression instead of the CSV write its last comment
    # announces.
    try:
        # Load the battles from the CSV file
        battles_df = load_battles_csv()
        
        # Create a list to store all participants
        all_participants = []
        
        # Process all battles or limit for testing
        max_battles = len(battles_df)  # Change this to limit the number of battles processed
        num_battles = min(len(battles_df), max_battles)
        
        print(f"Processing {num_battles} battles...")
        
        # Process each battle
        # NOTE(review): `index` is the DataFrame index label yielded by
        # iterrows(); the progress arithmetic below assumes it is a 0-based
        # integer position (a default RangeIndex) - confirm against the CSV.
        for index, battle in battles_df.head(num_battles).iterrows():
            battle_id = battle["battle_id"]
            battle_name = battle["battle_name"]
            wiki_link = battle["wiki_link"]
            
            # Skip if missing wiki link
            if pd.isna(wiki_link) or not wiki_link:
                print(f"Skipping battle {battle_name} (ID: {battle_id}) - Missing wiki link")
                continue
            
            # Scrape the battle page for country information
            participants = scrape_battle_page(battle_id, battle_name, wiki_link)
            
            # Add participants to the overall list
            all_participants.extend(participants)
            
            # Add a random delay to avoid rate limiting
            time.sleep(random.uniform(1.0, 3.0))
            
            # Print progress every 10 battles
            if (index + 1) % 10 == 0 or (index + 1) == num_battles:
                print(f"Progress: {index + 1}/{num_battles} battles processed ({((index + 1)/num_battles)*100:.1f}%)")
        
        # Create a DataFrame from the participants data
        if not all_participants:
            print("No participants found. Check the scraping logic.")
            return
        
        participants_df = pd.DataFrame(all_participants)
        
        # Add a unique participant_id as the primary key
        participants_df.reset_index(drop=True, inplace=True)
        participants_df['participant_id'] = participants_df.index + 1  # Start IDs from 1
        
        # Reorder columns to put participant_id first
        cols = ['participant_id', 'battle_id', 'country', 'troops', 'deaths', 'result', 
                'city_battle_location', 'country_battle_location']
        participants_df = participants_df[cols]
        
        # Save to CSV
        participants_df

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
import os.path
from urllib.parse import urljoin

# Constants
# Root against which relative wiki links are resolved
BASE_URL = "https://en.wikipedia.org"

# Browser-like request headers sent with every page fetch
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0",
        "(Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/58.0.3029.110",
        "Safari/537.3",
    ))
}

def load_battles_csv(filename='/Users/louis/Desktop/Coding/Github/WarCast/Data/Pre_clean/battles_list.csv'):
    """Read the battles list CSV into a DataFrame.

    Raises:
        FileNotFoundError: if `filename` is not an existing regular file.
    """
    if os.path.isfile(filename):
        frame = pd.read_csv(filename)
        row_count = len(frame)
        print(f"Loaded {row_count} battles from {filename}")
        return frame

    raise FileNotFoundError(f"Battles CSV file not found: {filename}")

def extract_number(text, keywords=None):
    """
    Extract a whole number (e.g. a troop or casualty count) from free text.

    Args:
        text: Text to search. Non-string input returns None.
        keywords: A keyword or list of lowercase keywords to anchor on.
            Defaults to ['men', 'troops', 'soldiers', 'personnel'].

    Returns:
        The extracted int, or None when nothing suitable is found.

    Search order:
        1. For each keyword present (substring match): the closest number in
           the 50 characters before it, else the first number in the 20
           characters after it.
        2. The midpoint of the first "A-B" range.
        3. The first number written with a thousands comma ("10,000").
        4. The first run of three or more digits.
        5. The first number greater than 50.

    Footnote markers such as "[2]" are removed first, so the marker in
    "10,000[2] men" is not mistaken for the number closest to the keyword.
    """
    if not isinstance(text, str):
        return None

    text = text.replace('\xa0', ' ')  # Handle non-breaking spaces
    text = re.sub(r'\[\d+\]', '', text)  # Drop footnote markers like "[2]"

    # Default keywords if none provided
    if keywords is None:
        keywords = ['men', 'troops', 'soldiers', 'personnel']
    elif isinstance(keywords, str):
        keywords = [keywords]

    number_pattern = r'(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?|\d+)'

    def to_int(num_str):
        # Every string number_pattern can match is a valid float literal once
        # the thousands commas are removed; decimals are truncated.
        return int(float(num_str.replace(',', '')))

    # Search and slice the same lowercased copy so positions always line up
    # (lowercasing does not alter digits or commas).
    lowered = text.lower()

    for keyword in keywords:
        pos = lowered.find(keyword)
        if pos == -1:
            continue

        # Numbers before the keyword (within 50 characters): use the closest
        before = re.findall(number_pattern, lowered[max(0, pos - 50):pos])
        if before:
            return to_int(before[-1])

        # Numbers after the keyword (within 20 characters): use the first
        end = pos + len(keyword)
        after = re.findall(number_pattern, lowered[end:end + 20])
        if after:
            return to_int(after[0])

    # Number ranges (e.g., "10,000-15,000"): use the midpoint of the first one
    range_matches = re.findall(
        r'(\d{1,3}(?:,\d{3})+|\d+)(?:\s*[-–]\s*)(\d{1,3}(?:,\d{3})+|\d+)', text)
    if range_matches:
        low, high = range_matches[0]
        return (int(low.replace(',', '')) + int(high.replace(',', ''))) // 2

    # Numbers formatted as thousands (e.g., "10,000") are likely troop counts
    formatted_thousands = re.findall(r'(\d{1,3},\d{3})', text)
    if formatted_thousands:
        return int(formatted_thousands[0].replace(',', ''))

    # Any large number (3+ digits)
    large_numbers = re.findall(r'(\d{3,})', text)
    if large_numbers:
        return int(large_numbers[0])

    # Last resort: any number above 50; smaller values are more likely dates
    # or footnotes than counts
    for match in re.findall(r'(\d+)', text):
        num = int(match)
        if num > 50:
            return num

    return None

def clean_country_name(text):
    """Normalise a raw belligerent fragment into a country name.

    Returns None for non-string input and for fragments that are not a
    country: short result phrases ("Decisive victory"), strings shorter than
    three characters, bare numbers, and "<number> <word>" date-like strings.

    Known abbreviations are expanded *before* the length filter. Previously
    the filter ran first, so the two-letter entries "US" and "UK" could never
    match and those names were discarded.
    """
    if not isinstance(text, str):
        return None

    # Remove parenthesised and bracketed asides; the bracket pattern also
    # covers footnote markers like "[3]" and "[citation needed]"
    cleaned = re.sub(r'\([^)]*\)', '', text)
    cleaned = re.sub(r'\[[^\]]*\]', '', cleaned)

    # Remove excess whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Expand common abbreviations (exact match only). Soviet-era names are
    # deliberately folded into "Russia".
    replacements = {
        'US': 'United States',
        'U.S.': 'United States',
        'U.S.A.': 'United States',
        'USA': 'United States',
        'UK': 'United Kingdom',
        'U.K.': 'United Kingdom',
        'Soviet Union': 'Russia',
        'USSR': 'Russia',
        'U.S.S.R.': 'Russia'
    }
    cleaned = replacements.get(cleaned, cleaned)

    # Skip phrases that are clearly results, not countries
    result_phrases = [
        'victory', 'defeat', 'decisive', 'inconclusive', 'stalemate',
        'draw', 'tactical', 'strategic', 'pyrrhic', 'truce', 'ceasefire'
    ]
    lowered = cleaned.lower()
    if len(cleaned.split()) <= 3 and any(phrase in lowered for phrase in result_phrases):
        return None

    # Remove very short strings or just numbers or dates
    if len(cleaned) < 3 or cleaned.isdigit() or re.match(r'^\d+\s+\w+$', cleaned):
        return None

    return cleaned

def extract_html_content(element):
    """Flatten an HTML element into plain text, keeping line and list structure.

    Line breaks become newlines and list items are prefixed with a bullet
    ("\\n• "). Plain strings are returned stripped; None gives "". Child tags
    outside the handled set contribute nothing.
    """
    if element is None:
        return ""

    # Plain strings need no traversal
    if isinstance(element, str):
        return element.strip()

    # Inline/container tags whose content is flattened recursively
    passthrough_tags = ('b', 'strong', 'i', 'em', 'span', 'div', 'p')
    bullet = '\n• '

    pieces = []
    for node in element.children:
        tag = node.name

        if tag is None:
            # Text node: keep it only when something remains after stripping
            stripped = node.strip()
            if stripped:
                pieces.append(stripped)
        elif tag == 'br':
            pieces.append('\n')
        elif tag == 'li':
            pieces.append(bullet + extract_html_content(node))
        elif tag in passthrough_tags:
            pieces.append(extract_html_content(node))
        elif tag == 'a':
            pieces.append(node.get_text().strip())
        elif tag in ('ul', 'ol'):
            # Only direct list items; nested lists are reached via recursion
            pieces.extend(
                bullet + extract_html_content(item)
                for item in node.find_all('li', recursive=False)
            )

    return ' '.join(pieces).strip()

def extract_countries_from_text(content):
    """Split free text on commas, newlines, bullets and the word "and", and
    return the distinct cleaned country names longer than three characters,
    in order of first appearance."""
    unique_names = []

    for fragment in re.split(r'[,\n•]|\band\b', content):
        name = clean_country_name(fragment)
        # Skip rejected fragments, repeats and very short names
        if not name or name in unique_names or len(name) <= 3:
            continue
        unique_names.append(name)

    return unique_names

def split_content_by_flags(content):
    """
    Break flattened belligerent text into one fragment per entry.

    Bulleted text (containing "•") is split on the bullets only; anything else
    is split on runs of commas and newlines. Fragments are stripped and empty
    ones dropped. Empty input returns an empty list.
    """
    if not content:
        return []

    if '•' in content:
        # List items were rendered with bullet markers
        raw_pieces = content.split('•')
    else:
        raw_pieces = re.split(r'[,\n]+', content)

    stripped_pieces = (piece.strip() for piece in raw_pieces)
    return [piece for piece in stripped_pieces if piece]

def determine_battle_result(result_text, side1_countries, side2_countries):
    """Classify a battle's outcome for each side from the infobox result text.

    Args:
        result_text: Free-form text of the infobox "Result" cell.
        side1_countries: Country names collected for the first side.
        side2_countries: Country names collected for the second side.

    Returns:
        A ``(side1_result, side2_result)`` tuple whose items are one of
        ``"winner"``, ``"loser"``, ``"tied"`` or ``"unknown"``.
    """
    if not result_text:
        return "unknown", "unknown"

    result_lower = result_text.lower()

    # Tie vocabulary is anchored at a word start so that "draw" no longer
    # fires inside "withdrawal"/"withdrew", which used to turn real victories
    # into ties. "draw" additionally needs a word end (draw/drawn/draws).
    tie_pattern = r'\b(?:stalemate|draw(?:n|s)?\b|inconclusive|indecisive|truce|ceasefire)'
    if re.search(tie_pattern, result_lower):
        return "tied", "tied"

    mentions_victory = "victory" in result_lower
    side1_named = any(country.lower() in result_lower for country in side1_countries)
    side2_named = any(country.lower() in result_lower for country in side2_countries)

    # A side wins when a well-known "<adjective> victory" phrase is present or
    # when one of its country names appears alongside the word "victory".
    side1_phrases = ["allied victory", "american victory", "british victory", "french victory"]
    side2_phrases = ["german victory", "axis victory", "japanese victory", "confederate victory"]
    side1_victory = any(term in result_lower for term in side1_phrases) or (mentions_victory and side1_named)
    side2_victory = any(term in result_lower for term in side2_phrases) or (mentions_victory and side2_named)

    # Only an unambiguous signal for exactly one side counts as a win.
    if side1_victory and not side2_victory:
        return "winner", "loser"
    if side2_victory and not side1_victory:
        return "loser", "winner"

    # "defeat" plus a named country marks that country's side as the loser;
    # side 1 is checked first.
    if "defeat" in result_lower:
        if side1_named:
            return "loser", "winner"
        if side2_named:
            return "winner", "loser"

    # Default to unknown if we can't determine
    return "unknown", "unknown"

def extract_infobox_data(infobox, key):
    """
    Return the data cells belonging to the infobox header that mentions `key`.

    Matching is a case-insensitive substring test on the header text. The
    first data cell of the header row (if any) is collected, followed by all
    data cells of subsequent rows up to the next header that does not mention
    `key`. When nothing is collected, the first row with a matching header
    and two or more data cells is returned instead; otherwise an empty list.
    """
    if not infobox:
        return []

    wanted = key.lower()
    collected = []

    # Phase 1: advance to the first header row mentioning the key
    row_iter = iter(infobox.find_all("tr"))
    for row in row_iter:
        head = row.find("th")
        if head and wanted in head.get_text().strip().lower():
            # Some infoboxes include data in the same row as the header
            same_row_cell = row.find("td")
            if same_row_cell:
                collected.append(same_row_cell)
            break

    # Phase 2: gather cells from the rows that follow (none remain if the
    # header was never found) until an unrelated header appears
    for row in row_iter:
        head = row.find("th")
        if head and wanted not in head.get_text().strip().lower():
            break
        for cell in row.find_all("td"):
            collected.append(cell)

    # Phase 3: fallback for rows shaped like "header | cell | cell"
    if not collected:
        for row in infobox.find_all("tr"):
            heads = row.find_all("th")
            cells = row.find_all("td")
            key_in_header = any(wanted in h.get_text().strip().lower() for h in heads)
            if key_in_header and len(cells) >= 2:
                collected = cells
                break

    return collected

def clean_location_text(text):
    """Tidy a raw location string: drop parenthesised notes, footnote markers
    and qualifiers such as "near" or "present-day", then normalise spacing.

    Returns "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    without_parens = re.sub(r'\([^)]*\)', '', text)

    # Footnote references and "[citation needed]" markers
    without_notes = re.sub(r'\[citation needed\]', '', re.sub(r'\[\d+\]', '', without_parens))

    # Qualifiers that are not part of the place name (case-insensitive)
    qualifier_pattern = r'\bnear\b|\bpresent-day\b|\bmodern-day\b|\bvicinity of\b'
    without_qualifiers = re.sub(qualifier_pattern, '', without_notes, flags=re.IGNORECASE)

    # Single spaces only, no surrounding whitespace
    compact = re.sub(r'\s+', ' ', without_qualifiers).strip()

    # Removing a leading qualifier can leave a comma at the start
    return re.sub(r'^,\s*', '', compact)

# Country / historical-state names used to recognise the country part of a location.
_KNOWN_COUNTRIES = (
    "Afghanistan", "Albania", "Algeria", "Argentina", "Australia", "Austria", "Bangladesh",
    "Belgium", "Brazil", "Bulgaria", "Canada", "China", "Croatia", "Czech Republic",
    "Denmark", "Egypt", "Finland", "France", "Germany", "Greece", "Hungary", "India",
    "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Japan", "Korea",
    "Malaysia", "Mexico", "Morocco", "Netherlands", "New Zealand", "Norway", "Pakistan",
    "Philippines", "Poland", "Portugal", "Romania", "Russia", "Saudi Arabia", "Serbia",
    "Singapore", "South Africa", "Spain", "Sweden", "Switzerland", "Syria", "Thailand",
    "Turkey", "Ukraine", "United Arab Emirates", "United Kingdom", "United States",
    "Vietnam", "Yemen", "Zimbabwe", "Soviet Union", "Ottoman Empire", "Prussia",
    "Holy Roman Empire", "Byzantine Empire", "Roman Empire",
)

# Cities preferred over the first segment when they directly precede the country.
_MAJOR_CITIES = frozenset({
    "Dhaka", "Cairo", "Tokyo", "Moscow", "Berlin", "Paris", "London",
    "Beijing", "Delhi", "Rome", "Madrid", "Athens", "Istanbul", "Tehran",
})

# A known name as a whole word, optionally carrying a demonym suffix so that
# "Russian Empire" or "Austrian Netherlands" still count, while an unrelated
# word that merely starts with a country name ("Indiana") does not.
_COUNTRY_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(name) for name in _KNOWN_COUNTRIES) + r")(?:n|ian|i|ese)?\b"
)


def parse_location(text):
    """
    Parse location text to extract city and country.

    The text is split on commas and empty segments are discarded. Strategies,
    in order:

    1. The last of several segments names a known country: that segment is the
       country. The city is the second-to-last segment when there are more
       than two segments and it is a well-known major city, otherwise the
       first segment.
    2. Some other segment names a known country (searched from the end): that
       segment is the country and the first segment is the city. If the first
       segment itself is nothing but the country name, the city is None rather
       than a duplicate of the country.
    3. A single segment with no known country is returned as the city.
    4. Several segments with no known country: first is the city, last is
       returned as a potential region in the country slot.

    Args:
        text: Cleaned location string.

    Returns:
        Tuple ``(city, country)``; either element may be None.
    """
    if not text:
        return None, None

    # Drop empty segments ("Aachen," or ", ,") so they are never returned
    # as a city or country.
    parts = [segment.strip() for segment in text.split(',')]
    parts = [segment for segment in parts if segment]
    if not parts:
        return None, None

    first, last = parts[0], parts[-1]

    # Strategy 1: "..., Country" - the last segment names a known country
    if len(parts) >= 2 and _COUNTRY_RE.search(last):
        # In cases like "Garibpur, Dhaka, Bangladesh", Dhaka is the city
        if len(parts) > 2 and parts[-2] in _MAJOR_CITIES:
            return parts[-2], last
        return first, last

    # Strategy 2: any segment names a known country (country usually comes last)
    for position in range(len(parts) - 1, -1, -1):
        candidate = parts[position]
        if _COUNTRY_RE.search(candidate):
            if position == 0 and _COUNTRY_RE.fullmatch(candidate):
                # The text starts with a bare country name: no city information
                return None, candidate
            return first, candidate

    # Strategy 3: a lone segment might be just a region or landmark
    if len(parts) == 1:
        return first, None

    # Strategy 4: first segment as city, last as potential region
    return first, last

def extract_location_data(infobox):
    """
    Pull the battle location out of an infobox.

    Looks up the "place" rows of the infobox, falling back to "location"
    rows, takes the text of the first matching cell, cleans it and parses it.

    Returns a (city, country) tuple; (None, None) when the infobox is missing
    or holds no usable location text.
    """
    nothing_found = (None, None)

    if not infobox:
        return nothing_found

    # Prefer the "place" rows; only consult "location" when those are empty
    cells = extract_infobox_data(infobox, "place")
    if not cells:
        cells = extract_infobox_data(infobox, "location")
    if not cells:
        return nothing_found

    # Only the first matching cell is used
    raw_text = extract_html_content(cells[0])
    if not raw_text:
        return nothing_found

    print(f"Raw location text: {raw_text}")

    # Clean first, then split into city and country
    city, country = parse_location(clean_location_text(raw_text))
    return city, country

# Seconds to wait on a single page request; without a timeout one stalled
# connection would block the whole scraping run indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30

# Keyword groups handed to extract_number().
_TROOP_KEYWORDS = ('men', 'troops', 'soldiers', 'personnel', 'combatants', 'infantry', 'army')
_DEATH_KEYWORDS = ('killed', 'dead', 'deaths', 'fatalities', 'kia', 'killed in action')
_OTHER_CASUALTY_KEYWORDS = ('casualties', 'wounded', 'injured')

# Names searched for in the raw infobox text when the belligerent cells gave nothing.
_FALLBACK_COUNTRIES = (
    "United States", "Germany", "France", "United Kingdom", "Great Britain",
    "Soviet Union", "Russia", "Japan", "Italy", "China", "Spain", "Austria",
    "Confederate States", "Union", "Prussia", "Ottoman Empire", "Turkey",
)

# Common historical pairings used to guess which found country was on which side.
_OPPOSING_PAIRS = (
    (("United States", "United Kingdom", "France", "Russia", "Soviet Union"),
     ("Germany", "Japan", "Italy")),
    (("Union",), ("Confederate States",)),
    (("United Kingdom", "Prussia", "Russia"), ("France",)),
    (("United Kingdom", "France"), ("Spain",)),
    (("Austria", "Prussia", "Russia"), ("Ottoman Empire", "Turkey")),
)


def _preview(content):
    """Return the first 100 characters of *content*, treating a falsy value as empty."""
    return (content or "")[:100]


def _countries_from_cell(cell, side_label):
    """Return the de-duplicated country names found in one belligerent cell."""
    content = extract_html_content(cell)
    print(f"{side_label} raw content: {_preview(content)}...")

    countries = []
    if not content:
        return countries

    # First try the flag-separated pieces; fall back to scanning the full text
    pieces = split_content_by_flags(content)
    if pieces:
        candidates = (clean_country_name(piece) for piece in pieces)
    else:
        candidates = extract_countries_from_text(content)

    for name in candidates:
        if name and name not in countries:
            countries.append(name)
    return countries


def _number_from_cell(cell, label, keyword_groups):
    """Return the first number extract_number() finds in a cell, trying each keyword group in order."""
    content = extract_html_content(cell)
    print(f"{label} raw: {_preview(content)}...")

    if not content:
        return None
    for keywords in keyword_groups:
        value = extract_number(content, list(keywords))
        if value is not None:
            return value
    return None


def _guess_sides_from_text(infobox_text):
    """Guess two opposing sides from well-known country names appearing in the infobox text."""
    found = [name for name in _FALLBACK_COUNTRIES if name in infobox_text]
    if len(found) < 2:
        return [], []

    for first_camp, second_camp in _OPPOSING_PAIRS:
        first_side = [name for name in found if name in first_camp]
        second_side = [name for name in found if name in second_camp]
        if first_side and second_side:
            return first_side, second_side

    # No known pairing applies: just split the found countries in half
    middle = len(found) // 2
    return found[:middle], found[middle:]


def scrape_battle_page(battle_id, battle_name, wiki_link):
    """Scrape an individual battle page for country participant information.

    Fetches the page, locates its infobox and reads location, result,
    belligerents, strength and casualties from it. Progress and intermediate
    values are printed as the page is processed.

    Args:
        battle_id: Identifier copied into every returned participant record.
        battle_name: Battle name, used only in printed messages.
        wiki_link: Full URL, or a path that is joined onto BASE_URL.

    Returns:
        A list with one dict per participating country, with keys
        "battle_id", "country", "troops", "deaths", "result",
        "city_battle_location" and "country_battle_location". The list is
        empty when the page cannot be fetched, has no infobox, or any error
        occurs while scraping (errors are printed, not raised).
    """
    print(f"Scraping battle page: {battle_name} (ID: {battle_id})")

    # Check if the wiki_link is already a full URL
    url = wiki_link if wiki_link.startswith("http") else urljoin(BASE_URL, wiki_link)

    participants = []

    try:
        # Fetch the page
        response = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            print(f"Failed to fetch page: {url} (Status: {response.status_code})")
            return participants

        # Parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the infobox (try multiple classes)
        infobox = soup.find("table", {"class": ["infobox", "vevent", "vcard"]})
        if not infobox:
            print(f"No infobox found for {battle_name} at {url}")
            return participants

        # Extract location information
        city, country = extract_location_data(infobox)
        print(f"Battle location - City: {city}, Country: {country}")

        # Extract result first to determine overall battle outcome
        result_text = ""
        result_data = extract_infobox_data(infobox, "result")
        if result_data:
            # Fall back to "" so the slice below cannot fail on a missing value
            result_text = extract_html_content(result_data[0]) or ""
            print(f"Result: {result_text[:100]}...")

        # Extract belligerents (side-by-side layout with two columns)
        belligerent_data = extract_infobox_data(infobox, "belligerent") or extract_infobox_data(infobox, "combatant")

        side1_countries = []
        side2_countries = []
        if len(belligerent_data) >= 2:
            side1_countries = _countries_from_cell(belligerent_data[0], "Side 1")
            side2_countries = _countries_from_cell(belligerent_data[1], "Side 2")

        # If no countries found, try harder by scanning the whole infobox text
        if not side1_countries and not side2_countries:
            side1_countries, side2_countries = _guess_sides_from_text(infobox.get_text())

        print(f"Side 1 countries: {side1_countries}")
        print(f"Side 2 countries: {side2_countries}")

        # Extract strength information
        strength_data = extract_infobox_data(infobox, "strength")

        side1_troops = None
        side2_troops = None
        if len(strength_data) >= 2:
            troop_groups = (_TROOP_KEYWORDS,)
            side1_troops = _number_from_cell(strength_data[0], "Side 1 strength", troop_groups)
            side2_troops = _number_from_cell(strength_data[1], "Side 2 strength", troop_groups)

        print(f"Side 1 troops: {side1_troops}")
        print(f"Side 2 troops: {side2_troops}")

        # Extract casualty information
        casualty_data = extract_infobox_data(infobox, "casualt") or extract_infobox_data(infobox, "loss")

        side1_deaths = None
        side2_deaths = None
        if len(casualty_data) >= 2:
            # Prefer explicit death counts; otherwise use other casualty numbers
            death_groups = (_DEATH_KEYWORDS, _OTHER_CASUALTY_KEYWORDS)
            side1_deaths = _number_from_cell(casualty_data[0], "Side 1 casualties", death_groups)
            side2_deaths = _number_from_cell(casualty_data[1], "Side 2 casualties", death_groups)

        print(f"Side 1 deaths: {side1_deaths}")
        print(f"Side 2 deaths: {side2_deaths}")

        # Determine the result for each side based on the overall result
        side1_result, side2_result = determine_battle_result(result_text, side1_countries, side2_countries)

        print(f"Side 1 result: {side1_result}")
        print(f"Side 2 result: {side2_result}")

        # Add the participants with the location columns, side 1 first
        sides = (
            (side1_countries, side1_troops, side1_deaths, side1_result),
            (side2_countries, side2_troops, side2_deaths, side2_result),
        )
        for countries, troops, deaths, outcome in sides:
            for country_name in countries:
                participants.append({
                    "battle_id": battle_id,
                    "country": country_name,
                    "troops": troops,
                    "deaths": deaths,
                    "result": outcome,
                    "city_battle_location": city,
                    "country_battle_location": country
                })

        print(f"Found {len(participants)} participants for {battle_name}")

        return participants

    except Exception as e:
        # Best-effort scraper: one bad page must not abort the whole run
        print(f"Error scraping {battle_name}: {e}")
        import traceback
        traceback.print_exc()
        return participants

def main():
    """Scrape participant data for every battle in the CSV and save it.

    Loads the battles via load_battles_csv(), scrapes each battle page that
    has a wiki link, writes all participant records to 'battle_info.csv' and
    prints extraction statistics plus a small sample. Any exception is
    printed with its traceback instead of being raised.
    """
    try:
        # Load the battles from the CSV file
        battles_df = load_battles_csv()

        # Create a list to store all participants
        all_participants = []

        # Process all battles or limit for testing
        max_battles = len(battles_df)  # Change this to limit the number of battles processed
        num_battles = min(len(battles_df), max_battles)

        print(f"Processing {num_battles} battles...")

        # Count positions ourselves: the DataFrame index labels need not be 0..n-1
        selected_rows = battles_df.head(num_battles).iterrows()
        for position, (_, battle) in enumerate(selected_rows, start=1):
            battle_id = battle["battle_id"]
            battle_name = battle["battle_name"]
            wiki_link = battle["wiki_link"]

            if pd.isna(wiki_link) or not wiki_link:
                # Skip if missing wiki link
                print(f"Skipping battle {battle_name} (ID: {battle_id}) - Missing wiki link")
            else:
                # Scrape the battle page for country information
                participants = scrape_battle_page(battle_id, battle_name, wiki_link)

                # Add participants to the overall list
                all_participants.extend(participants)

                # Add a random delay to avoid rate limiting
                time.sleep(random.uniform(1.0, 3.0))

            # Print progress every 10 battles, skipped battles included
            if position % 10 == 0 or position == num_battles:
                print(f"Progress: {position}/{num_battles} battles processed ({(position/num_battles)*100:.1f}%)")

        # Create a DataFrame from the participants data
        if not all_participants:
            print("No participants found. Check the scraping logic.")
            return

        participants_df = pd.DataFrame(all_participants)

        # Add a unique participant_id as the primary key
        participants_df.reset_index(drop=True, inplace=True)
        participants_df['participant_id'] = participants_df.index + 1  # Start IDs from 1

        # Reorder columns to put participant_id first
        cols = ['participant_id', 'battle_id', 'country', 'troops', 'deaths', 'result',
                'city_battle_location', 'country_battle_location']
        participants_df = participants_df[cols]

        # Save to CSV
        participants_df.to_csv("battle_info.csv", index=False)

        print(f"\n✅ Scraped {len(participants_df)} participant entries for {num_battles} battles.")
        print("Saved to 'battle_info.csv'")

        # Print stats about data extraction
        participant_total = len(participants_df)
        troops_count = participants_df['troops'].notna().sum()
        deaths_count = participants_df['deaths'].notna().sum()
        result_count = participants_df[participants_df['result'] != "unknown"].shape[0]

        # Location is stored identically on every participant of a battle, so
        # count it once per battle to match the "Battles with ..." labels.
        per_battle = participants_df.drop_duplicates(subset='battle_id')
        battle_total = len(per_battle)
        city_count = per_battle['city_battle_location'].notna().sum()
        country_count = per_battle['country_battle_location'].notna().sum()

        print("\nExtraction statistics:")
        print(f"- Participants with troop numbers: {troops_count} ({troops_count/participant_total*100:.1f}%)")
        print(f"- Participants with death counts: {deaths_count} ({deaths_count/participant_total*100:.1f}%)")
        print(f"- Participants with known results: {result_count} ({result_count/participant_total*100:.1f}%)")
        print(f"- Battles with city location: {city_count} ({city_count/battle_total*100:.1f}%)")
        print(f"- Battles with country location: {country_count} ({country_count/battle_total*100:.1f}%)")

        # Show some sample data
        print("\nSample of participant data:")
        for i, row in participants_df.head(5).iterrows():
            print(f"{i+1}. Participant ID: {row['participant_id']}, Battle ID: {row['battle_id']}, Country: {row['country']}")
            print(f"   Troops: {row['troops'] if pd.notna(row['troops']) else 'Unknown'}")
            print(f"   Deaths: {row['deaths'] if pd.notna(row['deaths']) else 'Unknown'}")
            print(f"   Result: {row['result'] if pd.notna(row['result']) else 'Unknown'}")
            print(f"   Battle Location: {row['city_battle_location'] if pd.notna(row['city_battle_location']) else 'Unknown'}, "
                  f"{row['country_battle_location'] if pd.notna(row['country_battle_location']) else 'Unknown'}")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Entry point: run the scraper only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded 962 battles from /Users/louis/Desktop/Coding/Github/WarCast/Data/Pre_clean/battles_list.csv
Processing 962 battles...
Scraping battle page: Battle of Aachen (ID: 1)
Raw location text: Aachen , Germany 50°46′35″N 06°05′00″E﻿ / ﻿50.77639°N 6.08333°E﻿ / 50.77639; 6.08333
Battle location - City: Aachen, Country: Germany 50°46′35″N 06°05′00″E﻿ / ﻿50.77639°N 6.08333°E﻿ / 50.77639; 6.08333
Result: American victory...
Side 1 raw content: United States...
Side 2 raw content: Germany...
Side 1 countries: ['United States']
Side 2 countries: ['Germany']
Side 1 strength raw: 100,000 soldiers...
Side 2 strength raw: 13,000 soldiers 
 5,000 Volkssturm...
Side 1 troops: 100000
Side 2 troops: 13000
Side 1 casualties raw: 7,000+ casualties 
 including 2,000 killed...
Side 2 casualties raw: 5,000 killed (estimate), 
 5,600 captured...
Side 1 deaths: 2000
Side 2 deaths: 5000
Side 1 result: winner
Side 2 result: loser
Found 2 participants for Battle of Aachen
Scraping battle page: Battle of Abensber