In [None]:
# Task A - Table Scraping Implementation
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

_FOOTNOTE_RE = re.compile(r'\[.*?\]')  # bracketed markers such as "[1]" or "[a]"


def _clean_cell_text(cell):
    """Return a cell's text with footnote markers removed and whitespace collapsed."""
    without_footnotes = _FOOTNOTE_RE.sub('', cell.get_text())
    # split()/join() trims both ends and squeezes inner runs (including
    # newlines) down to a single space.
    return ' '.join(without_footnotes.split())


def scrape_wikipedia_table(url, table_index=0, timeout=30):
    """
    Scrape one ``wikitable`` from a web page and return it as a DataFrame.

    Args:
        url: Address of the page to download.
        table_index: Position of the wanted table among the page's
            ``<table class="wikitable">`` elements. 0 is the first table;
            negative values count from the end, as with list indexing.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        A DataFrame of strings. Column names come from the cells of the
        table's first ``<tr>``; every later ``<tr>`` that contains at least
        one cell becomes a row. Bracketed footnote markers are removed from
        headers and cells alike.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page contains no ``wikitable``.
        IndexError: ``table_index`` does not refer to an existing table.
    """
    # Named apart from the table's column names, which are collected below.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=request_headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table', class_='wikitable')
    if not tables:
        raise ValueError("No wikitable found on the page")

    if not -len(tables) <= table_index < len(tables):
        raise IndexError(
            f"table_index {table_index} is out of range: "
            f"the page has {len(tables)} wikitable(s)"
        )
    table = tables[table_index]

    table_rows = table.find_all('tr')

    # The first <tr> supplies the column names.
    column_names = []
    if table_rows:
        column_names = [
            _clean_cell_text(cell)
            for cell in table_rows[0].find_all(['th', 'td'])
        ]

    # Every remaining <tr> is a data row; rows without any cell are skipped.
    rows = []
    for tr in table_rows[1:]:
        row = [_clean_cell_text(cell) for cell in tr.find_all(['td', 'th'])]
        if row:
            rows.append(row)

    # Rows may be shorter or longer than the header row (for example when
    # cells use rowspan/colspan). pandas rejects data whose width differs from
    # the number of column names, so bring everything to one common width:
    # short rows are padded with '' and missing column names are generated.
    # NOTE: padding is appended at the end; spanned cells are not repositioned.
    width = max([len(column_names)] + [len(row) for row in rows])
    column_names += [
        f"column_{position}"
        for position in range(len(column_names) + 1, width + 1)
    ]
    rows = [row + [''] * (width - len(row)) for row in rows]

    return pd.DataFrame(rows, columns=column_names)

def clean_and_normalize_data(df):
    """
    Drop empty rows and columns, then replace remaining gaps with ''.

    A cell counts as empty when it is missing (NaN/None) or is a string that
    contains nothing but whitespace. Rows in which every cell is empty are
    removed, and so are columns in which every cell is empty. Missing values
    that remain afterwards are replaced with the empty string, so the result
    contains no NaN.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame. Surviving rows keep their original index labels.
    """
    # Nothing to inspect in a frame without rows or without columns; only the
    # NaN-based handling applies, which keeps the column names intact.
    if df.empty:
        return df.dropna(how='all').fillna('')

    def is_blank_text(value):
        return isinstance(value, str) and not value.strip()

    # Boolean frame, True where a cell is missing or whitespace-only. Cells
    # holding '' are not NaN, so isna() alone would never flag them.
    blank = df.isna() | df.apply(lambda column: column.map(is_blank_text))

    keep_rows = ~blank.all(axis=1).to_numpy()
    keep_cols = ~blank.all(axis=0).to_numpy()

    # Positional selection so duplicate column names cannot confuse it.
    df = df.iloc[keep_rows]
    if keep_rows.any():
        # With no surviving row every column is trivially empty; keep the
        # column names in that case instead of returning a 0x0 frame.
        df = df.iloc[:, keep_cols]

    return df.fillna('')

def save_data(df, csv_filename='table_data.csv', json_filename='table_data.json'):
    """
    Write the DataFrame to disk twice: once as CSV and once as JSON.

    The CSV file omits the index. The JSON file is UTF-8 encoded and holds an
    array with one object per row, keyed by column name, indented by two
    spaces; non-ASCII characters are written as-is rather than escaped.

    Prints a confirmation naming both files and returns ``df`` itself.
    """
    # CSV first, without the index column
    df.to_csv(csv_filename, index=False)

    # JSON second: one dictionary per row
    records = df.to_dict(orient='records')
    with open(json_filename, mode='w', encoding='utf-8') as json_file:
        json.dump(records, json_file, ensure_ascii=False, indent=2)

    print(f"Data saved to {csv_filename} and {json_filename}")
    return df

# Demo execution: runs only when this file is executed directly, not when it
# is imported. Scrapes a table, cleans it, saves it as CSV and JSON under the
# default file names of save_data, and prints the first three records.
if __name__ == "__main__":
    # Wikipedia page with programming languages table
    url = "https://en.wikipedia.org/wiki/List_of_programming_languages"

    try:
        # Scrape the table selected by table_index=0
        print("Scraping Wikipedia table...")
        df = scrape_wikipedia_table(url, table_index=0)

        print(f"Table scraped successfully. Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        # Clean and normalize
        df_clean = clean_and_normalize_data(df)

        # Save data (save_data returns the frame it was given)
        final_df = save_data(df_clean)

        print("\nFirst 3 records:")
        print(final_df.head(3).to_string(index=False))

    except Exception as e:
        # Any exception from the steps above (download, parsing, cleaning or
        # saving) lands here; it is reported and the run continues.
        print(f"Error: {e}")
        print("Attempting to use backup table...")

        # Retry the whole pipeline against a second page. The shape/columns
        # summary printed for the primary page is not repeated here.
        backup_url = "https://en.wikipedia.org/wiki/Comparison_of_programming_languages"
        try:
            df = scrape_wikipedia_table(backup_url, table_index=0)
            df_clean = clean_and_normalize_data(df)
            final_df = save_data(df_clean)
            print("\nFirst 3 records from backup table:")
            print(final_df.head(3).to_string(index=False))
        except Exception as backup_error:
            # Both attempts failed: report the second error and stop without
            # re-raising.
            print(f"Backup also failed: {backup_error}")