# Limpieza

## .xls a .csv
Los datos del Mineduc se exportan como archivos .xls, y cada archivo contiene la información repartida en diferentes 'sheets'. Por eso conservamos los datos crudos almacenados y generamos archivos .csv identificando el 'sheet' correcto que contiene la información.

In [15]:
import pandas as pd
from pathlib import Path
from collections import defaultdict
import re

def _normalize_column_name(name):
    """Return *name* stripped and upper-cased if it is a string, else unchanged."""
    return name.strip().upper() if isinstance(name, str) else name


def parse_html_excel_file(file_path):
    """Extract the table holding the required headers from an HTML file saved as .xls.

    Every table that ``pandas.read_html`` finds in the file is inspected in
    order. A table qualifies when CODIGO, DISTRITO, DEPARTAMENTO and MUNICIPIO
    all appear (ignoring case and surrounding whitespace) either in its column
    labels or in its first data row; in the latter case that row is promoted
    to be the header. The first qualifying table wins.

    String column labels of the returned table are normalized to stripped
    upper case, so a table matched through a header such as ``departamento``
    can still be addressed as ``'DEPARTAMENTO'`` by the caller.

    Args:
        file_path: Path (``str`` or ``Path``) of the file to parse.

    Returns:
        On success, a dict with ``'success': True``, ``'data'`` (the cleaned
        DataFrame with fully-empty rows removed), ``'table_index'`` (0-based
        position of the table in the file) and ``'total_tables'``.
        On failure, a dict with ``'success': False`` and ``'error'`` (message).
        Exceptions are never propagated; they are reported through ``'error'``.
    """

    file_path = Path(file_path)  # Accept both str and Path input
    print(f"\n📄 Processing: {file_path.name}")

    try:
        # Read all tables from HTML
        tables = pd.read_html(str(file_path), encoding='utf-8')

        if not tables:
            return {'success': False, 'error': 'No tables found in HTML'}

        print(f"  Found {len(tables)} tables")

        # Look for table with required headers
        required_headers = {'CODIGO', 'DISTRITO', 'DEPARTAMENTO', 'MUNICIPIO'}
        target_table = None
        target_index = None

        for i, df in enumerate(tables):
            print(f"  Table {i+1}: {df.shape[0]} rows, {df.shape[1]} columns")
            print(f"    Columns: {list(df.columns)}")

            # Case 1: the headers are already the column labels
            column_labels = {str(col).upper().strip() for col in df.columns}
            if required_headers <= column_labels:
                # Copy so the relabelling below does not touch the parsed original
                target_table = df.copy()
                target_index = i
                print(f"    ✅ Found target table!")
                break

            # Case 2: the headers were parsed as the first data row
            if len(df) > 0:
                first_row_labels = {str(cell).upper().strip() for cell in df.iloc[0]}
                if required_headers <= first_row_labels:
                    promoted = df.iloc[1:].reset_index(drop=True)
                    # A plain list avoids carrying the row label over as the
                    # name of the columns index.
                    promoted.columns = list(df.iloc[0])
                    target_table = promoted
                    target_index = i
                    print(f"    ✅ Found target table (headers in first row)!")
                    break

        if target_table is None:
            return {
                'success': False,
                'error': f'No table found with required headers. Available columns: {[list(t.columns) for t in tables]}'
            }

        # The match above is case/whitespace-insensitive, so make the labels
        # agree with it; otherwise lookups by the canonical name would fail.
        target_table.columns = [_normalize_column_name(col) for col in target_table.columns]

        # Clean the data
        target_table = target_table.dropna(how='all')  # Remove empty rows

        print(f"  📊 Target table details:")
        print(f"    Shape: {target_table.shape}")
        print(f"    Columns: {list(target_table.columns)}")
        print(f"    Sample data:")
        print(target_table.head())

        return {
            'success': True,
            'data': target_table,
            'table_index': target_index,
            'total_tables': len(tables)
        }

    except Exception as e:
        # Deliberate catch-all: callers expect an error dict, never an exception
        return {'success': False, 'error': str(e)}

def sanitize_filename(text):
    """Build a filename-safe token from *text*.

    The text is lower-cased, each space becomes an underscore, and every
    character that is not a word character, underscore or dot is dropped.
    """
    # Splitting on a literal space keeps empty pieces, so runs of spaces
    # turn into runs of underscores.
    underscored = '_'.join(text.lower().split(' '))
    disallowed = re.compile(r'[^\w_.]')
    return disallowed.sub('', underscored)

def process_html_files_directory(input_dir, output_dir):
    """Split every ``*.xls`` file in *input_dir* into one CSV per DEPARTAMENTO.

    Each file is parsed with ``parse_html_excel_file``; the resulting table is
    grouped by its ``DEPARTAMENTO`` column and every group is written to
    ``<output_dir>/datos_<sanitized departamento>.csv``. A summary of
    successful files, failed files and CSV name collisions is printed.

    A CSV written earlier in the run is overwritten when a later group maps to
    the same file name; such collisions are listed in the summary.

    Args:
        input_dir: Directory containing the ``.xls`` files (not recursive).
        output_dir: Directory for the CSV files; created if missing.

    Returns:
        None. Results are reported through printed output and written files.
    """

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Find all .xls files (which are actually HTML). Sorted because glob order
    # is unspecified and the order decides which file wins on a name collision.
    files = sorted(input_path.glob("*.xls"))

    print(f"Found {len(files)} .xls files to process")

    successful_files = []
    failed_files = []
    # Keyed by CSV file name: that is the unit that actually gets overwritten,
    # including when two differently-spelled names sanitize to the same file.
    all_departamentos = defaultdict(list)

    for file_path in files:
        result = parse_html_excel_file(file_path)

        error = None if result['success'] else result['error']
        if error is None and 'DEPARTAMENTO' not in result['data'].columns:
            # Record as a failure instead of letting groupby abort the batch
            error = "Column 'DEPARTAMENTO' not found in parsed table"

        if error is not None:
            failed_files.append({
                'file': file_path.name,
                'error': error
            })
            print(f"  ❌ FAILED - {error}")
            continue

        df = result['data']

        # Group by DEPARTAMENTO and save CSVs
        departamentos = []
        for departamento, group in df.groupby('DEPARTAMENTO'):
            # str() guards against non-string values in the column
            filename = f"datos_{sanitize_filename(str(departamento))}.csv"

            output_file = output_path / filename
            group.to_csv(output_file, index=False)

            departamentos.append({
                'name': departamento,
                'filename': filename,
                'rows': len(group)
            })

            # Track for duplicates
            all_departamentos[filename].append({
                'departamento': departamento,
                'source_file': file_path.name,
                'csv_file': filename,
                'rows': len(group)
            })

        successful_files.append({
            'file': file_path.name,
            'departamentos': departamentos,
            'total_rows': len(df)
        })

        print(f"  ✅ SUCCESS - Created {len(departamentos)} CSV files")
        for dept in departamentos:
            print(f"    - {dept['filename']} ({dept['rows']} rows)")

    # Print summary
    print("\n" + "="*60)
    print("PROCESSING SUMMARY")
    print("="*60)

    print(f"\nTotal files: {len(files)}")
    print(f"✅ Successful: {len(successful_files)}")
    print(f"❌ Failed: {len(failed_files)}")

    if successful_files:
        print(f"\n✅ SUCCESSFUL FILES:")
        for item in successful_files:
            print(f"  - {item['file']}: {len(item['departamentos'])} departments, {item['total_rows']} total rows")

    if failed_files:
        print(f"\n❌ FAILED FILES:")
        for item in failed_files:
            print(f"  - {item['file']}: {item['error']}")

    # Check for duplicates: any CSV written more than once lost earlier data
    duplicates = {name: sources for name, sources in all_departamentos.items() if len(sources) > 1}
    if duplicates:
        print(f"\n⚠️ DUPLICATE DEPARTAMENTOS:")
        for csv_name, sources in duplicates.items():
            print(f"  {csv_name}: written {len(sources)} times, only the last write was kept")
            for source in sources:
                print(f"    - {source['departamento']} from {source['source_file']} ({source['rows']} rows)")
    else:
        print("\n✅ No duplicate departamentos found")

# Smoke-test the parser on one known file; the full directory is only
# processed when that single file parses successfully.
test_result = parse_html_excel_file("data/raw/establecimiento.xls")
if not test_result['success']:
    print(f"❌ Single file test failed: {test_result['error']}")
else:
    print("\n🎉 Single file test successful!")

    # Banner, then run the whole batch
    print("\n" + "="*60, "PROCESSING ALL FILES", "="*60, sep="\n")
    process_html_files_directory("data/raw", "data/csv")


📄 Processing: establecimiento.xls
  Found 10 tables
  Table 1: 4 rows, 4 columns
    Columns: [0, 1, 2, 3]
  Table 2: 1 rows, 4 columns
    Columns: [0, 1, 2, 3]
  Table 3: 1 rows, 3 columns
    Columns: [0, 1, 2]
  Table 4: 2 rows, 3 columns
    Columns: [0, 1, 2]
  Table 5: 1 rows, 1 columns
    Columns: [0]
  Table 6: 1 rows, 1 columns
    Columns: [0]
  Table 7: 1 rows, 1 columns
    Columns: [0]
  Table 8: 10 rows, 2 columns
    Columns: [0, 1]
  Table 9: 1 rows, 1 columns
    Columns: [0]
  Table 10: 296 rows, 17 columns
    Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    ✅ Found target table (headers in first row)!
  📊 Target table details:
    Shape: (294, 17)
    Columns: ['CODIGO', 'DISTRITO', 'DEPARTAMENTO', 'MUNICIPIO', 'ESTABLECIMIENTO', 'DIRECCION', 'TELEFONO', 'SUPERVISOR', 'DIRECTOR', 'NIVEL', 'SECTOR', 'AREA', 'STATUS', 'MODALIDAD', 'JORNADA', 'PLAN', 'DEPARTAMENTAL']
    Sample data:
0         CODIGO DISTRITO  DEPARTAMENTO MUNICIPIO  \
0  16-0