In [2]:
import requests
import json
import re

def _clean_inner_json(raw):
    """Return *raw* with the heuristic clean-up applied before parsing.

    Literal newlines (plus any whitespace following them) are collapsed to a
    single space, backslash-escaped double quotes are unescaped, and the
    two-character sequence backslash + ``n`` is replaced by a space.
    """
    collapsed = re.sub(r'\n\s*', ' ', raw.strip())
    return collapsed.replace('\\"', '"').replace('\\n', ' ')


def _parse_inner_json(raw):
    """Parse the JSON document embedded in a record's ``translation`` string.

    The cleaned form of *raw* is tried first. If that fails, the untouched
    text is parsed with ``strict=False`` (which tolerates raw control
    characters such as newlines inside strings). The fallback matters because
    the clean-up unescapes ``\\"`` unconditionally, which breaks documents
    that were already valid JSON containing escaped quotes.

    Raises:
        json.JSONDecodeError: if neither attempt parses. The error raised is
            the one from the cleaned attempt, so ``.doc`` holds the cleaned
            text.
    """
    try:
        return json.loads(_clean_inner_json(raw))
    except json.JSONDecodeError as cleaned_error:
        try:
            return json.loads(raw, strict=False)
        except json.JSONDecodeError:
            raise cleaned_error from None


def fix_json_file(url, output_filename, timeout=30.0):
    """Download a JSON file, repair placeholder records, and save the result.

    A record is treated as a placeholder when it is a dict whose
    ``translation_notes`` equals ``["Raw response - JSON parsing failed"]``,
    whose ``untranslatable_terms`` is an empty list and whose ``confidence``
    equals ``0.7``. For such records the ``translation`` string is parsed as
    JSON; if the result is an object holding all four expected fields, those
    values overwrite the record's own. Records that cannot be repaired are
    left unchanged and listed in a summary printed at the end.

    Args:
        url: Address of the JSON document; its top level is iterated with
            ``.items()``.
        output_filename: Path the (partially) repaired document is written to.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block forever.

    Returns:
        None. Progress is printed; nothing is written when the HTTP status is
        not 200.
    """
    expected_fields = ("translation", "translation_notes", "untranslatable_terms", "confidence")

    # Fetch the JSON content
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return

    data = response.json()

    fixed_count = 0
    failed_records = []

    for key, record in data.items():
        is_placeholder = (
            isinstance(record, dict)
            and record.get("translation_notes") == ["Raw response - JSON parsing failed"]
            and record.get("untranslatable_terms") == []
            and record.get("confidence") == 0.7
        )
        if not is_placeholder:
            continue

        try:
            inner = _parse_inner_json(record["translation"])

            if not isinstance(inner, dict):
                # A list/number/string parses fine but has no fields to copy;
                # report it once instead of failing later on ``inner.keys()``.
                failed_records.append((key, "Inner JSON is not an object"))
                print(f"Inner JSON for record {key} is not an object: {type(inner).__name__}")
            elif all(field in inner for field in expected_fields):
                # Replace the placeholder values with the recovered ones
                for field in expected_fields:
                    record[field] = inner[field]
                fixed_count += 1
                print(f"Fixed record: {key}")
            else:
                failed_records.append((key, "Missing expected fields in inner JSON"))
                print(f"Missing fields in inner JSON for record {key}: {inner.keys()}")

        except json.JSONDecodeError as e:
            failed_records.append((key, f"JSON parsing error: {str(e)}"))
            # e.doc is the cleaned text that was handed to the parser
            print(f"Failed to parse inner JSON for record {key}: {e.doc[:100]}... (Error: {str(e)})")
        except Exception as e:
            # Best-effort: one malformed record (e.g. a missing or non-string
            # "translation") must not abort the whole run.
            failed_records.append((key, f"Unexpected error: {str(e)}"))
            print(f"Unexpected error for record {key}: {str(e)}")

    print(f"Fixed {fixed_count} records in {url}")
    if failed_records:
        print("\nFailed records:")
        for key, error in failed_records:
            print(f"  {key}: {error}")

    # Save the fixed JSON
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

# Repair only the LLM translations file and write the result next to the notebook
fix_json_file(
    url="https://raw.githubusercontent.com/lokalkosmos/Lesprit/refs/heads/main/montesquieu_translations_llm.json",
    output_filename="montesquieu_translations_llm_fixed.json",
)

Fixed record: French_1748_ch2
Fixed record: French_1750_ch2
Fixed record: French_1758_ch2
Fixed record: French_1784_ch2
Fixed record: French_1803_ch2
Fixed record: Italian_1750_ch2
Fixed record: Italian_1777_ch2
Fixed record: German_1753_ch2
Fixed record: German_1799_ch2
Fixed record: Polish_1777_ch2
Fixed record: French_1750_ch3
Fixed record: French_1758_ch3
Fixed record: French_1784_ch3
Fixed record: French_1803_ch3
Fixed record: Italian_1750_ch3
Fixed record: Italian_1777_ch3
Fixed record: German_1753_ch3
Fixed record: German_1799_ch3
Fixed record: Polish_1777_ch3
Fixed record: French_1803_ch4
Fixed record: Italian_1750_ch4
Fixed record: Italian_1777_ch4
Fixed record: French_1748_ch5
Fixed record: French_1750_ch5
Fixed record: French_1758_ch5
Fixed record: French_1784_ch5
Fixed record: French_1803_ch5
Fixed record: Italian_1750_ch5
Fixed record: Italian_1777_ch5
Fixed record: German_1753_ch5
Fixed record: German_1799_ch5
Fixed record: Polish_1777_ch5
Fixed record: Italian_1750_ch6
F