# Jugend musiziert Data Analysis

This notebook analyzes the scraped Jugend musiziert tournament data, using Polars for efficient data processing and exploration.

In [None]:
# Import Required Libraries
import polars as pl
import json
import os
from pathlib import Path

# Display settings: let Polars print up to 100 characters of each string value
pl.Config.set_fmt_str_lengths(100)
print("Polars version: " + pl.__version__)

## Load Scraped Data

Load the JSON data from the scraper output.

In [None]:
# Load the scraped JSON data.
#
# On success `raw_data` holds the parsed JSON document. When the file is
# missing, `raw_data` is deliberately left undefined: the later cells check
# for the name with `'raw_data' in dir()` before touching it.
data_path = Path("jugend_musiziert_data.json")

try:
    # Open directly instead of exists()-then-open, so the file cannot
    # disappear between the check and the read.
    with open(data_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
except FileNotFoundError:
    print(f"✗ Data file not found at {data_path}")
    print("  Please run the scraper first: uv run python scraper/scraper.py")
else:
    print(f"✓ Loaded data from {data_path}")
    print(f"  Top-level keys: {list(raw_data.keys())}")

## Inspect Data Structure

Examine the structure and content of the loaded data.

In [None]:
# Inspect the raw data structure
def describe_value(value):
    """Return a short one-line description of a value from an embedded block.

    Dicts are summarised by their first five keys (followed by "..." only
    when more keys exist), lists by their length, and anything else by its
    type name.
    """
    if isinstance(value, dict):
        keys = list(value)
        suffix = "..." if len(keys) > 5 else ""
        return f"{keys[:5]}{suffix}"
    if isinstance(value, list):
        return f"list of {len(value)} items"
    return type(value).__name__


if 'raw_data' in dir():
    # Display data structure
    print("=" * 60)
    print("DATA STRUCTURE")
    print("=" * 60)

    # Check for API endpoints
    if 'api_endpoints' in raw_data:
        endpoints_df = pl.DataFrame({
            'url': raw_data['api_endpoints']
        })
        print(f"\n📍 API Endpoints ({len(endpoints_df)} found):")
        print(endpoints_df)

    # Check for embedded data
    if 'embedded_data' in raw_data:
        print(f"\n📊 Embedded Data Blocks: {len(raw_data['embedded_data'])}")
        for i, block in enumerate(raw_data['embedded_data'][:2]):  # Show first 2
            print(f"\n  Block {i+1}:")
            if not isinstance(block, dict):
                # A block that is not a JSON object has no keys to list;
                # report its type instead of failing on .keys().
                print(f"    Not a dict: {type(block).__name__}")
                continue
            print(f"    Keys: {list(block.keys())}")
            for key, value in block.items():
                print(f"    {key}: {describe_value(value)}")

    # Check for metadata
    if 'metadata' in raw_data:
        print(f"\n✓ Metadata: {raw_data['metadata']}")

## Convert to Polars DataFrames

Convert the JSON data into Polars DataFrames for efficient analysis.

In [None]:
# Build Polars DataFrames from the loaded JSON; nothing runs if no data was loaded
if 'raw_data' in dir():
    # One-column frame listing the discovered API endpoint URLs
    if 'api_endpoints' in raw_data:
        endpoints_df = pl.DataFrame({'url': raw_data['api_endpoints']})
        print("API Endpoints DataFrame:")
        print(f"Shape: {endpoints_df.shape}")
        print(endpoints_df)

    # One row per embedded block: its position, its keys, and whether it has 'data'
    if 'embedded_data' in raw_data:
        print("\n" + "="*60)
        print("EMBEDDED DATA ANALYSIS")
        print("="*60)

        data_blocks = [
            {
                'block_id': position,
                'keys': str(list(entry.keys())),
                'has_data': 'data' in entry,
            }
            for position, entry in enumerate(raw_data['embedded_data'])
        ]

        # An empty list would produce an empty frame, so skip it
        if data_blocks:
            blocks_df = pl.DataFrame(data_blocks)
            print(f"\nTotal embedded blocks: {len(blocks_df)}")
            print(blocks_df)

## Exploratory Data Analysis

Analyze the structure and content of the tournament data.

In [None]:
# Drill into the 'data' payload of every embedded block
if 'raw_data' in dir() and 'embedded_data' in raw_data:
    print("="*60)
    print("DETAILED EMBEDDED DATA ANALYSIS")
    print("="*60)

    for block_no, entry in enumerate(raw_data['embedded_data'], start=1):
        # Only blocks that carry a dict under 'data' are analysed
        if 'data' not in entry or not isinstance(entry['data'], dict):
            continue
        payload = entry['data']

        # Split the payload's keys by the container type of their value
        list_keys = []
        dict_keys = []
        for name, content in payload.items():
            if isinstance(content, list):
                list_keys.append(name)
            elif isinstance(content, dict):
                dict_keys.append(name)

        list_preview = ', '.join(list_keys[:3])
        dict_preview = ', '.join(dict_keys[:3])
        print(f"\nBlock {block_no}:")
        print(f"  Total keys: {len(payload)}")
        print(f"  List entries: {len(list_keys)} ({list_preview}...)")
        print(f"  Dict entries: {len(dict_keys)} ({dict_preview}...)")

        # Sample only the first list-valued key (potential participants)
        if not list_keys:
            continue
        first_key = list_keys[0]
        items = payload[first_key]
        if not items:
            continue
        print(f"\n  Analyzing '{first_key}' ({len(items)} items):")
        if isinstance(items[0], dict):
            print(f"    Item structure: {list(items[0].keys())}")

## Summary Statistics

Generate summary statistics and insights about the scraped data.

In [None]:
# Generate summary report
def build_summary(data):
    """Collect headline counts from the scraped data as two parallel columns.

    Args:
        data: Parsed scraper output. Only the optional 'api_endpoints' and
            'embedded_data' entries are read.

    Returns:
        A dict with 'Metric' and 'Value' lists of equal length; each value is
        a count rendered as a string.
    """
    metrics = []
    values = []

    # Count API endpoints
    if 'api_endpoints' in data:
        metrics.append('API Endpoints Found')
        values.append(str(len(data['api_endpoints'])))

    # Count embedded data blocks
    if 'embedded_data' in data:
        blocks = data['embedded_data']
        metrics.append('Embedded Data Blocks')
        values.append(str(len(blocks)))

        # Total items = combined length of every list stored in a block's
        # 'data' dict. Blocks that are not dicts are skipped.
        total_items = 0
        for block in blocks:
            if not isinstance(block, dict):
                continue
            payload = block.get('data')
            if isinstance(payload, dict):
                total_items += sum(
                    len(v) for v in payload.values() if isinstance(v, list)
                )
        metrics.append('Total Items Found')
        values.append(str(total_items))

    return {'Metric': metrics, 'Value': values}


if 'raw_data' in dir():
    print("\n" + "="*60)
    print("SUMMARY REPORT")
    print("="*60)

    summary = build_summary(raw_data)

    # Create summary DataFrame
    if summary['Metric']:
        summary_df = pl.DataFrame(summary)
        # Polars DataFrames have no pandas-style to_string(); str() returns
        # the rendered table.
        print("\n" + str(summary_df))

    print("\n✓ Data loading and initial analysis complete!")
    print("  Next: Run the cells above to explore the data further.")