In [10]:
import json
import pandas as pd
from pathlib import Path
import re
import os

# Column order of the flattened output (35 features per delivery row).
_OUTPUT_COLUMNS = [
    'match_id', 'match_type', 'match_event_name', 'season', 'date', 'venue', 'city', 'gender',
    'toss_winner', 'toss_decision', 'winner', 'outcome_by_runs', 'outcome_by_wickets',
    'outcome_method', 'player_of_match', 'innings', 'over', 'ball_number', 'batting_team',
    'bowling_team', 'batter', 'bowler', 'non_striker', 'fielders', 'runs_off_bat',
    'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type',
    'player_dismissed', 'other_wicket_type', 'other_player_dismissed'
]

# Keys of a delivery's 'extras' dict that each get their own output column.
_EXTRA_KINDS = ('wides', 'noballs', 'byes', 'legbyes', 'penalty')


def _match_level_fields(match_info, match_id):
    """Build the match-level columns that are repeated on every delivery row."""
    toss = match_info.get('toss', {})
    outcome = match_info.get('outcome', {})
    margin = outcome.get('by', {})
    dates = match_info.get('dates')
    winner = outcome.get('winner')

    # Without a winner, the 'result' text (e.g. "no result", "tie") is
    # recorded in place of the method.
    if not winner and 'result' in outcome:
        outcome_method = outcome.get('result')
    else:
        outcome_method = outcome.get('method')

    return {
        'match_id': match_id,
        'match_type': match_info.get('match_type'),
        'match_event_name': match_info.get('event', {}).get('name'),
        'season': match_info.get('season'),
        # Only the first listed date is kept.
        'date': dates[0] if dates else None,
        'venue': match_info.get('venue'),
        'city': match_info.get('city'),
        'gender': match_info.get('gender'),
        'toss_winner': toss.get('winner'),
        'toss_decision': toss.get('decision'),
        'winner': winner,
        'outcome_by_runs': margin.get('runs'),
        'outcome_by_wickets': margin.get('wickets'),
        'outcome_method': outcome_method,
        'player_of_match': ','.join(match_info.get('player_of_match', [])),
    }


def _wicket_fields(delivery):
    """Return the wicket columns for one delivery (all None when no wicket fell)."""
    wickets = delivery.get('wickets') or []
    # Only the first two wickets of a delivery are represented in the output.
    primary = wickets[0] if wickets else {}
    secondary = wickets[1] if len(wickets) > 1 else {}
    fielders = primary.get('fielders', [])

    return {
        'wicket_type': primary.get('kind'),
        'player_dismissed': primary.get('player_out'),
        'fielders': ','.join(f.get('name', '') for f in fielders) if fielders else None,
        'other_wicket_type': secondary.get('kind'),
        'other_player_dismissed': secondary.get('player_out'),
    }


def flatten_cricket_match(json_file_path, output_csv_path=None):
    """
    Flatten a single cricket match JSON file into one row per delivery.

    Every row carries the match-level fields (taken from the 'info' section)
    followed by the innings, over, delivery, extras and wicket fields. The
    match id is the JSON file name without its extension.

    Args:
        json_file_path: Path to the cricket match JSON file
        output_csv_path: Path to save the CSV output (optional). Missing
            parent directories are created before writing.

    Returns:
        DataFrame containing the flattened data, with exactly the columns
        listed in _OUTPUT_COLUMNS, in that order. A match without any
        deliveries yields an empty DataFrame with those columns.
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    match_info = data.get('info', {})
    match_data = _match_level_fields(match_info, Path(json_file_path).stem)

    teams = match_info.get('teams', [])
    team1 = teams[0] if len(teams) > 0 else None
    team2 = teams[1] if len(teams) > 1 else None

    all_deliveries = []
    for innings_num, innings in enumerate(data.get('innings', []), start=1):
        batting_team = innings.get('team')
        # The bowling side is whichever listed team is not batting.
        bowling_team = team2 if batting_team == team1 else team1

        for over in innings.get('overs', []):
            over_num = over.get('over')

            # ball_number is the 1-based position of the delivery in the over's list.
            for ball_number, delivery in enumerate(over.get('deliveries', []), start=1):
                runs_info = delivery.get('runs', {})
                extras = delivery.get('extras', {})

                all_deliveries.append({
                    **match_data,
                    'innings': innings_num,
                    'over': over_num,
                    'ball_number': ball_number,
                    'batting_team': batting_team,
                    'bowling_team': bowling_team,
                    'batter': delivery.get('batter'),
                    'bowler': delivery.get('bowler'),
                    'non_striker': delivery.get('non_striker'),
                    'runs_off_bat': runs_info.get('batter', 0),
                    'extras': runs_info.get('extras', 0),
                    **{kind: extras.get(kind, 0) for kind in _EXTRA_KINDS},
                    **_wicket_fields(delivery),
                })

    # Passing the column list fixes both the order and the presence of every
    # column, even when there are no deliveries at all.
    df = pd.DataFrame(all_deliveries, columns=_OUTPUT_COLUMNS)

    # Save to CSV if output path is provided
    if output_csv_path:
        # Create the target directory so a fresh checkout does not fail on write.
        if isinstance(output_csv_path, (str, os.PathLike)):
            Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_csv_path, index=False)
        print(f"Flattened data saved to {output_csv_path}")

    return df

# Example usage: flatten one match file and write the result as a CSV.
json_file_path = "./extracted_data_json/1466426.json"  # Update with your actual file path
output_csv_path = "./flattened_data/sample_flattened_35_features.csv"  # Where you want to save the output
df = flatten_cricket_match(json_file_path=json_file_path, output_csv_path=output_csv_path)

# Short summary; the per-match values are read from the first delivery row,
# so this expects at least one delivery in the match.
print(
    f"Processed match: {df['match_id'].iloc[0]}",
    f"Total deliveries: {len(df)}",
    f"Match type: {df['match_type'].iloc[0]}",
    f"Teams: {df['batting_team'].iloc[0]} vs {df['bowling_team'].iloc[0]}",
    sep="\n",
)

Flattened data saved to ./flattened_data/sample_flattened_35_features.csv
Processed match: 1466426
Total deliveries: 596
Match type: ODI
Teams: Australia vs India
