In [6]:
import os
import json
import pandas as pd

def add_player_roles():
    squads_dir = 'squads'
    dataset_base_dir = 'dataset'

    for year in range(2015, 2025):
        print(f"\nProcessing {year}...")

        # Load JSON data
        json_path = os.path.join(squads_dir, f'{year}.json')
        if not os.path.exists(json_path):
            print(f"  Skipping {year} - JSON file not found")
            continue

        with open(json_path, 'r') as f:
            squad_data = json.load(f)

        # Build role mapping from all teams
        role_mapping = {}
        for team, categories in squad_data.items():
            for category, players in categories.items():
                normalized_category = category.replace(" Squad", "")
                for player in players:
                    # Handle different name formats
                    key = player.strip().lower()
                    role_mapping[key] = normalized_category
                    # Also store without middle initials
                    if ' ' in key and '.' not in key:
                        simple_name = ' '.join([part for part in key.split() if not part.isupper()])
                        role_mapping[simple_name] = normalized_category

        # Process CSV files
        dataset_year_dir = os.path.join(dataset_base_dir, str(year))
        if not os.path.exists(dataset_year_dir):
            print(f"  Skipping {year} - Dataset folder not found")
            continue

        for csv_file in os.listdir(dataset_year_dir):
            if not csv_file.endswith('.csv'):
                continue

            file_path = os.path.join(dataset_year_dir, csv_file)
            df = pd.read_csv(file_path)
            modified = False

            # Add batsman role column
            if 'batsman' in df.columns:
                df.insert(
                    df.columns.get_loc('batsman') + 1,
                    'batsman_role',
                    df['batsman'].apply(lambda x: _get_role(x, role_mapping, 'batsman')))
                modified = True

            # Add bowler role column
            if 'bowler' in df.columns:
                df.insert(
                    df.columns.get_loc('bowler') + 1,
                    'bowler_role',
                    df['bowler'].apply(lambda x: _get_role(x, role_mapping, 'bowler')))
                modified = True

            if modified:
                df.to_csv(file_path, index=False)
                print(f"  Updated {csv_file}")

    print("\nProcessing completed!")

def _get_role(player_name, role_mapping, role_type):
    if pd.isna(player_name):
        return pd.NA

    # Normalize the name
    name = str(player_name).strip().lower()

    # Try different variations
    variations = [
        name,
        name.replace('.', ''),  # M.S. Dhoni -> ms dhoni
        ' '.join(name.split()),  # Extra spaces
        name.replace("'", ""),   # O'Conner -> oconner
    ]

    for variation in variations:
        if variation in role_mapping:
            role = role_mapping[variation]
            # Special handling for role types
            if role_type == 'bowler' and role == 'Batters':
                return pd.NA
            return role

    return pd.NA

if __name__ == '__main__':
    add_player_roles()



Processing 2015...
  Updated 33b.csv
  Updated 6a.csv
  Updated 4b.csv
  Updated 38b.csv
  Updated 47a.csv
  Updated 20a.csv
  Updated 37a.csv
  Updated 23a.csv
  Updated 31a.csv
  Updated 59a.csv
  Updated 1b.csv
  Updated 40a.csv
  Updated 43a.csv
  Updated 30b.csv
  Updated 54a.csv
  Updated 21b.csv
  Updated 45b.csv
  Updated 34a.csv
  Updated 29b.csv
  Updated 48a.csv
  Updated 18a.csv
  Updated 50a.csv
  Updated 29a.csv
  Updated 36b.csv
  Updated 37b.csv
  Updated 32b.csv
  Updated 13a.csv
  Updated 28b.csv
  Updated 35a.csv
  Updated 18b.csv
  Updated 54b.csv
  Updated 55b.csv
  Updated 30a.csv
  Updated 33a.csv
  Updated 17b.csv
  Updated 9a.csv
  Updated 41a.csv
  Updated 12a.csv
  Updated 7b.csv
  Updated 60a.csv
  Updated 57a.csv
  Updated 36a.csv
  Updated 50b.csv
  Updated 58a.csv
  Updated 4a.csv
  Updated 58b.csv
  Updated 39b.csv
  Updated 21a.csv
  Updated 26a.csv
  Updated 3b.csv
  Updated 44b.csv
  Updated 57b.csv
  Updated 56b.csv
  Updated 55a.csv
  Updated 15b.c