# Pokémon battles — LightGBM with 10-fold CV
Notebook che implementa LightGBM con: feature engineering completo, 10-fold CV, Optuna hyperparameter tuning, e submission finale.

# Load data

In [2]:
import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# --- Percorsi ---
train_file_path = 'train.jsonl'
test_file_path = 'test.jsonl'

def load_jsonl(path):
    data = []
    with open(path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

print('Caricamento dati...')
train_raw = load_jsonl(train_file_path)
test_raw = load_jsonl(test_file_path)
print(f'Train records: {len(train_raw)}, Test records: {len(test_raw)}')

Caricamento dati...
Train records: 10000, Test records: 5000


# Feature Engineering

In [3]:
import math
from collections import Counter

# === TYPE CHART (Gen 1) ===
TYPE_CHART = {
    'normal': {'rock': 0.5, 'ghost': 0},
    'fire': {'fire': 0.5, 'water': 0.5, 'grass': 2, 'ice': 2, 'bug': 2, 'rock': 0.5, 'dragon': 0.5},
    'water': {'fire': 2, 'water': 0.5, 'grass': 0.5, 'ground': 2, 'rock': 2, 'dragon': 0.5},
    'grass': {'fire': 0.5, 'water': 2, 'grass': 0.5, 'poison': 0.5, 'ground': 2, 'flying': 0.5, 'bug': 0.5, 'rock': 2, 'dragon': 0.5},
    'electric': {'water': 2, 'grass': 0.5, 'electric': 0.5, 'ground': 0, 'flying': 2, 'dragon': 0.5},
    'ice': {'fire': 0.5, 'water': 0.5, 'grass': 2, 'ground': 2, 'flying': 2, 'dragon': 2},
    'fighting': {'normal': 2, 'ice': 2, 'poison': 0.5, 'flying': 0.5, 'psychic': 0.5, 'bug': 0.5, 'rock': 2, 'ghost': 0},
    'poison': {'grass': 2, 'poison': 0.5, 'ground': 0.5, 'bug': 2, 'rock': 0.5, 'ghost': 0.5},
    'ground': {'fire': 2, 'grass': 0.5, 'electric': 2, 'poison': 2, 'flying': 0, 'bug': 0.5, 'rock': 2},
    'flying': {'grass': 2, 'electric': 0.5, 'fighting': 2, 'bug': 2, 'rock': 0.5},
    'psychic': {'fighting': 2, 'poison': 2, 'psychic': 0.5, 'ghost': 0},
    'bug': {'fire': 0.5, 'grass': 2, 'fighting': 0.5, 'poison': 2, 'flying': 0.5, 'psychic': 2, 'ghost': 0.5},
    'rock': {'fire': 2, 'ice': 2, 'fighting': 0.5, 'ground': 0.5, 'flying': 2, 'bug': 2},
    'ghost': {'normal': 0, 'psychic': 0, 'ghost': 2},
    'dragon': {'dragon': 2}
}

def get_effectiveness(attack_type: str, defense_types: list) -> float:
    if not attack_type or not defense_types:
        return 1.0
    eff = 1.0
    for d in defense_types:
        eff *= TYPE_CHART.get(attack_type, {}).get(d, 1.0)
    return eff

def calculate_type_advantage(team1: list, team2_lead: dict) -> dict:
    out = {'p1_vs_lead_avg_effectiveness': 0.0, 'p1_vs_lead_max_effectiveness': 0.0, 'p1_super_effective_options': 0}
    if not team1 or not team2_lead:
        return out
    lead_types = [t.lower() for t in team2_lead.get('types', [])]
    if not lead_types:
        return out
    effs = []
    for p in team1:
        p_types = [t.lower() for t in p.get('types', [])]
        max_eff = 0.0
        for pt in p_types:
            max_eff = max(max_eff, get_effectiveness(pt, lead_types))
        effs.append(max_eff)
    if not effs:
        return out
    out['p1_vs_lead_avg_effectiveness'] = float(np.mean(effs))
    out['p1_vs_lead_max_effectiveness'] = float(np.max(effs))
    out['p1_super_effective_options'] = int(sum(1 for e in effs if e >= 2))
    return out

def _entropy(counter: Counter) -> float:
    total = sum(counter.values())
    if total == 0:
        return 0.0
    ent = 0.0
    for v in counter.values():
        p = v / total
        if p > 0:
            ent -= p * math.log(p, 2)
    return ent

def team_aggregate_features(team: list, prefix: str = 'p1_') -> dict:
    stats = ['base_hp','base_atk','base_def','base_spa','base_spd','base_spe']
    out = {}
    vals = {s: [] for s in stats}
    levels = []
    types_counter = Counter()
    names = []
    for p in team:
        names.append(p.get('name',''))
        for s in stats:
            vals[s].append(p.get(s, 0))
        levels.append(p.get('level', 0))
        for t in p.get('types', []):
            types_counter[t.lower()] += 1
    for s in stats:
        arr = np.array(vals[s], dtype=float)
        out[f'{prefix}{s}_sum'] = float(arr.sum())
        out[f'{prefix}{s}_mean'] = float(arr.mean())
        out[f'{prefix}{s}_max'] = float(arr.max())
        out[f'{prefix}{s}_min'] = float(arr.min())
        out[f'{prefix}{s}_std'] = float(arr.std())
    level_arr = np.array(levels, dtype=float)
    out[f'{prefix}level_mean'] = float(level_arr.mean()) if level_arr.size else 0.0
    out[f'{prefix}level_sum'] = float(level_arr.sum()) if level_arr.size else 0.0
    out[f'{prefix}n_unique_types'] = int(len(types_counter))
    common_types = ['normal','fire','water','electric','grass','psychic','ice','dragon','rock','ground','flying']
    for t in common_types:
        out[f'{prefix}type_{t}_count'] = int(types_counter.get(t, 0))
    out[f'{prefix}lead_name'] = names[0] if names else ''
    out[f'{prefix}n_unique_names'] = int(len(set(names)))
    out[f'{prefix}type_entropy'] = float(_entropy(types_counter))
    spe_arr = np.array(vals['base_spe'], dtype=float)
    out[f'{prefix}spe_p25'] = float(np.percentile(spe_arr, 25)) if spe_arr.size else 0.0
    out[f'{prefix}spe_p50'] = float(np.percentile(spe_arr, 50)) if spe_arr.size else 0.0
    out[f'{prefix}spe_p75'] = float(np.percentile(spe_arr, 75)) if spe_arr.size else 0.0
    return out

def lead_vs_lead_features(p1_lead: dict, p2_lead: dict) -> dict:
    out = {}
    stats = ['base_hp','base_atk','base_def','base_spa','base_spd','base_spe']
    for s in stats:
        out[f'lead_diff_{s}'] = float(p1_lead.get(s,0) - p2_lead.get(s,0))
    out['lead_speed_advantage'] = float(p1_lead.get('base_spe',0) - p2_lead.get('base_spe',0))
    p1_types = [t.lower() for t in p1_lead.get('types', [])]
    p2_types = [t.lower() for t in p2_lead.get('types', [])]
    max_eff = 0.0
    for pt in p1_types:
        max_eff = max(max_eff, get_effectiveness(pt, p2_types))
    out['lead_p1_vs_p2_effectiveness'] = float(max_eff)
    return out

def lead_aggregate_features(pokemon: dict, prefix: str = 'p2_lead_') -> dict:
    out = {}
    stats = ['base_hp','base_atk','base_def','base_spa','base_spd','base_spe']
    for s in stats:
        out[f'{prefix}{s}'] = float(pokemon.get(s,0))
    out[f'{prefix}level'] = int(pokemon.get('level',0))
    types = [x.lower() for x in pokemon.get('types', [])]
    common_types = ['normal','fire','water','electric','grass','psychic','ice','dragon','rock','ground','flying']
    for t in common_types:
        out[f'{prefix}type_{t}'] = int(t in types)
    out[f'{prefix}name'] = pokemon.get('name','')
    out[f'{prefix}n_unique_types'] = int(len(set(types)))
    return out

def summary_from_timeline(timeline: list, p1_team: list) -> dict:
    out = {}
    if not timeline:
        return {'tl_p1_moves':0,'tl_p2_moves':0,'tl_p1_est_damage':0.0,'tl_p2_est_damage':0.0,'damage_diff':0.0}
    p1_moves = p2_moves = 0
    p1_damage = p2_damage = 0.0
    p1_last_active = p2_last_active = ''
    p1_last_hp = p2_last_hp = np.nan
    p1_fainted = p2_fainted = 0
    p1_fainted_names = set()
    p2_fainted_names = set()
    last_p1_hp = {}
    last_p2_hp = {}
    p1_comeback_kos = 0
    p2_comeback_kos = 0
    p1_inflicted_statuses = Counter()
    p2_inflicted_statuses = Counter()
    p1_pokemon_statuses = {}
    p2_pokemon_statuses = {}
    p1_move_type_counts = Counter()
    p2_move_type_counts = Counter()
    p1_damage_first2 = 0.0
    p2_damage_first2 = 0.0
    p1_dmg_by_turn = {}
    p2_dmg_by_turn = {}
    seen_turns = set()
    first_ko_turn_p1_taken = None
    first_ko_turn_p1_inflicted = None
    early_threshold = 10
    p1_kos_early = p1_kos_late = 0
    p2_kos_early = p2_kos_late = 0

    for turn in timeline[:30]:
        prev_p1_fainted, prev_p2_fainted = p1_fainted, p2_fainted
        p1_state = turn.get('p1_pokemon_state',{}) or {}
        p2_state = turn.get('p2_pokemon_state',{}) or {}
        tnum = turn.get('turn', None)
        if tnum is None:
            tnum = (len(seen_turns) + 1)
        seen_turns.add(tnum)

        if p1_state.get('name'):
            p1_last_active = p1_state.get('name')
        if p2_state.get('name'):
            p2_last_active = p2_state.get('name')

        if p1_state.get('fainted') and p1_state.get('name') not in p1_fainted_names:
            p1_fainted += 1
            p1_fainted_names.add(p1_state.get('name'))
            if first_ko_turn_p1_taken is None:
                first_ko_turn_p1_taken = tnum
            if tnum <= early_threshold: p2_kos_early += 1
            else: p2_kos_late += 1
        if p2_state.get('fainted') and p2_state.get('name') not in p2_fainted_names:
            p2_fainted += 1
            p2_fainted_names.add(p2_state.get('name'))
            if first_ko_turn_p1_inflicted is None:
                first_ko_turn_p1_inflicted = tnum
            if tnum <= early_threshold: p1_kos_early += 1
            else: p1_kos_late += 1

        p2_name, p2_hp = p2_state.get('name'), p2_state.get('hp_pct')
        if p2_name and p2_hp is not None:
            prev_hp = last_p2_hp.get(p2_name)
            if prev_hp is not None:
                delta = max(0.0, prev_hp - p2_hp)
                p1_damage += delta
                p1_dmg_by_turn[tnum] = p1_dmg_by_turn.get(tnum, 0.0) + delta
                if turn.get('turn',999) <= 2:
                    p1_damage_first2 += delta
            last_p2_hp[p2_name] = p2_hp

        p1_name, p1_hp = p1_state.get('name'), p1_state.get('hp_pct')
        if p1_name and p1_hp is not None:
            prev_hp = last_p1_hp.get(p1_name)
            if prev_hp is not None:
                delta = max(0.0, prev_hp - p1_hp)
                p2_damage += delta
                p2_dmg_by_turn[tnum] = p2_dmg_by_turn.get(tnum, 0.0) + delta
                if turn.get('turn',999) <= 2:
                    p2_damage_first2 += delta
            last_p1_hp[p1_name] = p1_hp

        damage_diff_so_far = p1_damage - p2_damage
        if p2_fainted > prev_p2_fainted and damage_diff_so_far < -1.0:
            p1_comeback_kos += 1
        if p1_fainted > prev_p1_fainted and damage_diff_so_far > 1.0:
            p2_comeback_kos += 1

        p2_status = p2_state.get('status')
        if p2_name and p2_status and p2_pokemon_statuses.get(p2_name) != p2_status:
            p1_inflicted_statuses[p2_status] += 1
            p2_pokemon_statuses[p2_name] = p2_status
        p1_status = p1_state.get('status')
        if p1_name and p1_status and p1_pokemon_statuses.get(p1_name) != p1_status:
            p2_inflicted_statuses[p1_status] += 1
            p1_pokemon_statuses[p1_name] = p1_status

        p1_move = turn.get('p1_move_details') or {}
        p2_move = turn.get('p2_move_details') or {}
        if p1_move and p1_move.get('type'):
            p1_move_type_counts[(p1_move.get('type') or '').lower()] += 1
        if p2_move and p2_move.get('type'):
            p2_move_type_counts[(p2_move.get('type') or '').lower()] += 1
        if turn.get('p1_move_details'):
            p1_moves += 1
        if turn.get('p2_move_details'):
            p2_moves += 1
        p1_last_hp = p1_state.get('hp_pct', np.nan)
        p2_last_hp = p2_state.get('hp_pct', np.nan)

    out['tl_p1_moves'] = int(p1_moves)
    out['tl_p2_moves'] = int(p2_moves)
    out['tl_p1_est_damage'] = float(p1_damage)
    out['tl_p2_est_damage'] = float(p2_damage)
    out['tl_p1_fainted'] = int(p1_fainted)
    out['tl_p2_fainted'] = int(p2_fainted)
    turns_count = max(1, len(seen_turns))
    out['tl_p1_fainted_rate'] = float(out['tl_p1_fainted'] / turns_count)
    out['tl_p2_fainted_rate'] = float(out['tl_p2_fainted'] / turns_count)
    out['damage_diff'] = float(p1_damage - p2_damage)
    out['fainted_diff'] = int(p1_fainted - p2_fainted)
    out['tl_p1_last_hp'] = float(p1_last_hp) if not np.isnan(p1_last_hp) else 0.0
    out['tl_p2_last_hp'] = float(p2_last_hp) if not np.isnan(p2_last_hp) else 0.0
    out['tl_p1_last_active'] = p1_last_active
    out['tl_p2_last_active'] = p2_last_active
    if p1_team:
        p1_total_hp_sum = sum(p.get('base_hp',0) for p in p1_team)
        p1_avg_def = np.mean([p.get('base_def',0) for p in p1_team] or [0])
        p1_avg_spd = np.mean([p.get('base_spd',0) for p in p1_team] or [0])
        out['tl_p2_damage_vs_p1_hp_pool'] = float(p2_damage / (p1_total_hp_sum + 1e-6))
        out['tl_p1_defensive_endurance'] = float((p1_avg_def + p1_avg_spd) / (p2_damage + 1e-6))
    out['tl_p1_comeback_kos'] = int(p1_comeback_kos)
    out['tl_p2_comeback_kos'] = int(p2_comeback_kos)
    out['tl_comeback_kos_diff'] = int(p1_comeback_kos - p2_comeback_kos)

    common_statuses = ['brn','par','slp','frz','psn','tox']
    for status in common_statuses:
        out[f'tl_p1_inflicted_{status}_count'] = int(p1_inflicted_statuses.get(status,0))
        out[f'tl_p2_inflicted_{status}_count'] = int(p2_inflicted_statuses.get(status,0))
        out[f'tl_inflicted_{status}_diff'] = int(p1_inflicted_statuses.get(status,0) - p2_inflicted_statuses.get(status,0))

    common_move_types = ['normal','fire','water','electric','grass','psychic','ice','dragon','rock','ground','flying','ghost','bug','poison','fighting']
    for mt in common_move_types:
        out[f'tl_p1_move_type_{mt}_count'] = int(p1_move_type_counts.get(mt,0))
        out[f'tl_p2_move_type_{mt}_count'] = int(p2_move_type_counts.get(mt,0))
        out[f'tl_move_type_{mt}_count_diff'] = int(p1_move_type_counts.get(mt,0) - p2_move_type_counts.get(mt,0))

    out['tl_p1_damage_first2'] = float(p1_damage_first2)
    out['tl_p2_damage_first2'] = float(p2_damage_first2)
    out['tl_first2_damage_diff'] = float(p1_damage_first2 - p2_damage_first2)
    out['tl_turns_count'] = int(turns_count)
    out['tl_p1_moves_rate'] = float(p1_moves / turns_count)
    out['tl_p2_moves_rate'] = float(p2_moves / turns_count)
    out['tl_p1_damage_per_turn'] = float(p1_damage / turns_count)
    out['tl_p2_damage_per_turn'] = float(p2_damage / turns_count)
    out['tl_damage_rate_diff'] = float(out['tl_p1_damage_per_turn'] - out['tl_p2_damage_per_turn'])

    if seen_turns:
        recent_turns = sorted(seen_turns)[-5:]
        p1_last5 = sum(p1_dmg_by_turn.get(t,0.0) for t in recent_turns)
        p2_last5 = sum(p2_dmg_by_turn.get(t,0.0) for t in recent_turns)
    else:
        p1_last5 = p2_last5 = 0.0
    out['tl_p1_damage_last5'] = float(p1_last5)
    out['tl_p2_damage_last5'] = float(p2_last5)
    out['tl_last5_damage_diff'] = float(p1_last5 - p2_last5)
    out['tl_p1_last5_damage_ratio'] = float(p1_last5 / (p1_damage + 1e-6))
    out['tl_p2_last5_damage_ratio'] = float(p2_last5 / (p2_damage + 1e-6))
    out['tl_last5_damage_ratio_diff'] = float(out['tl_p1_last5_damage_ratio'] - out['tl_p2_last5_damage_ratio'])

    if seen_turns:
        ts = sorted(seen_turns)
        w = np.linspace(1.0, 2.0, num=len(ts))
        w = w / (w.sum() + 1e-9)
        adv = [(p1_dmg_by_turn.get(t,0.0) - p2_dmg_by_turn.get(t,0.0)) for t in ts]
        out['tl_weighted_damage_diff'] = float(np.dot(w, adv))
    else:
        out['tl_weighted_damage_diff'] = 0.0

    if seen_turns:
        ts = sorted(seen_turns)
        cum = 0.0
        signs = []
        for t in ts:
            cum += (p1_dmg_by_turn.get(t,0.0) - p2_dmg_by_turn.get(t,0.0))
            s = 1 if cum > 1e-9 else (-1 if cum < -1e-9 else 0)
            if s != 0:
                if not signs or signs[-1] != s:
                    signs.append(s)
        sign_flips = max(0, len(signs) - 1)
        comeback_flag = 1 if (len(signs) >= 2 and signs[0] != signs[-1]) else 0
    else:
        sign_flips = 0
        comeback_flag = 0
    out['tl_damage_adv_sign_flips'] = int(sign_flips)
    out['tl_comeback_flag'] = int(comeback_flag)

    out['tl_first_ko_turn_p1_inflicted'] = int(first_ko_turn_p1_inflicted or 0)
    out['tl_first_ko_turn_p1_taken'] = int(first_ko_turn_p1_taken or 0)
    out['tl_first_ko_turn_diff'] = int((first_ko_turn_p1_inflicted or 0) - (first_ko_turn_p1_taken or 0))
    out['tl_kos_early_p1'] = int(p1_kos_early)
    out['tl_kos_late_p1'] = int(p1_kos_late)
    out['tl_kos_early_p2'] = int(p2_kos_early)
    out['tl_kos_late_p2'] = int(p2_kos_late)

    for status in common_statuses:
        c1 = p1_inflicted_statuses.get(status,0)
        c2 = p2_inflicted_statuses.get(status,0)
        out[f'tl_p1_inflicted_{status}_rate'] = float(c1 / turns_count)
        out[f'tl_p2_inflicted_{status}_rate'] = float(c2 / turns_count)
        out[f'tl_inflicted_{status}_rate_diff'] = float((c1 - c2) / turns_count)

    return out

def ability_features(team: list, prefix: str) -> dict:
    immunity_abilities = {'levitate':0,'volt_absorb':0,'water_absorb':0,'flash_fire':0}
    stat_drop_abilities = {'intimidate':0}
    weather_abilities = {'drought':0,'drizzle':0,'sand_stream':0}
    out = {}
    for pokemon in team:
        ability = (pokemon.get('ability','') or '').lower().replace(' ','_')
        if ability in immunity_abilities:
            immunity_abilities[ability] += 1
        if ability in stat_drop_abilities:
            stat_drop_abilities[ability] += 1
        if ability in weather_abilities:
            weather_abilities[ability] += 1
    for ability,count in immunity_abilities.items():
        out[f'{prefix}ability_{ability}_count'] = int(count)
    for ability,count in stat_drop_abilities.items():
        out[f'{prefix}ability_{ability}_count'] = int(count)
    for ability,count in weather_abilities.items():
        out[f'{prefix}ability_{ability}_count'] = int(count)
    out[f'{prefix}total_immunity_abilities'] = int(sum(immunity_abilities.values()))
    out[f'{prefix}total_stat_drop_abilities'] = int(sum(stat_drop_abilities.values()))
    return out

def prepare_record_features(record: dict, max_turns: int = 30) -> dict:
    out = {}
    out['battle_id'] = record.get('battle_id')
    if 'player_won' in record:
        out['player_won'] = int(bool(record.get('player_won')))
    p1_team = record.get('p1_team_details', [])
    out.update(team_aggregate_features(p1_team, prefix='p1_'))
    p2_lead = record.get('p2_lead_details', {})
    out.update(lead_aggregate_features(p2_lead, prefix='p2_lead_'))
    out.update(ability_features(p1_team, prefix='p1_'))
    p1_lead = p1_team[0] if p1_team else {}
    out.update(lead_vs_lead_features(p1_lead, p2_lead))
    out.update(ability_features([p2_lead], prefix='p2_lead_'))
    out['p1_intimidate_vs_lead'] = 1 if out.get('p1_ability_intimidate_count',0) > 0 else 0
    tl = record.get('battle_timeline', [])
    out.update(summary_from_timeline(tl[:max_turns], p1_team))
    out['team_hp_sum_minus_p2lead_hp'] = out.get('p1_base_hp_sum', 0) - out.get('p2_lead_base_hp', 0)
    out['team_spa_mean_minus_p2spa'] = out.get('p1_base_spa_mean', 0) - out.get('p2_lead_base_spa', 0)
    out['speed_advantage'] = out.get('p1_base_spe_sum', 0) - out.get('p2_lead_base_spe', 0)
    out['n_unique_types_diff'] = out.get('p1_n_unique_types', 0) - out.get('p2_lead_n_unique_types', 1)
    p1_moves = max(out.get('tl_p1_moves',1),1)
    p2_moves = max(out.get('tl_p2_moves',1),1)
    out['damage_per_turn_diff'] = (out.get('tl_p1_est_damage',0.0)/p1_moves) - (out.get('tl_p2_est_damage',0.0)/p2_moves)
    out['last_pair'] = f"{out.get('tl_p1_last_active','')}_VS_{out.get('tl_p2_last_active','')}"
    out.update(calculate_type_advantage(p1_team, p2_lead))
    p2_lead_bulk = out.get('p2_lead_base_def',1) + out.get('p2_lead_base_spd',1)
    out['p1_se_options_vs_lead_bulk'] = out.get('p1_super_effective_options',0) / (p2_lead_bulk + 1e-6)
    p2_team = record.get('p2_team_details', [])
    if p2_team:
        out.update(team_aggregate_features(p2_team, prefix='p2_'))
        out['team_hp_sum_diff'] = out.get('p1_base_hp_sum',0) - out.get('p2_base_hp_sum',0)
        out['team_spa_mean_diff'] = out.get('p1_base_spa_mean',0) - out.get('p2_base_spa_mean',0)
        out['team_spe_mean_diff'] = out.get('p1_base_spe_mean',0) - out.get('p2_base_spe_mean',0)
        out['n_unique_types_team_diff'] = out.get('p1_n_unique_types',0) - out.get('p2_n_unique_types',0)
    return out

def create_features_from_raw(data: list) -> pd.DataFrame:
    rows = []
    for b in tqdm(data, desc='FE'):
        try:
            feat = prepare_record_features(b, max_turns=30)
            if 'battle_id' not in feat:
                feat['battle_id'] = b.get('battle_id')
            rows.append(feat)
        except Exception as e:
            rows.append({'battle_id': b.get('battle_id'), 'error': 1})
    df = pd.DataFrame(rows)
    if 'player_won' in df.columns:
        df['player_won'] = df['player_won'].astype(int)
    return df.fillna(0)

train_df = create_features_from_raw(train_raw)
test_df = create_features_from_raw(test_raw)
print('Feature shape train/test:', train_df.shape, test_df.shape)
display(train_df.head())

FE: 100%|██████████| 10000/10000 [00:10<00:00, 971.59it/s]
FE: 100%|██████████| 5000/5000 [00:05<00:00, 979.29it/s] 


Feature shape train/test: (10000, 236) (5000, 235)


Unnamed: 0,battle_id,player_won,p1_base_hp_sum,p1_base_hp_mean,p1_base_hp_max,p1_base_hp_min,p1_base_hp_std,p1_base_atk_sum,p1_base_atk_mean,p1_base_atk_max,...,team_hp_sum_minus_p2lead_hp,team_spa_mean_minus_p2spa,speed_advantage,n_unique_types_diff,damage_per_turn_diff,last_pair,p1_vs_lead_avg_effectiveness,p1_vs_lead_max_effectiveness,p1_super_effective_options,p1_se_options_vs_lead_bulk
0,0,1,695.0,115.833333,250.0,55.0,69.367179,435.0,72.5,110.0,...,635.0,0.0,365.0,3,-0.070393,starmie_VS_snorlax,1.083333,2.0,1,0.005405
1,1,1,740.0,123.333333,250.0,65.0,64.204534,435.0,72.5,110.0,...,685.0,-45.0,250.0,4,-0.012174,tauros_VS_alakazam,1.0,1.0,0,0.0
2,2,1,745.0,124.166667,250.0,60.0,64.382753,505.0,84.166667,130.0,...,495.0,-15.0,345.0,6,-0.00069,snorlax_VS_gengar,1.0,1.0,0,0.0
3,3,1,730.0,121.666667,250.0,60.0,65.362239,465.0,77.5,110.0,...,655.0,33.333333,345.0,6,-0.014574,snorlax_VS_zapdos,1.0,1.0,0,0.0
4,4,1,685.0,114.166667,250.0,50.0,70.794107,455.0,75.833333,110.0,...,625.0,-2.5,320.0,4,0.006923,tauros_VS_chansey,1.083333,2.0,1,0.005405


# Preprocessing

In [4]:
# Preprocessing - LightGBM gestisce bene valori raw (no scaling necessario)
exclude_cols = ['battle_id', 'player_won']
string_cols = train_df.select_dtypes(include=['object']).columns.tolist()
exclude_cols.extend(string_cols)

ALL_NUMERIC_FEATURES = [c for c in train_df.columns if c not in exclude_cols]
FEATURES = ALL_NUMERIC_FEATURES

print(f'Num FEATURES numeriche: {len(FEATURES)}')

# Imputazione mediana
num_df = train_df[FEATURES].astype(float).replace([np.inf, -np.inf], np.nan)
medians = num_df.median()
train_imputed = num_df.fillna(medians)
train_preproc_df = train_imputed.copy()

y = train_df['player_won'].astype(int).values
X = train_preproc_df.values

print('Preprocessing completato.')
print('Dataset completo size:', X.shape[0])
print('Features:', len(FEATURES))

# Allinea test
test_aligned = test_df.reindex(columns=FEATURES, fill_value=np.nan).astype(float).replace([np.inf, -np.inf], np.nan)
test_imputed = test_aligned.fillna(medians)
test_preproc_df = pd.DataFrame(test_imputed.values, columns=FEATURES, index=test_df.index)

Num FEATURES numeriche: 229
Preprocessing completato.
Dataset completo size: 10000
Features: 229


# Hyperparameter Optimization (Optuna + LightGBM)

In [None]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

print("="*70)
print("OPTUNA HYPERPARAMETER TUNING - LIGHTGBM (ACCURACY ONLY)")
print("="*70)

N_TRIALS = 100
cv_optuna = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def objective_lightgbm(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'seed': 42,
        
        # Learning parameters
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000, step=100),
        
        # Tree structure
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
        
        # Regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        
        # Sampling
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        
        # Other
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'max_bin': trial.suggest_int('max_bin', 128, 512, step=64)
    }
    
    fold_accs = []
    fold_gaps = []
    
    for train_idx, val_idx in cv_optuna.split(X, y):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]
        
        # LightGBM con early stopping
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
        )
        
        # Validation accuracy
        y_pred_val = model.predict(X_val)
        val_acc = accuracy_score(y_val, y_pred_val)
        
        # Train accuracy (solo per logging, non usato nell'objective)
        y_pred_tr = model.predict(X_tr)
        tr_acc = accuracy_score(y_tr, y_pred_tr)
        
        gap = tr_acc - val_acc
        fold_accs.append(val_acc)
        fold_gaps.append(gap)
        
        # Pruning basato solo su accuracy
        trial.report(val_acc, len(fold_accs))
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    mean_val_acc = float(np.mean(fold_accs))
    mean_gap = float(np.mean(fold_gaps))
    std_val = float(np.std(fold_accs))
    
    # ✅ OTTIMIZZA SOLO L'ACCURACY - nessuna penalty
    objective_score = mean_val_acc
    
    # Logga metriche per analisi post-hoc (senza influenzare l'ottimizzazione)
    trial.set_user_attr("mean_val_acc", mean_val_acc)
    trial.set_user_attr("mean_gap", mean_gap)
    trial.set_user_attr("std_val", std_val)
    
    return float(objective_score)

# Ottimizzazione
print(f"Avvio {N_TRIALS} trial Optuna (ottimizzazione SOLO accuracy)...\n")

study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42, n_startup_trials=20),
    pruner=MedianPruner(n_startup_trials=15, n_warmup_steps=3)
)

study.optimize(objective_lightgbm, n_trials=N_TRIALS, timeout=7200, gc_after_trial=True)

# Report
print("\n" + "="*70)
print("OPTUNA RESULTS - LIGHTGBM (ACCURACY OPTIMIZATION)")
print("="*70)
print(f"Best accuracy: {study.best_value*100:.2f}%")
print(f"Best trial: #{study.best_trial.number}")

best_attrs = study.best_trial.user_attrs
best_val_acc = best_attrs.get('mean_val_acc', 0.0)
best_gap = best_attrs.get('mean_gap', 0.0)
best_std = best_attrs.get('std_val', 0.0)

print(f"\n📊 Best trial metrics:")
print(f"  Mean CV accuracy:    {best_val_acc*100:.2f}%")
print(f"  Mean train-val gap:  {best_gap*100:.2f}% (non ottimizzato)")
print(f"  Std CV accuracy:     {best_std*100:.2f}% (non ottimizzato)")

print(f"\n⚙️ Best hyperparameters:")
for k, v in study.best_params.items():
    if isinstance(v, float):
        print(f"  {k:25s}: {v:.6f}")
    else:
        print(f"  {k:25s}: {v}")

# Salva risultati - FIX: estrai user_attrs manualmente
trials_df = study.trials_dataframe()

# ✅ CORRETTO: Aggiungi colonne user_attrs manualmente dai trial
mean_val_accs = []
mean_gaps = []
std_vals = []

for trial in study.trials:
    mean_val_accs.append(trial.user_attrs.get('mean_val_acc', np.nan))
    mean_gaps.append(trial.user_attrs.get('mean_gap', np.nan))
    std_vals.append(trial.user_attrs.get('std_val', np.nan))

trials_df['mean_val_acc'] = mean_val_accs
trials_df['mean_gap'] = mean_gaps
trials_df['std_val'] = std_vals

trials_df = trials_df.sort_values('value', ascending=False)
trials_df.to_csv('optuna_lightgbm_trials.csv', index=False)
print(f"\n✅ Salvati {len(trials_df)} trial in 'optuna_lightgbm_trials.csv'")

# Analisi distribuzione gap (informativa, non ha influenzato l'ottimizzazione)
valid_trials = trials_df[trials_df['state'] == 'COMPLETE'].copy()
if len(valid_trials) > 0:
    gaps = valid_trials['mean_gap'].dropna()
    if len(gaps) > 0:
        print(f"\n📈 DISTRIBUZIONE GAP (informativa, non ottimizzata):")
        print(f"  Mean: {gaps.mean()*100:.2f}%")
        print(f"  Std:  {gaps.std()*100:.2f}%")
        print(f"  Min:  {gaps.min()*100:.2f}%")
        print(f"  Max:  {gaps.max()*100:.2f}%")
        print(f"  Best trial gap: {best_gap*100:.2f}%")

# Assegna best_params
best_params = study.best_params.copy()
best_params.update({
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
})

print(f"\n✅ Variabile 'best_params' pronta per CV")
print(f"⚠️  NOTA: Ottimizzato SOLO per accuracy. Controlla gap nella 10-fold CV!")

[I 2025-11-07 17:21:55,301] A new study created in memory with name: no-name-c06bea3a-d01c-4a0f-8683-0153ecebcb06


OPTUNA HYPERPARAMETER TUNING - LIGHTGBM (ACCURACY ONLY)
Avvio 100 trial Optuna (ottimizzazione SOLO accuracy)...



[I 2025-11-07 17:22:03,108] Trial 0 finished with value: 0.8225 and parameters: {'learning_rate': 0.02757359293934948, 'n_estimators': 2000, 'num_leaves': 115, 'max_depth': 8, 'min_child_samples': 24, 'min_child_weight': 0.004207053950287938, 'reg_alpha': 3.3323645788192616e-08, 'reg_lambda': 0.6245760287469893, 'subsample': 0.8005575058716043, 'colsample_bytree': 0.8540362888980227, 'subsample_freq': 1, 'min_split_gain': 0.9699098521619943, 'max_bin': 448}. Best is trial 0 with value: 0.8225.
[I 2025-11-07 17:22:10,243] Trial 1 finished with value: 0.8236999999999999 and parameters: {'learning_rate': 0.01777174904859463, 'n_estimators': 700, 'num_leaves': 44, 'max_depth': 6, 'min_child_samples': 57, 'min_child_weight': 0.05342937261279776, 'reg_alpha': 4.17890272377219e-06, 'reg_lambda': 0.0032112643094417484, 'subsample': 0.569746930326021, 'colsample_bytree': 0.6460723242676091, 'subsample_freq': 3, 'min_split_gain': 0.45606998421703593, 'max_bin': 448}. Best is trial 1 with value: 


OPTUNA RESULTS - LIGHTGBM (ACCURACY OPTIMIZATION)
Best accuracy: 82.70%
Best trial: #62

📊 Best trial metrics:
  Mean CV accuracy:    82.70%
  Mean train-val gap:  5.18% (non ottimizzato)
  Std CV accuracy:     0.53% (non ottimizzato)

⚙️ Best hyperparameters:
  learning_rate            : 0.048286
  n_estimators             : 600
  num_leaves               : 74
  max_depth                : 7
  min_child_samples        : 95
  min_child_weight         : 0.046044
  reg_alpha                : 0.001554
  reg_lambda               : 0.000040
  subsample                : 0.578947
  colsample_bytree         : 0.575072
  subsample_freq           : 2
  min_split_gain           : 0.741598
  max_bin                  : 320


KeyError: 'user_attrs'

# 10-Fold Cross-Validation

In [8]:
print("=== 10-Fold Cross-Validation - LightGBM ===")
best_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'learning_rate' : 0.048286,
    'n_estimators' : 600,
    'num_leaves' : 74,
    'max_depth' : 7,
    'min_child_samples' : 95,
    'min_child_weight' : 0.046044,
    'reg_alpha' : 0.001554,
    'reg_lambda' : 0.000040,
    'subsample' : 0.578947,
    'colsample_bytree' : 0.575072,
    'subsample_freq' : 2,
    'min_split_gain' : 0.741598,
    'max_bin' : 320
}

print(f"Parametri utilizzati: {best_params}\n")

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
outer_accuracies = []
train_accuracies = []
train_val_gaps = []
folds_info = []

fold_idx = 0
for train_idx, val_idx in skf.split(X, y):
    fold_idx += 1
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    # Train LightGBM con early stopping
    clf = lgb.LGBMClassifier(**best_params)
    clf.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    
    # Validation predictions
    y_pred_val = clf.predict(X_val)
    y_proba_val = clf.predict_proba(X_val)[:, 1]
    val_acc = accuracy_score(y_val, y_pred_val)
    outer_accuracies.append(val_acc)
    
    # Train accuracy
    y_pred_tr = clf.predict(X_tr)
    tr_acc = accuracy_score(y_tr, y_pred_tr)
    gap = tr_acc - val_acc
    train_accuracies.append(tr_acc)
    train_val_gaps.append(gap)
    
    # Best iteration info
    best_iter = clf.best_iteration_ if hasattr(clf, 'best_iteration_') else clf.n_estimators
    
    folds_info.append({
        'fold': fold_idx,
        'val_acc': float(val_acc),
        'train_acc': float(tr_acc),
        'gap': float(gap),
        'best_iteration': int(best_iter),
        'val_idx': val_idx,
        'y_true': y_val.astype(int),
        'y_pred': y_pred_val.astype(int),
        'y_proba': y_proba_val.astype(float)
    })
    
    print(f'Fold {fold_idx}: train={len(y_tr)}, val={len(y_val)}, '
          f'val_acc={val_acc*100:.2f}%, train_acc={tr_acc*100:.2f}%, '
          f'gap={gap*100:.2f}%, best_iter={best_iter}')

print('\n' + '='*60)
print('Risultati Cross-Validation')
print('='*60)
for i, info in enumerate(folds_info, 1):
    print(f"  Fold {i}: val_acc={info['val_acc']*100:.2f}%, "
          f"train_acc={info['train_acc']*100:.2f}%, gap={info['gap']*100:.2f}%")

print(f'\nMean CV accuracy: {np.mean(outer_accuracies)*100:.2f}%')
print(f'Std CV accuracy:  {np.std(outer_accuracies)*100:.2f}%')
print(f'Mean train accuracy: {np.mean(train_accuracies)*100:.2f}%')
print(f'Mean gap (train - val): {np.mean(train_val_gaps)*100:.2f}%')
print(f'Min/Max val acc:  {np.min(outer_accuracies)*100:.2f}% / {np.max(outer_accuracies)*100:.2f}%')

worst_idx = int(np.argmin(outer_accuracies))
print(f"\nPeggiore fold: #{worst_idx+1} con val_acc={outer_accuracies[worst_idx]*100:.2f}%")

=== 10-Fold Cross-Validation - LightGBM ===
Parametri utilizzati: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 'verbosity': -1, 'seed': 42, 'learning_rate': 0.048286, 'n_estimators': 600, 'num_leaves': 74, 'max_depth': 7, 'min_child_samples': 95, 'min_child_weight': 0.046044, 'reg_alpha': 0.001554, 'reg_lambda': 4e-05, 'subsample': 0.578947, 'colsample_bytree': 0.575072, 'subsample_freq': 2, 'min_split_gain': 0.741598, 'max_bin': 320}

Fold 1: train=9000, val=1000, val_acc=81.30%, train_acc=86.64%, gap=5.34%, best_iter=152
Fold 1: train=9000, val=1000, val_acc=81.30%, train_acc=86.64%, gap=5.34%, best_iter=152
Fold 2: train=9000, val=1000, val_acc=84.10%, train_acc=88.18%, gap=4.08%, best_iter=204
Fold 2: train=9000, val=1000, val_acc=84.10%, train_acc=88.18%, gap=4.08%, best_iter=204
Fold 3: train=9000, val=1000, val_acc=82.60%, train_acc=87.89%, gap=5.29%, best_iter=193
Fold 3: train=9000, val=1000, val_acc=82.60%, train_acc=87.89%, gap=5.29%, best_ite

# Feature Importance Analysis

In [None]:
# Analizza feature importance dal modello finale
print("=== Feature Importance Analysis ===")

# Train su tutto il dataset per feature importance
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X, y)

# Estrai importances
importances = final_model.feature_importances_
importance_df = pd.DataFrame({
    'feature': FEATURES,
    'importance': importances
}).sort_values('importance', ascending=False)

# Salva top features
importance_df.to_csv('lightgbm_feature_importances.csv', index=False)
print(f"✅ Salvate {len(importance_df)} feature importances in 'lightgbm_feature_importances.csv'")

print(f"\nTop 20 features più importanti:")
display(importance_df.head(20))

# Plot (opzionale, se matplotlib disponibile)
try:
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10, 8))
    top_n = 30
    top_features = importance_df.head(top_n)
    plt.barh(range(top_n), top_features['importance'].values)
    plt.yticks(range(top_n), top_features['feature'].values)
    plt.xlabel('Importance')
    plt.title(f'Top {top_n} Feature Importances - LightGBM')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('lightgbm_feature_importance.png', dpi=150)
    print(f"✅ Plot salvato in 'lightgbm_feature_importance.png'")
except Exception as e:
    print(f"⚠️ Plot non disponibile: {e}")

# Make Submission

In [9]:
print("=== Submission con LightGBM trainato su tutto il dataset ===")

# Train finale su tutto il dataset
submission_model = lgb.LGBMClassifier(**best_params)
submission_model.fit(X, y)

# Predict su test
X_test_matrix = test_preproc_df.values
test_predictions = submission_model.predict(X_test_matrix).astype(int)

# Crea submission
submission_df = pd.DataFrame({
    'battle_id': test_df['battle_id'].astype(np.int64),
    'player_won': test_predictions.astype(np.int64)
})

submission_path = 'submission_lightgbm.csv'
submission_df.to_csv(submission_path, index=False)

print(f"✅ File di submission salvato in {submission_path}")
print(f"Modello: LightGBM trainato su {len(X)} samples")
print(f"Stima CV accuracy: {np.mean(outer_accuracies)*100:.2f}% ± {np.std(outer_accuracies)*100:.2f}%")
print(f"Mean gap: {np.mean(train_val_gaps)*100:.2f}%")
print("\nPreview submission:")
display(submission_df.head(10))
print(f"\nDistribuzione predizioni: {submission_df['player_won'].value_counts().to_dict()}")

=== Submission con LightGBM trainato su tutto il dataset ===
✅ File di submission salvato in submission_lightgbm.csv
Modello: LightGBM trainato su 10000 samples
Stima CV accuracy: 82.45% ± 1.00%
Mean gap: 5.41%

Preview submission:
✅ File di submission salvato in submission_lightgbm.csv
Modello: LightGBM trainato su 10000 samples
Stima CV accuracy: 82.45% ± 1.00%
Mean gap: 5.41%

Preview submission:


Unnamed: 0,battle_id,player_won
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
5,5,0
6,6,1
7,7,1
8,8,1
9,9,1



Distribuzione predizioni: {1: 2502, 0: 2498}


# Model Comparison (LightGBM vs XGBoost)

In [None]:
# Confronto con risultati XGBoost (se disponibili)
print("=== Confronto LightGBM vs XGBoost ===\n")

# LightGBM results
lgb_cv_mean = np.mean(outer_accuracies)
lgb_cv_std = np.std(outer_accuracies)
lgb_gap_mean = np.mean(train_val_gaps)

print(f"LightGBM:")
print(f"  CV Accuracy: {lgb_cv_mean*100:.2f}% ± {lgb_cv_std*100:.2f}%")
print(f"  Mean Gap:    {lgb_gap_mean*100:.2f}%")
print(f"  Features:    {len(FEATURES)}")

# XGBoost baseline (da aggiornare con risultati reali)
xgb_cv_mean = 0.8138  # esempio - aggiorna con risultati effettivi
xgb_gap_mean = 0.0419  # esempio - aggiorna con risultati effettivi

print(f"\nXGBoost (baseline):")
print(f"  CV Accuracy: {xgb_cv_mean*100:.2f}%")
print(f"  Mean Gap:    {xgb_gap_mean*100:.2f}%")

# Difference
diff_acc = (lgb_cv_mean - xgb_cv_mean) * 100
diff_gap = (lgb_gap_mean - xgb_gap_mean) * 100

print(f"\n{'='*50}")
print(f"Differenza (LightGBM - XGBoost):")
print(f"  Accuracy: {diff_acc:+.2f}%")
print(f"  Gap:      {diff_gap:+.2f}%")

if diff_acc > 0.3:
    print(f"\n✅ LightGBM migliore → Usa submission_lightgbm.csv")
elif diff_acc < -0.3:
    print(f"\n⚠️ XGBoost migliore → Usa submission XGBoost")
else:
    print(f"\n➡️ Performance simili → Considera ensemble")