In [None]:
import json
import pandas as pd
import numpy as np
import os
import importlib
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


#reload del modulo caricato la prima volta, per prendere la versione aggiornata
import utils.type_effects as type_effects
importlib.reload(type_effects)
from utils.type_effects import TYPE_EFFECTIVENESS, type_advantage
import utils.model as model
importlib.reload(model)
from utils.model import run_pca, show_graph

In [None]:
# --- Define the path to our data ---
COMPETITION_NAME = 'fds-pokemon-battles-prediction-2025'
DATA_PATH = os.path.join('../input', COMPETITION_NAME)

train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
test_file_path = os.path.join(DATA_PATH, 'test.jsonl')
train_data = []

# Read the file line by line
print(f"Loading data from '{train_file_path}'...")
try:
    with open(train_file_path, 'r') as f:
        for line in f:
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))

    print(f"Successfully loaded {len(train_data)} battles.")

    # Let's inspect the first battle to see its structure
    print("\n--- Structure of the first train battle: ---")
    if train_data:
        first_battle = train_data[0]
        
        # To keep the output clean, we can create a copy and truncate the timeline
        battle_for_display = first_battle.copy()
        battle_for_display['battle_timeline'] = battle_for_display.get('battle_timeline', [])[:2] # Show first 2 turns
        
        # Use json.dumps for pretty-printing the dictionary
        print(json.dumps(battle_for_display, indent=4))
        if len(first_battle.get('battle_timeline', [])) > 3:
            print("    ...")
            print("    (battle_timeline has been truncated for display)")


except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print("Please make sure you have added the competition data to this notebook.")

In [None]:
def create_simple_features(data: list[dict]) -> pd.DataFrame:
    feature_list = []
    #p1_bad_status_advantage = []
    status_change_diff = []
    for battle in tqdm(data, desc="Extracting features"):
        
        features = {}
        
        # --- Player 1 Team Features ---
        p1_mean_hp = p1_mean_spe = p1_mean_atk = p1_mean_def = p1_mean_sp = 0.0
        p1_lead_hp = p1_lead_spe = p1_lead_atk = p1_lead_def = p1_lead_sp = 0.0

        p1_team = battle.get('p1_team_details', [])
        if p1_team:

            p1_mean_hp = np.mean([p.get('base_hp', 0) for p in p1_team])
            p1_mean_spe = np.mean([p.get('base_spe', 0) for p in p1_team])
            p1_mean_atk = np.mean([p.get('base_atk', 0) for p in p1_team])
            p1_mean_def = np.mean([p.get('base_def', 0) for p in p1_team])
            p1_mean_sp = np.mean([p.get('base_spd', 0) for p in p1_team])

            features['p1_mean_hp'] = p1_mean_hp
            features['p1_mean_spe'] = p1_mean_spe
            features['p1_mean_atk'] = p1_mean_atk
            features['p1_mean_def'] = p1_mean_def
            features['p1_mean_sp'] = p1_mean_sp

            #PER UN CONFRONTO EQUO UTILIZZIAMO SOLO DATI DEL LEADER ANCHE NELLA SQUADRA 1 PER LE DIFFERENZE            
            p1_lead_hp =  p1_team[0].get('base_hp', 0)      
            p1_lead_spe = p1_team[0].get('base_spe', 0)  
            p1_lead_atk = p1_team[0].get('base_atk', 0)  
            p1_lead_def = p1_team[0].get('base_def', 0)  
            p1_lead_sp =  p1_team[0].get('base_spd', 0)  



        # --- Player 2 Lead Features ---
        p2_hp = p2_spe = p2_atk = p2_def = p2_sp= 0.0
        p2_lead = battle.get('p2_lead_details')
        if p2_lead:
            # Player 2's lead Pok√©mon's stats
            p2_hp = p2_lead.get('base_hp', 0)
            p2_spe = p2_lead.get('base_spe', 0)
            p2_atk = p2_lead.get('base_atk', 0)
            p2_def = p2_lead.get('base_def', 0)
            p2_sp = p2_lead.get('base_spd', 0)
            


        # I ADD THE DIFFS/DELTAS
        features['diff_hp']  = p1_lead_hp  - p2_hp
        features['diff_spe'] = p1_lead_spe - p2_spe
        features['diff_atk'] = p1_lead_atk - p2_atk
        features['diff_def'] = p1_lead_def - p2_def
        features['diff_sp'] =  p1_lead_sp - p2_sp
        


        #DYNAMIC INFO
        #Chi mantiene pi√π HP medi e conduce pi√π turni,  nella maggior parte dei casi vince anche se la battaglia non √® ancora finita
        timeline = battle.get('battle_timeline', [])
        if timeline:
            #SALUTE
            p1_hp = [t['p1_pokemon_state']['hp_pct'] for t in timeline if t.get('p1_pokemon_state')]
            p2_hp = [t['p2_pokemon_state']['hp_pct'] for t in timeline if t.get('p2_pokemon_state')]
            #salute media dei pokemon del primo giocatore
            #features['p1_mean_hp_pct'] = np.mean(p1_hp)
            #salute media dei pokemon del secondo giocatore ATTENZIONE FEATURE BUONE CORRELATE CON hp_diff_mean 75%,VALUTAZIONE DELL'EEFFETTO SU BASE SINGOLA (CON HP DIFF)
            #features['p2_mean_hp_pct'] = np.mean(p2_hp)
            #vantaggio medio in salute (media della differenza tra la salute dei pokemon del primo giocatore e quella dei pokemon del secondo giocatore)
            features['hp_diff_mean'] = np.mean(np.array(p1_hp) - np.array(p2_hp))
            #percentuale di tempo in vantaggio (ovvero media dei booleani che indicano il vantaggio => proporzione del vantaggio)
            features['p1_hp_advantage_mean'] = np.mean(np.array(p1_hp) > np.array(p2_hp))#GRAN BELLA OPZIONE DI CLASSIFICAZIONE POSSIBILE APPLICAZIONE DI EFFETTI DI ETEROGENEITA



            #SUM OF FINAL HP PERCENTAGE OF EACH PLAYER
            p1_hp_final ={}
            p2_hp_final ={}
            for t in timeline:
                if t.get('p1_pokemon_state'):
                    p1_hp_final[t['p1_pokemon_state']['name']]=t['p1_pokemon_state']['hp_pct']
                if t.get('p2_pokemon_state'):
                    p2_hp_final[t['p2_pokemon_state']['name']]=t['p2_pokemon_state']['hp_pct']
            #print(p1_hp_final)
            #numero di pockemon usati dal giocatore nei primi 30 turni
            features['p1_n_pokemon_use'] =len(p1_hp_final.keys())
            features['p2_n_pokemon_use'] =len(p2_hp_final.keys())
            #differenza nello schieramento pockemon dopo 30 turni
            features['diff_final_schieramento']=features['p1_n_pokemon_use']-features['p2_n_pokemon_use']
            nr_pokemon_sconfitti_p1 = np.sum([1 for e in list(p1_hp_final.values()) if e==0])
            nr_pokemon_sconfitti_p2 = np.sum([1 for e in list(p2_hp_final.values()) if e==0])
            features['nr_pokemon_sconfitti_p1'] = nr_pokemon_sconfitti_p1
            features['nr_pokemon_sconfitti_p2'] = nr_pokemon_sconfitti_p2
            #features['nr_pokemon_sconfitti_diff'] = nr_pokemon_sconfitti_p1-nr_pokemon_sconfitti_p2
            #DOVREBBERO ESSERE BOMBA VITA DELLE DUE SQUADRE DOPO I 30 TURNI
            features['p1_pct_final_hp'] =np.sum(list(p1_hp_final.values()))+(6-len(p1_hp_final.keys()))
            features['p2_pct_final_hp'] =np.sum(list(p2_hp_final.values()))+(6-len(p1_hp_final.keys()))
            #SAREBBE CLAMOROSO NORMALIZZARLA ANCHE IN BASE ALLA DIFFERENZA DI VITA ASSOLUTA DEI POCKEMON LEADER DEI 2 PLAYER
            features['diff_final_hp']=features['p1_pct_final_hp']-features['p2_pct_final_hp'] 









            #vedo anche come la salute media evolve nel tempo
            phases = 3 #early, mid, late game
            nr_turns = 30 #numero turni
            slice_idx = nr_turns // phases #slice index must be integer
            #print("slice_idx: ",slice_idx, "len p1_hp: ",len(p1_hp))
            features['early_hp_mean_diff'] = np.mean(np.array(p1_hp[:slice_idx]) - np.array(p2_hp[:slice_idx]))
            features['late_hp_mean_diff'] = np.mean(np.array(p1_hp[-slice_idx:]) - np.array(p2_hp[-slice_idx:]))
            #features['phases_hp_mean_diff'] = features['late_hp_mean_diff'] - features['early_hp_mean_diff']
            #77.94% (+/- 0.35%) => 77.94% (+/- 0.41%)
            hp_delta = np.array(p1_hp) - np.array(p2_hp)
            features['hp_delta_trend'] = np.polyfit(range(len(hp_delta)), hp_delta, 1)[0]
            
            #fluttuazioni negli hp (andamento della partita: stabile o molto caotica)
            #77.94% (+/- 0.41%) => 79.09% (+/- 1.02%)
            features['p1_hp_std'] = np.std(p1_hp)
            features['p2_hp_std'] = np.std(p2_hp)
            features['hp_delta_std'] = np.std(hp_delta)

            
            ##STATUS (default nostatus, gli altri sono considerati negativi - i boost sono positivi)
            p1_status = [t['p1_pokemon_state'].get('status', 'nostatus') for t in timeline if t.get('p1_pokemon_state')]
            p2_status = [t['p2_pokemon_state'].get('status', 'nostatus') for t in timeline if t.get('p2_pokemon_state')]
            total_status = set(p1_status + p2_status)
            no_effect_status = {'nostatus', 'noeffect'}
            negative_status = {s for s in total_status if s not in no_effect_status}
            #mean of negative status
            p1_negative_status_mean = np.mean([s in negative_status for s in p1_status])
            p2_negative_status_mean = np.mean([s in negative_status for s in p2_status])
            #status advantage if p1 applied more status to p2 (differenza delle medie dei negativi)
            features['p1_bad_status_advantage'] = p2_negative_status_mean-p1_negative_status_mean
            #p1_bad_status_advantage.append(features['p1_bad_status_advantage'])
            #how many times status changed? 
            # we have to check that first array shifted by 1 is 
            # different from the same array excluding the last element 
            # (so basically checking if status change in time)
            #somma il nr di volte in cui lo stato cambia, vedi se collineare
            p1_status_change = np.sum(np.array(p1_status[1:]) != np.array(p1_status[:-1]))
            p2_status_change = np.sum(np.array(p2_status[1:]) != np.array(p2_status[:-1]))
            #features['p1_status_change'] = p1_status_change
            #features['p2_status_change'] = p2_status_change
            features['status_change_diff'] = p1_status_change - p2_status_change
            status_change_diff.append(features['status_change_diff'])
            
            #QUANTO IL TEAM √® BILANCIATO (TIPI E VELOCITA)
            #79.09% (+/- 1.02%) => 79.29% (+/- 0.92%)
            p1_types = [t for p in p1_team for t in p.get('types', []) if t != 'notype']
            features['p1_type_diversity'] = len(set(p1_types))

            MEDIUM_SPEED_THRESHOLD = 90 #medium-speed pokemon
            HIGH_SPEED_THRESHOLD = 100 #fast pokemon
            speeds = np.array([p.get('base_spe', 0) for p in p1_team])
            features['p1_avg_speed_stat_battaglia'] = np.mean(np.array(speeds) > MEDIUM_SPEED_THRESHOLD)
            features['p1_avg_high_speed_stat_battaglia'] = np.mean(np.array(speeds) > HIGH_SPEED_THRESHOLD)


            #COMBINAZIONI DI FEATURE
            #combino vantaggio negli hp con l'avere pochi status negativi
            #79.09% (+/- 1.02%) => 79.13% (+/- 1.06%)
            #features['hp_advantage_no_negative_status'] = features['hp_delta_trend'] * (1 - p1_negative_status_mean)
            #LA FEATURE √® BELLA MA SUPER CORRELATA CON hp_delta_trend 95%
            #per il momento semplifico cosi capiamo poi √® facile aggiungere

        # We also need the ID and the target variable (if it exists)
        features['battle_id'] = battle.get('battle_id')
        if 'player_won' in battle:
            features['player_won'] = int(battle['player_won'])
            
        feature_list.append(features)

    return pd.DataFrame(feature_list).fillna(0)




# Create feature DataFrames for both training and test sets
print("Processing training data...")
train_df = create_simple_features(train_data)

print("\nProcessing test data...")
test_data = []
with open(test_file_path, 'r') as f:
    for line in f:
        test_data.append(json.loads(line))
test_df = create_simple_features(test_data)

print("\nTraining features preview:")
display(train_df.head())
print("\nForma del dataset",train_df.shape)

#phases_hp
#hp_delta_trend

SCALING

In [None]:
# Create scaler
scaler = StandardScaler()

# Select numerical feature columns (exclude 'player_won')
feature_columns = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]
X = train_df[feature_columns]
X_train = scaler.fit_transform(X)

train_df_scaled = pd.DataFrame(X_train, columns=feature_columns)
train_df_scaled['battle_id'] = train_df['battle_id']
train_df_scaled['player_won'] = train_df['player_won']



RISTRUTTURAZIONE E VISONE DEL DATASET

In [None]:
train_df_scaled.shape#shape mean that data is like matrix
train_df_scaled.info()#description of variable

print(train_df_scaled['diff_final_hp'].describe(include='all'))
print(train_df_scaled['p1_pct_final_hp'].describe(include='all'))
print(train_df_scaled['p2_pct_final_hp'].describe(include='all'))



In [None]:
#ATTENZIONE LANCIA SOLO 1 VOLTA ALTRIMENTI CONTINUA A TOGLIERE COLONNE DAL DATASET !!!! PERICOLOSO
#MA NON SO COME FARE
print(train_df_scaled.iloc[:,0:5].head())
pca_p1, pca_model_p1=run_pca(train_df_scaled.iloc[:,0:5])
print(pca_model_p1.explained_variance_ratio_)#show the behavior of variance absorbation
print(pca_model_p1.components_)#rinomino variabile come, competitivita, vita vs chi attacca prima, speed,attack vs defence
pca_p1.columns = ['p1_mean_competivness', 'p1_mean_hp_vs_start','p1_mean_start_vs_speed','p1_mean_atk_vs_def']


In [None]:
print(train_df_scaled.iloc[:,5:10].head())
pca_diff, pca_model_diff=run_pca(train_df_scaled.iloc[:,5:10])
print(pca_model_diff.explained_variance_ratio_)#show the behavior of variance absorbation
print(pca_model_diff.components_)#rinomino variabile come, competitivita, vita vs chi attacca prima, speed,attack vs defence
pca_diff.columns = ['diff_mean_competivness', 'diff_mean_hp_vs_start','diff_mean_start_vs_speed','diff_mean_atk_vs_def']


In [None]:
#RICOMPONI IL DATASET INCORRELATO
support=train_df_scaled.iloc[:,10:]
train_df_scaled=pd.concat([pca_p1,pca_diff, support], axis=1)
print(train_df_scaled.shape)#corretta ricostruzione del dataset verificata
print(train_df_scaled.columns)

In [None]:
train_df_scaled.describe()
corr_matrix = train_df_scaled.corr(numeric_only=True).round(1)
print(corr_matrix.shape)
pd.set_option('display.max_columns', None)   # mostra tutte le colonne
pd.set_option('display.max_rows', None)      # mostra tutte le righe
pd.set_option('display.width', 0)            # evita line wrapping
pd.set_option('display.max_colwidth', None)  # mostra nomi completi delle colonne


#attenzione mostra solo colonne tanto correlate nel dataset
# 1Ô∏è‚É£ Compute correlation matrix and filter strong correlations
mask_strong = (corr_matrix.abs() > 0.5) & (corr_matrix.abs() < 1.0)
strong_features = corr_matrix.columns[mask_strong.any()].tolist()
subset_corr = corr_matrix.loc[strong_features, strong_features]

# 2Ô∏è‚É£ Dynamically adjust figure size based on number of features
n = len(subset_corr.columns)
fig_width = max(12, n * 0.6)
fig_height = max(10, n * 0.6)

# 3Ô∏è‚É£ Plot with larger figure, higher DPI, and better layout
plt.figure(figsize=(fig_width, fig_height), dpi=150)
sns.heatmap(
    subset_corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    linewidths=0.5,        # small grid lines for separation
    cbar_kws={'shrink': 0.8}
)

plt.title("Strong Feature Correlations (|r| > 0.5)", fontsize=18, pad=20)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(rotation=0, fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
#working with standardize we give more weight to variable that have absolute value in start smaller than other, example diff variable
# Define features and target
features = [col for col in train_df_scaled.columns if col not in ['battle_id', 'player_won']]
X = train_df_scaled[features]
print(X.shape)
y = train_df_scaled['player_won']

### STUDIAMO GRAFICAMENTE FEATURES E OUTPUT E TRA FEATURE

In [None]:
"""
print(train_df['player_won'].value_counts())#victory classes ar perfectly balance
#best case for accuracy calcolation 
#show_graph(train_df_scaled[['p1_mean_competivness','battle_id']],train_df_scaled[['player_won']])
show_graph(train_df_scaled[['diff_final_hp','diff_mean_competivness']],train_df_scaled[['player_won']])
"""

In [None]:
"""
print(train_df_scaled.columns)
#si cominciano ad intravedere effettive distribuzioni bivariate con eterogeneita tra gruppi di vincita e perdita
show_graph(train_df_scaled[['diff_final_schieramento','diff_final_hp']],train_df_scaled[['player_won']])
"""

### PolynomialFeatures 
crea nuove feature come potenze e relazioni tra le feature numeriche originali, per vedere relazioni non lineari; il modello cattura curvature e relazioni mantenendo la linearit√† nei parametri => purtroppo aggiunge $\binom{n+d}{n}$ feature (n=numero feature originali e d=degree=2 o altro intero) e aumenta il tempo di addestramento => valuta se usare una PCA per gestire l'esplosione dimensionale.

Usalo solo se le feature originali sono informative

### TRAIN AND SUBMIT

In [None]:
"""
# Initialize and train the model
print("Training a simple Logistic Regression model...")
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)
print("Model training complete.")

# Evaluate on validation set
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
"""
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# PCA?
USE_PCA = False
POLY_ENABLED = False# se enabled 77.64% (+/- 0.69%) altrimenti 77.94% (+/- 0.35%)

steps = []
if POLY_ENABLED:
    steps.append(("poly", PolynomialFeatures(degree=2, include_bias=False)))
#standardizza
steps.append(("scaler", StandardScaler()))
if USE_PCA:
    steps.append(("pca", PCA(n_components=0.95, svd_solver="full")))  # ~95% varianza

steps.append(("logreg", LogisticRegression(max_iter=2000, random_state=42)))

pipe = Pipeline(steps)

#kfold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold CV
print("Training Logistic Regression con 5-Fold Cross-Validation...\n")
scores = cross_val_score(pipe, X, y, cv=kfold, scoring='accuracy', n_jobs=-1)
print(f"Cross-validation accuracies: {np.round(scores, 4)}")
print(f"Mean CV accuracy: {np.mean(scores)*100:.2f}% (+/- {np.std(scores)*100:.2f}%)")

#Training finale
pipe.fit(X, y)
print("\nFinal model trained on all training data.")
#sm.logisticModel(tran_df[feate1,feature2,feature2**2,fature1*feature2])

### SUBMIT

In [None]:
# Make predictions on the real test data
X_test = test_df[features]
print("Generating predictions on the test set...")
test_predictions = pipe.predict(X_test)

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'battle_id': test_df['battle_id'],
    'player_won': test_predictions
})

# Save submission CSV
submission_df.to_csv('submission.csv', index=False)
print("\n'submission.csv' file created successfully!")
display(submission_df.head())

In [11]:
from scipy.stats import linregress
import json
import pandas as pd
import numpy as np
import os
import importlib
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


#reload del modulo caricato la prima volta, per prendere la versione aggiornata
import utils.type_effects as type_effects
importlib.reload(type_effects)
#from utils.type_effects import TYPE_EFFECTIVENESS, type_advantage
import utils.model as model
importlib.reload(model)
from utils.model import run_pca, show_graph
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
# --- Define the path to our data ---
COMPETITION_NAME = 'fds-pokemon-battles-prediction-2025'
DATA_PATH = os.path.join('../input', COMPETITION_NAME)

train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
test_file_path = os.path.join(DATA_PATH, 'test.jsonl')
train_data = []

# Read the file line by line
print(f"Loading data from '{train_file_path}'...")
try:
    with open(train_file_path, 'r') as f:
        for line in f:
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))

    print(f"Successfully loaded {len(train_data)} battles.")

    # Let's inspect the first battle to see its structure
    print("\n--- Structure of the first train battle: ---")
    if train_data:
        first_battle = train_data[0]
        
        # To keep the output clean, we can create a copy and truncate the timeline
        battle_for_display = first_battle.copy()
        battle_for_display['battle_timeline'] = battle_for_display.get('battle_timeline', [])[:2] # Show first 2 turns
        
        # Use json.dumps for pretty-printing the dictionary
        print(json.dumps(battle_for_display, indent=4))
        if len(first_battle.get('battle_timeline', [])) > 3:
            print("    ...")
            print("    (battle_timeline has been truncated for display)")


except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print("Please make sure you have added the competition data to this notebook.")
# Simplified Pok√©mon type effectiveness chart
# Values: 2.0 = super effective, 0.5 = not very effective, 0.0 = no effect
type_chart = {
    "Normal":     {"Rock":0.5, "Ghost":0.0, "Steel":0.5},
    "Fire":       {"Fire":0.5, "Water":0.5, "Grass":2.0, "Ice":2.0, "Bug":2.0, "Rock":0.5, "Dragon":0.5, "Steel":2.0},
    "Water":      {"Fire":2.0, "Water":0.5, "Grass":0.5, "Ground":2.0, "Rock":2.0, "Dragon":0.5},
    "Electric":   {"Water":2.0, "Electric":0.5, "Grass":0.5, "Ground":0.0, "Flying":2.0, "Dragon":0.5},
    "Grass":      {"Fire":0.5, "Water":2.0, "Grass":0.5, "Poison":0.5, "Ground":2.0, "Flying":0.5, "Bug":0.5, "Rock":2.0, "Dragon":0.5, "Steel":0.5},
    "Ice":        {"Fire":0.5, "Water":0.5, "Grass":2.0, "Ground":2.0, "Flying":2.0, "Dragon":2.0, "Steel":0.5},
    "Fighting":   {"Normal":2.0, "Ice":2.0, "Rock":2.0, "Dark":2.0, "Steel":2.0, "Poison":0.5, "Flying":0.5, "Psychic":0.5, "Bug":0.5, "Ghost":0.0, "Fairy":0.5},
    "Poison":     {"Grass":2.0, "Poison":0.5, "Ground":0.5, "Rock":0.5, "Ghost":0.5, "Steel":0.0, "Fairy":2.0},
    "Ground":     {"Fire":2.0, "Electric":2.0, "Grass":0.5, "Poison":2.0, "Flying":0.0, "Bug":0.5, "Rock":2.0, "Steel":2.0},
    "Flying":     {"Electric":0.5, "Grass":2.0, "Fighting":2.0, "Bug":2.0, "Rock":0.5, "Steel":0.5},
    "Psychic":    {"Fighting":2.0, "Poison":2.0, "Psychic":0.5, "Dark":0.0, "Steel":0.5},
    "Bug":        {"Fire":0.5, "Grass":2.0, "Fighting":0.5, "Poison":0.5, "Flying":0.5, "Psychic":2.0, "Ghost":0.5, "Dark":2.0, "Steel":0.5, "Fairy":0.5},
    "Rock":       {"Fire":2.0, "Ice":2.0, "Fighting":0.5, "Ground":0.5, "Flying":2.0, "Bug":2.0, "Steel":0.5},
    "Ghost":      {"Normal":0.0, "Psychic":2.0, "Ghost":2.0, "Dark":0.5},
    "Dragon":     {"Dragon":2.0, "Steel":0.5, "Fairy":0.0},
    "Dark":       {"Fighting":0.5, "Psychic":2.0, "Ghost":2.0, "Fairy":0.5},
    "Steel":      {"Fire":0.5, "Water":0.5, "Electric":0.5, "Ice":2.0, "Rock":2.0, "Fairy":2.0, "Steel":0.5},
    "Fairy":      {"Fire":0.5, "Fighting":2.0, "Poison":0.5, "Dragon":2.0, "Dark":2.0, "Steel":0.5}
}

#0 Mean CV accuracy: 83.59% (+/- 0.64%)
#1 Mean CV accuracy: 83.62% (+/- 0.67%)
def hp_advantage_trend(battle):
    """Compute the linear slope of Player 1's HP advantage over 30 turns."""
    hp_adv = []
    for turn in battle['battle_timeline']:
        p1_hp = turn['p1_pokemon_state']['hp_pct']
        p2_hp = turn['p2_pokemon_state']['hp_pct']
        hp_adv.append(p1_hp - p2_hp)
    x = np.arange(len(hp_adv))
    slope, _, _, _, _ = linregress(x, hp_adv)
    return slope

#2 Mean CV accuracy: 83.55% (+/- 0.63%)
"""
A small decrease (‚âà 0.05 %) means:
The new status_duration_ratio feature adds noise but not independent signal.
The effect of bad statuses is already encoded in your existing features ‚Äî especially p1_bad_status_advantage and status_change_diff, both of which have strong correlations (¬±0.5 ‚Äì 0.6) with the target.
Logistic Regression, being linear, can‚Äôt easily ‚Äúdisentangle‚Äù two highly collinear features, so adding another version of the same concept just shifts weights slightly.
"""
def status_duration_ratio(battle):
    p1_turns_bad = sum(turn['p1_pokemon_state']['status'] != 'nostatus' for turn in battle['battle_timeline'])
    p2_turns_bad = sum(turn['p2_pokemon_state']['status'] != 'nostatus' for turn in battle['battle_timeline'])
    return (p2_turns_bad + 1) / (p1_turns_bad + 1)
#3 Pre-battle info
#Mean CV accuracy: 83.59% (+/- 0.64%)
"""
Your feature space is already highly saturated with the key linear signals ‚Äî the Logistic Regression can‚Äôt extract more because:
Many of your engineered variables (HP diffs, status advantages, competitiveness metrics) are strongly correlated with one another.
Logistic Regression can only assign one weight per direction of correlation, so redundant inputs don‚Äôt add discriminative power.
You‚Äôre probably sitting near the model‚Äôs linear ceiling for this dataset (~83‚Äì84%).
This is good news ‚Äî it means your data cleaning and baseline engineering are solid.
Now, the final 2‚Äì3% (to reach 86%) likely requires non-linear relationships or interaction modeling.
"""
def team_stat_variance(team):
    """Average variance of base stats within Player 1's team."""
    stats = ['base_hp', 'base_atk', 'base_def', 'base_spa', 'base_spd', 'base_spe']
    # Compute variance of each stat across the 6 Pok√©mon, then take the mean
    return float(np.mean([np.var([p[s] for p in team]) for s in stats]))

def team_type_diversity(team):
    """Proportion of distinct Pok√©mon types in the team."""
    all_types = [t for p in team for t in p['types'] if t != 'notype']
    if not all_types:
        return 0.0
    return len(set(all_types)) / len(all_types)

def lead_stat_diff(p1_team, p2_lead):
    """Average stat difference between Player 1's team mean and Player 2's lead Pok√©mon."""
    stats = ['base_hp', 'base_atk', 'base_def', 'base_spa', 'base_spd', 'base_spe']
    p1_mean = np.mean([[p[s] for s in stats] for p in p1_team], axis=0)
    p2_stats = np.array([p2_lead[s] for s in stats])
    return float(np.mean(p1_mean - p2_stats))

#5 p1_move_damage_mean, p2_move_damage_mean, diff_move_damage_mean
def move_damage_efficiency(battle):
    moves = [t.get('p1_move_details', {}) for t in battle['battle_timeline'] if t.get('p1_move_details')]
    if not moves: return 0
    total_damage = sum(m.get('damage', 0) for m in moves)
    return total_damage / len(moves)  # avg damage per move
#6 p1_first_strike_ratio, tempo_advantage = first_strike_ratio - 0.5
"""
Cross-validation accuracies: [0.844  0.827  0.831  0.8375 0.835 ]
Mean CV accuracy: 83.49% (+/- 0.58%)

1. Logistic regression doesn‚Äôt automatically benefit from new features
Even though new features (like p1_move_damage_mean or tempo_advantage) sound informative,
your model is a linear classifier with L1 penalty.
If the new variables:
are correlated with existing ones (hp_diff_mean, diff_final_hp, etc.),
or noisy (e.g. move damage varying randomly per battle),
then logistic regression‚Äôs L1 regularization will zero them out,
or worse ‚Äî slightly destabilize weights and hurt generalization by adding variance.
That‚Äôs why performance went from 83.52 ‚Üí 83.49% ‚Äî statistically flat, but slightly worse.
‚öôÔ∏è 2. These specific features overlap conceptually with existing ones
New Feature	Likely Redundant With	Explanation
p1_move_damage_mean, diff_move_damage_mean	diff_final_hp, hp_diff_mean	Both describe how much damage one side deals over time.
p1_first_strike_ratio, tempo_advantage	hp_delta_trend	Both encode ‚Äúinitiative‚Äù: who tends to lead or inflict earlier HP drops.
So they add little new orthogonal signal ‚Äî linear models can‚Äôt combine correlated predictors effectively.
üß© 3. Why it doesn‚Äôt mean they‚Äôre useless
These features might be valuable for:
nonlinear models (Random Forest, Gradient Boosting, XGBoost)
interaction terms (e.g. PolynomialFeatures, cross terms)
or temporal clustering if used with per-turn modeling.
But in your current setup (L1 LogisticRegressionCV), adding correlated variables = minor penalty.
"""
def first_strike_ratio(battle):
    timeline = battle.get("battle_timeline", [])
    if len(timeline) < 2:
        return 0.0

    p1_hp = [t["p1_pokemon_state"]["hp_pct"] for t in timeline if t.get("p1_pokemon_state")]
    p2_hp = [t["p2_pokemon_state"]["hp_pct"] for t in timeline if t.get("p2_pokemon_state")]

    # approximate: if p2_hp decreases before p1_hp does, p1 attacked first
    p1_first_hits = sum((p2_hp[i] - p2_hp[i+1]) > (p1_hp[i] - p1_hp[i+1]) for i in range(len(p1_hp)-1))
    ratio = p1_first_hits / (len(p1_hp) - 1)
    return ratio

#7 hp_momentum_flips, hp_flip_rate = flips / len(timeline)
"""perf drop
Cross-validation accuracies: [0.841  0.8285 0.832  0.8385 0.8355]
Mean CV accuracy: 83.51% (+/- 0.45%)
hp_momentum_flips is a temporal volatility feature ‚Äî it counts how many times the HP advantage flips between the two players.
It measures ‚Äúmomentum chaos‚Äù: stable battles (few flips) vs. back-and-forth ones (many flips).

Why performance drops when adding more features
You‚Äôre already near the linear ceiling.
Logistic regression is extracting essentially all the linear signal your data has ‚Äî roughly 83.6 ¬± 0.6 % looks like your model‚Äôs upper bound.
Additional features that are noisy or weakly correlated tend to only add variance, not information.
New features like hp_momentum_flips and hp_flip_rate are non-linear effects.
They‚Äôre based on the sequence dynamics of battles, which a linear model can‚Äôt capture well unless the relationship with player_won is monotonic.
Logistic regression assumes a smooth, single-direction effect ‚Äî but volatility in battles isn‚Äôt monotonic:
Some flips = competitive battle (either could win)
Too few flips = complete dominance (p1 wins or loses early)
That kind of U-shape is invisible to linear models.
L1 regularization doesn‚Äôt help if new variables are weakly correlated.
Even though the L1 penalty tries to zero out irrelevant features, slight noise in the cross-validation folds can make the coefficients dance a bit ‚Üí small accuracy drop.
‚öôÔ∏è What this means for you
‚úÖ Your base feature set is solid ‚Äî most meaningful information is already encoded in HP, final differences, and status metrics.
‚ö†Ô∏è New dynamic features (momentum, tempo, move damage) will help only if you switch to:
tree-based models (RandomForest, XGBoost, LightGBM), or
polynomial / interaction expansion of selected features.
"""
def hp_momentum_flips(battle):
    timeline = battle.get("battle_timeline", [])
    if len(timeline) < 2:
        return 0.0
    hp_adv = np.array([
        t.get("p1_pokemon_state", {}).get("hp_pct", 0) -
        t.get("p2_pokemon_state", {}).get("hp_pct", 0)
        for t in timeline
    ])
    # Count how many times the advantage sign changes
    flips = np.sum(np.sign(hp_adv[1:]) != np.sign(hp_adv[:-1]))
    return float(flips)

#8 p1_lead_type_advantage, p2_lead_type_advantage, diff_type_advantage
def type_effectiveness(attacker_types, defender_types):
    """Compute average type effectiveness multiplier between attacker and defender."""
    if not attacker_types or not defender_types:
        return 1.0  # neutral if missing

    values = []
    for a in attacker_types:
        if a not in type_chart:
            continue
        for d in defender_types:
            values.append(type_chart[a].get(d, 1.0))  # default to neutral (1.0)
    return np.mean(values) if values else 1.0

#9 battle_duration, hp_loss_rate = diff_final_hp / battle_duration
def battle_duration(battle):
    return len([t for t in battle['battle_timeline'] if t['p1_pokemon_state']['hp_pct'] > 0 and
                                                     t['p2_pokemon_state']['hp_pct'] > 0])
#10 p1_team_imbalance, p2_team_imbalance, diff_team_imbalance
"""
Why battle_duration and hp_loss_rate helped
These two features add time-normalized efficiency, which Logistic Regression can model linearly:
Feature	Meaning	Why it helps
battle_duration	How long both sides were active	Distinguishes between decisive vs drawn-out matches
hp_loss_rate	HP difference per turn	Captures speed of dominance ‚Äî short + large HP gap = strong linear win signal
They add orthogonal information to your HP and final-state metrics (diff_final_hp, hp_delta_trend), improving linear separability.
"""
def team_stat_imbalance(team):
    totals = [p['base_hp']+p['base_atk']+p['base_def']+p['base_spa']+p['base_spd']+p['base_spe'] for p in team]
    return np.std(totals) / np.mean(totals)
"""
attack_to_speed_ratio = diff_atk / (diff_spe + 1e-6)
hp_control_ratio = hp_diff_mean / hp_delta_std
status_pressure = status_change_diff / (hp_momentum_flips + 1)
"""
def create_simple_features(data: list[dict]) -> pd.DataFrame:
    feature_list = []
    #p1_bad_status_advantage = []
    status_change_diff = []
    for battle in tqdm(data, desc="Extracting features"):

        features = {}

        # --- Player 1 Team Features ---
        p1_mean_hp = p1_mean_spe = p1_mean_atk = p1_mean_def = p1_mean_sp = 0.0
        p1_lead_hp = p1_lead_spe = p1_lead_atk = p1_lead_def = p1_lead_sp = 0.0

        p1_team = battle.get('p1_team_details', [])
        if p1_team:

            p1_mean_hp = np.mean([p.get('base_hp', 0) for p in p1_team])
            p1_mean_spe = np.mean([p.get('base_spe', 0) for p in p1_team])
            p1_mean_atk = np.mean([p.get('base_atk', 0) for p in p1_team])
            p1_mean_def = np.mean([p.get('base_def', 0) for p in p1_team])
            p1_mean_sp = np.mean([p.get('base_spd', 0) for p in p1_team])

            features['p1_mean_hp'] = p1_mean_hp
            features['p1_mean_spe'] = p1_mean_spe
            features['p1_mean_atk'] = p1_mean_atk
            features['p1_mean_def'] = p1_mean_def
            features['p1_mean_sp'] = p1_mean_sp

            #PER UN CONFRONTO EQUO UTILIZZIAMO SOLO DATI DEL LEADER ANCHE NELLA SQUADRA 1 PER LE DIFFERENZE
            p1_lead_hp =  p1_team[0].get('base_hp', 0)
            p1_lead_spe = p1_team[0].get('base_spe', 0)
            p1_lead_atk = p1_team[0].get('base_atk', 0)
            p1_lead_def = p1_team[0].get('base_def', 0)
            p1_lead_sp =  p1_team[0].get('base_spd', 0)



        # --- Player 2 Lead Features ---
        p2_hp = p2_spe = p2_atk = p2_def = p2_sp= 0.0
        p2_lead = battle.get('p2_lead_details')
        if p2_lead:
            # Player 2's lead Pok√©mon's stats
            p2_hp = p2_lead.get('base_hp', 0)
            p2_spe = p2_lead.get('base_spe', 0)
            p2_atk = p2_lead.get('base_atk', 0)
            p2_def = p2_lead.get('base_def', 0)
            p2_sp = p2_lead.get('base_spd', 0)



        # I ADD THE DIFFS/DELTAS
        features['diff_hp']  = p1_lead_hp  - p2_hp
        features['diff_spe'] = p1_lead_spe - p2_spe
        features['diff_atk'] = p1_lead_atk - p2_atk
        features['diff_def'] = p1_lead_def - p2_def
        features['diff_sp'] =  p1_lead_sp - p2_sp

        #8 new
        """
        # --- Type advantage (lead vs. lead) ---
        p1_lead_types = [t for t in p1_team[0].get("types", []) if t != "notype"] if p1_team else []
        p2_lead_types = [t for t in p2_lead.get("types", []) if t != "notype"] if p2_lead else []

        features["p1_lead_type_advantage"] = type_effectiveness(p1_lead_types, p2_lead_types)
        features["p2_lead_type_advantage"] = type_effectiveness(p2_lead_types, p1_lead_types)
        features["diff_type_advantage"] = (
            features["p1_lead_type_advantage"] - features["p2_lead_type_advantage"]
        )
        """
        #new2
        #features['status_duration_ratio'] = status_duration_ratio(battle)

        #new3 - static feat
        """
        features['p1_team_stat_variance'] = team_stat_variance(battle['p1_team_details'])
        features['p1_type_diversity_ratio'] = team_type_diversity(battle['p1_team_details'])
        features['lead_stat_diff'] = lead_stat_diff(battle['p1_team_details'], battle['p2_lead_details'])
        """
        #DYNAMIC INFO
        #Chi mantiene pi√π HP medi e conduce pi√π turni,  nella maggior parte dei casi vince anche se la battaglia non √® ancora finita
        timeline = battle.get('battle_timeline', [])
        if timeline:
            #SALUTE
            p1_hp = [t['p1_pokemon_state']['hp_pct'] for t in timeline if t.get('p1_pokemon_state')]
            p2_hp = [t['p2_pokemon_state']['hp_pct'] for t in timeline if t.get('p2_pokemon_state')]
            #salute media dei pokemon del primo giocatore
            #features['p1_mean_hp_pct'] = np.mean(p1_hp)
            #salute media dei pokemon del secondo giocatore ATTENZIONE FEATURE BUONE CORRELATE CON hp_diff_mean 75%,VALUTAZIONE DELL'EEFFETTO SU BASE SINGOLA (CON HP DIFF)
            #features['p2_mean_hp_pct'] = np.mean(p2_hp)
            #vantaggio medio in salute (media della differenza tra la salute dei pokemon del primo giocatore e quella dei pokemon del secondo giocatore)
            features['hp_diff_mean'] = np.mean(np.array(p1_hp) - np.array(p2_hp))
            # TEST5V--- MOVE DAMAGE EFFICIENCY (uses helper) ---
            features['p1_move_damage_mean'] = move_damage_efficiency({
                'battle_timeline': [
                    t for t in timeline if t.get('p1_move_details')
                ]
            })
            features['p2_move_damage_mean'] = move_damage_efficiency({
                'battle_timeline': [
                    t for t in timeline if t.get('p2_move_details')
                ]
            })
            features['diff_move_damage_mean'] = (
                features['p1_move_damage_mean'] - features['p2_move_damage_mean']
            )
            """6 (performance drop)
            # --- FIRST STRIKE RATIO / TEMPO ADVANTAGE (uses helper) ---
            features['p1_first_strike_ratio'] = first_strike_ratio(battle)
            features['tempo_advantage'] = features['p1_first_strike_ratio'] - 0.5
            """
            #FINE TEST5V
            """
            # TEST5--- MOVE DAMAGE EFFICIENCY ---
            # Compute average damage per move for each player
            p1_moves = [t.get('p1_move_details', {}) for t in timeline if t.get('p1_move_details')]
            p2_moves = [t.get('p2_move_details', {}) for t in timeline if t.get('p2_move_details')]

            if p1_moves:
                p1_total_damage = sum(m.get('damage', 0) for m in p1_moves)
                features['p1_move_damage_mean'] = p1_total_damage / len(p1_moves)
            else:
                features['p1_move_damage_mean'] = 0.0

            if p2_moves:
                p2_total_damage = sum(m.get('damage', 0) for m in p2_moves)
                features['p2_move_damage_mean'] = p2_total_damage / len(p2_moves)
            else:
                features['p2_move_damage_mean'] = 0.0

            # Relative advantage: difference in mean damage dealt per move
            features['diff_move_damage_mean'] = (
                features['p1_move_damage_mean'] - features['p2_move_damage_mean']
            )
            #FINE  TEST5
            """

            #percentuale di tempo in vantaggio (ovvero media dei booleani che indicano il vantaggio => proporzione del vantaggio)
            features['p1_hp_advantage_mean'] = np.mean(np.array(p1_hp) > np.array(p2_hp))#GRAN BELLA OPZIONE DI CLASSIFICAZIONE POSSIBILE APPLICAZIONE DI EFFETTI DI ETEROGENEITA



            #SUM OF FINAL HP PERCENTAGE OF EACH PLAYER
            p1_hp_final ={}
            p2_hp_final ={}
            for t in timeline:
                if t.get('p1_pokemon_state'):
                    p1_hp_final[t['p1_pokemon_state']['name']]=t['p1_pokemon_state']['hp_pct']
                if t.get('p2_pokemon_state'):
                    p2_hp_final[t['p2_pokemon_state']['name']]=t['p2_pokemon_state']['hp_pct']
            #print(p1_hp_final)
            #numero di pockemon usati dal giocatore nei primi 30 turni
            features['p1_n_pokemon_use'] =len(p1_hp_final.keys())
            features['p2_n_pokemon_use'] =len(p2_hp_final.keys())
            #differenza nello schieramento pockemon dopo 30 turni
            features['diff_final_schieramento']=features['p1_n_pokemon_use']-features['p2_n_pokemon_use']
            nr_pokemon_sconfitti_p1 = np.sum([1 for e in list(p1_hp_final.values()) if e==0])
            nr_pokemon_sconfitti_p2 = np.sum([1 for e in list(p2_hp_final.values()) if e==0])
            features['nr_pokemon_sconfitti_p1'] = nr_pokemon_sconfitti_p1
            features['nr_pokemon_sconfitti_p2'] = nr_pokemon_sconfitti_p2
            #features['nr_pokemon_sconfitti_diff'] = nr_pokemon_sconfitti_p1-nr_pokemon_sconfitti_p2
            #DOVREBBERO ESSERE BOMBA VITA DELLE DUE SQUADRE DOPO I 30 TURNI
            features['p1_pct_final_hp'] =np.sum(list(p1_hp_final.values()))+(6-len(p1_hp_final.keys()))
            features['p2_pct_final_hp'] =np.sum(list(p2_hp_final.values()))+(6-len(p1_hp_final.keys()))
            #SAREBBE CLAMOROSO NORMALIZZARLA ANCHE IN BASE ALLA DIFFERENZA DI VITA ASSOLUTA DEI POCKEMON LEADER DEI 2 PLAYER
            features['diff_final_hp']=features['p1_pct_final_hp']-features['p2_pct_final_hp']

            #9 new
            # --- Battle duration and HP loss rate ---
            try:
                dur = battle_duration(battle)
            except Exception:
                dur = 0
            features["battle_duration"] = dur
            features["hp_loss_rate"] = (
                features["diff_final_hp"] / dur if dur > 0 else 0.0
            )

            







            #vedo anche come la salute media evolve nel tempo
            phases = 3 #early, mid, late game
            nr_turns = 30 #numero turni
            slice_idx = nr_turns // phases #slice index must be integer
            #print("slice_idx: ",slice_idx, "len p1_hp: ",len(p1_hp))
            features['early_hp_mean_diff'] = np.mean(np.array(p1_hp[:slice_idx]) - np.array(p2_hp[:slice_idx]))
            features['late_hp_mean_diff'] = np.mean(np.array(p1_hp[-slice_idx:]) - np.array(p2_hp[-slice_idx:]))
            
            
            

            #features['phases_hp_mean_diff'] = features['late_hp_mean_diff'] - features['early_hp_mean_diff']
            #77.94% (+/- 0.35%) => 77.94% (+/- 0.41%)
            hp_delta = np.array(p1_hp) - np.array(p2_hp)
            features['hp_delta_trend'] = np.polyfit(range(len(hp_delta)), hp_delta, 1)[0]
            #new
            features['hp_advantage_trend'] = hp_advantage_trend(battle)
            #6 new --- HP momentum (number of times advantage flips) ---
            """
            features['hp_momentum_flips'] = hp_momentum_flips(battle)
            features['hp_flip_rate'] = features['hp_momentum_flips'] / max(1, len(timeline))
            """
            #fluttuazioni negli hp (andamento della partita: stabile o molto caotica)
            #77.94% (+/- 0.41%) => 79.09% (+/- 1.02%)
            features['p1_hp_std'] = np.std(p1_hp)
            features['p2_hp_std'] = np.std(p2_hp)
            features['hp_delta_std'] = np.std(hp_delta)

            #9.2 new
            # --- Efficiency & stability metrics (#10) ---
            # 1Ô∏è‚É£ Damage per turn: how much advantage P1 gains in HP per turn
            features["damage_per_turn"] = (
                features["diff_move_damage_mean"] / max(1, features["battle_duration"])
            )
            """
            # 2Ô∏è‚É£ Early lead ratio: does early advantage translate into final dominance
            features["early_lead_ratio"] = (
                features["early_hp_mean_diff"] / (abs(features["diff_final_hp"]) + 1e-6)
            )

            # 3Ô∏è‚É£ HP stability ratio: volatility normalized by duration
            features["hp_stability_ratio"] = (
                features["hp_delta_std"] / max(1, features["battle_duration"])
            )
            """
            #fine 9.2

            ##STATUS (default nostatus, gli altri sono considerati negativi - i boost sono positivi)
            p1_status = [t['p1_pokemon_state'].get('status', 'nostatus') for t in timeline if t.get('p1_pokemon_state')]
            p2_status = [t['p2_pokemon_state'].get('status', 'nostatus') for t in timeline if t.get('p2_pokemon_state')]
            total_status = set(p1_status + p2_status)
            no_effect_status = {'nostatus', 'noeffect'}
            negative_status = {s for s in total_status if s not in no_effect_status}
            #mean of negative status
            p1_negative_status_mean = np.mean([s in negative_status for s in p1_status])
            p2_negative_status_mean = np.mean([s in negative_status for s in p2_status])
            #status advantage if p1 applied more status to p2 (differenza delle medie dei negativi)
            features['p1_bad_status_advantage'] = p2_negative_status_mean-p1_negative_status_mean
            #p1_bad_status_advantage.append(features['p1_bad_status_advantage'])
            #how many times status changed?
            # we have to check that first array shifted by 1 is
            # different from the same array excluding the last element
            # (so basically checking if status change in time)
            #somma il nr di volte in cui lo stato cambia, vedi se collineare
            p1_status_change = np.sum(np.array(p1_status[1:]) != np.array(p1_status[:-1]))
            p2_status_change = np.sum(np.array(p2_status[1:]) != np.array(p2_status[:-1]))
            #features['p1_status_change'] = p1_status_change
            #features['p2_status_change'] = p2_status_change
            features['status_change_diff'] = p1_status_change - p2_status_change
            status_change_diff.append(features['status_change_diff'])

            #QUANTO IL TEAM √® BILANCIATO (TIPI E VELOCITA)
            #79.09% (+/- 1.02%) => 79.29% (+/- 0.92%)
            p1_types = [t for p in p1_team for t in p.get('types', []) if t != 'notype']
            features['p1_type_diversity'] = len(set(p1_types))

            MEDIUM_SPEED_THRESHOLD = 90 #medium-speed pokemon
            HIGH_SPEED_THRESHOLD = 100 #fast pokemon
            speeds = np.array([p.get('base_spe', 0) for p in p1_team])
            features['p1_avg_speed_stat_battaglia'] = np.mean(np.array(speeds) > MEDIUM_SPEED_THRESHOLD)
            features['p1_avg_high_speed_stat_battaglia'] = np.mean(np.array(speeds) > HIGH_SPEED_THRESHOLD)


            #COMBINAZIONI DI FEATURE
            #combino vantaggio negli hp con l'avere pochi status negativi
            #79.09% (+/- 1.02%) => 79.13% (+/- 1.06%)
            #features['hp_advantage_no_negative_status'] = features['hp_delta_trend'] * (1 - p1_negative_status_mean)
            #LA FEATURE √® BELLA MA SUPER CORRELATA CON hp_delta_trend 95%
            #per il momento semplifico cosi capiamo poi √® facile aggiungere

        # We also need the ID and the target variable (if it exists)
        features['battle_id'] = battle.get('battle_id')
        if 'player_won' in battle:
            features['player_won'] = int(battle['player_won'])

        feature_list.append(features)

    return pd.DataFrame(feature_list).fillna(0)




# Create feature DataFrames for both training and test sets
print("Processing training data...")
train_df = create_simple_features(train_data)

print("\nProcessing test data...")
test_data = []
with open(test_file_path, 'r') as f:
    for line in f:
        test_data.append(json.loads(line))
test_df = create_simple_features(test_data)

print("\nTraining features preview:")
display(train_df.head())
print("\nForma del dataset",train_df.shape)

######UNICA CELLA?step2
# Create scaler
scaler = StandardScaler()

# Select numerical feature columns (exclude 'player_won')
feature_columns = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]
X = train_df[feature_columns]
X_train = scaler.fit_transform(X)

train_df_scaled = pd.DataFrame(X_train, columns=feature_columns)
train_df_scaled['battle_id'] = train_df['battle_id']
train_df_scaled['player_won'] = train_df['player_won']


#step3
#working with standardize we give more weight to variable that have absolute value in start smaller than other, example diff variable
# Define features and target
features = [col for col in train_df_scaled.columns if col not in ['battle_id', 'player_won']]
X = train_df_scaled[features]
print(X.shape)
y = train_df_scaled['player_won']
#step4
import time
start = time.perf_counter()
"""
# Initialize and train the model
print("Training a simple Logistic Regression model...")
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)
print("Model training complete.")

# Evaluate on validation set
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
"""


# Add just above your pipeline
USE_L1_SELECTION = True

# PCA?
USE_PCA = False
POLY_ENABLED = False#False#True se enabled 77.64% (+/- 0.69%) altrimenti 77.94% (+/- 0.35%)
steps = []



"""
interaction_features = [
    "hp_diff_mean",
    "p1_pct_final_hp",
    "p1_bad_status_advantage",
    "status_change_diff",
    "diff_final_schieramento",
    "p1_hp_advantage_mean"
]

# Polynomial transformer only on selected columns
poly = ColumnTransformer(
    transformers=[
        ("poly_subset", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False), interaction_features)
    ],
    remainder='passthrough'  # keep all other features unchanged
)


if POLY_ENABLED:
    steps.append(("poly_subset", poly))
"""
if POLY_ENABLED:
    steps.append(("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)))

#standardizza
steps.append(("scaler", StandardScaler()))
if USE_PCA:
    steps.append(("pca", PCA(n_components=0.95, svd_solver="full")))  # ~95% varianza

#steps.append(("logreg", LogisticRegression(max_iter=2000, random_state=42)))
"""
Cross-validation accuracies: [0.845  0.829  0.832  0.8405 0.8355]
Mean CV accuracy: 83.64% (+/- 0.58%)

a modest but real improvement (‚âà +0.05 %) and, more importantly, reduced variance across folds (¬± 0.58 % ‚Üí better generalization).
That confirms two things:
Your feature set is strong ‚Äî the model is already extracting nearly all available linear signal.
L1 regularization is cleaning up minor redundancy, giving you a slightly more stable model.
"""
Cs_grid = [0.01, 0.03, 0.08,0.1,0.2, 0.3, 1, 3, 10]
if USE_L1_SELECTION:
    logreg = LogisticRegressionCV(
        Cs=Cs_grid,#20,
        cv=5,
        penalty="l1",
        solver="liblinear",
        scoring="accuracy",
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    )
else:
    logreg = LogisticRegression(
        max_iter=2000,
        random_state=42,
        C=1.0,
        penalty="l2",
        solver="liblinear"
    )

steps.append(("logreg", logreg))
pipe = Pipeline(steps)

#kfold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold CV
print("Training Logistic Regression con 5-Fold Cross-Validation...\n")
#print(X)
X = train_df_scaled[features]
print(X.shape)

"""
scores = cross_val_score(pipe, X, y, cv=kfold, scoring='accuracy', n_jobs=-1)
print(f"Cross-validation accuracies: {np.round(scores, 4)}")
print(f"Mean CV accuracy: {np.mean(scores)*100:.2f}% (+/- {np.std(scores)*100:.2f}%)")

#Training finale
pipe.fit(X, y)
print("\nFinal model trained on all training data.")
print(logreg.C_ if logreg.C_ else "not found")
#sm.logisticModel(tran_df[feate1,feature2,feature2**2,fature1*feature2])
"""

features = [col for col in train_df_scaled.columns if col not in ['battle_id', 'player_won']]
print(features)



def sequential_forward_selection(pipe, X, y, kfold, max_features=10):
    remaining_features = [col for col in train_df_scaled.columns if col not in ['battle_id', 'player_won']]#list(X.columns)
    selected_features = []
    best_scores = []
    current_best_score = 0

    print(f"\nüîç Sequential Forward Selection (up to {max_features} features)\n")

    for step in range(max_features):
        step_results = []

        # Try adding each remaining feature and evaluate CV accuracy
        for f in remaining_features:
            candidate_features = selected_features + [f]
            X_subset = X[candidate_features]
            scores = cross_val_score(pipe, X_subset, y, cv=kfold, scoring='accuracy', n_jobs=-1)
            mean_acc = np.mean(scores)
            step_results.append((f, mean_acc))

        # Pick the best new feature
        best_feature, best_score = max(step_results, key=lambda x: x[1])
        improvement = best_score - current_best_score

        print(f"Step {step+1}: added '{best_feature}' ‚Üí mean CV: {best_score*100:.2f}% (Œî={improvement*100:.2f}%)")

        # Update selections
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        best_scores.append(best_score)
        current_best_score = best_score

    # Return final results
    results_df = pd.DataFrame({
        "n_features": range(1, len(best_scores)+1),
        "cv_accuracy": np.array(best_scores)*100
    })
    return selected_features, results_df
Greedy = False
Tree = False#True
if Greedy:
  selected, results_df = sequential_forward_selection(pipe, train_df_scaled, y, kfold, max_features=10)

  print("\n‚úÖ Selected features in order of addition:")
  print(selected)

  print("\nüìà Accuracy progression:")
  print(results_df)
elif Tree:
  scaler = StandardScaler()
  X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
  from sklearn.ensemble import RandomForestClassifier

  rf = RandomForestClassifier(
      n_estimators=200,
      max_depth=5,
      random_state=42
  )
  rf.fit(X_scaled, y)
  importances = pd.Series(rf.feature_importances_, index=X_scaled.columns)
  top_rf = importances.sort_values(ascending=False).head(10)
  print([k for k in top_rf.keys()])
else:
  #candidate_features = ['status_change_diff', 'diff_final_schieramento']#, 'p1_pct_final_hp', 'p1_bad_status_advantage', 'nr_pokemon_sconfitti_p1']
  #candidate_features = ['status_change_diff', 'diff_final_schieramento', 'p1_pct_final_hp', 'p1_bad_status_advantage', 'nr_pokemon_sconfitti_p1', 'p2_n_pokemon_use', 'diff_final_hp', 'hp_diff_mean', 'p1_hp_std', 'late_hp_mean_diff']
  #candidate_features = ['status_change_diff', 'diff_final_schieramento', 'p1_pct_final_hp', 'p1_bad_status_advantage', 'nr_pokemon_sconfitti_p1', 'p2_n_pokemon_use', 'diff_final_hp', 'hp_diff_mean', 'p1_hp_std', 'late_hp_mean_diff', 'p2_hp_std', 'p1_n_pokemon_use', 'p1_hp_advantage_mean', 'p2_pct_final_hp', 'early_hp_mean_diff']
  #tutte
  candidate_features = features
  X_subset = X[candidate_features]
  #scores = cross_val_score(pipe, X_subset, y, cv=kfold, scoring='accuracy', n_jobs=-1)
  #mean_acc = np.mean(scores)
  #print(candidate_features, scores, mean_acc)

  scores = cross_val_score(pipe, X_subset, y, cv=kfold, scoring='accuracy', n_jobs=-1)
  print(f"Cross-validation accuracies: {np.round(scores, 4)}")
  print(f"Mean CV accuracy: {np.mean(scores)*100:.2f}% (+/- {np.std(scores)*100:.2f}%)")
  print(f"candidate len: {len(candidate_features)}")
  print("Training finale")
  pipe.fit(X, y)
elapsed = time.perf_counter() - start
print(f"‚è±Ô∏è Elapsed time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")

#phases_hp
#hp_delta_trend

Loading data from '../input/fds-pokemon-battles-prediction-2025/train.jsonl'...
Successfully loaded 10000 battles.

--- Structure of the first train battle: ---
{
    "player_won": true,
    "p1_team_details": [
        {
            "name": "starmie",
            "level": 100,
            "types": [
                "psychic",
                "water"
            ],
            "base_hp": 60,
            "base_atk": 75,
            "base_def": 85,
            "base_spa": 100,
            "base_spd": 100,
            "base_spe": 115
        },
        {
            "name": "exeggutor",
            "level": 100,
            "types": [
                "grass",
                "psychic"
            ],
            "base_hp": 95,
            "base_atk": 95,
            "base_def": 85,
            "base_spa": 125,
            "base_spd": 125,
            "base_spe": 55
        },
        {
            "name": "chansey",
            "level": 100,
            "types": [
                "normal",

Extracting features:   0%|          | 0/10000 [00:00<?, ?it/s]


Processing test data...


Extracting features:   0%|          | 0/5000 [00:00<?, ?it/s]


Training features preview:


Unnamed: 0,p1_mean_hp,p1_mean_spe,p1_mean_atk,p1_mean_def,p1_mean_sp,diff_hp,diff_spe,diff_atk,diff_def,diff_sp,...,p2_hp_std,hp_delta_std,damage_per_turn,p1_bad_status_advantage,status_change_diff,p1_type_diversity,p1_avg_speed_stat_battaglia,p1_avg_high_speed_stat_battaglia,battle_id,player_won
0,115.833333,80.0,72.5,63.333333,100.0,0,0,0,0,0,...,0.280925,0.368726,0.0,0.333333,-1,4,0.5,0.5,0,1
1,123.333333,61.666667,72.5,65.833333,90.0,10,-25,0,-10,-40,...,0.241442,0.378163,0.0,-0.2,5,5,0.333333,0.166667,1,1
2,124.166667,65.833333,84.166667,71.666667,90.0,-155,5,90,80,20,...,0.212332,0.334522,0.0,-0.033333,6,7,0.333333,0.333333,2,1
3,121.666667,75.833333,77.5,65.833333,103.333333,-15,0,-35,-35,60,...,0.369359,0.421204,0.0,-0.5,4,7,0.5,0.333333,3,1
4,114.166667,72.5,75.833333,79.166667,97.5,-5,5,-25,-40,35,...,0.281241,0.348348,0.0,0.433333,-5,5,0.333333,0.333333,4,1



Forma del dataset (10000, 40)
(10000, 38)
Training Logistic Regression con 5-Fold Cross-Validation...

(10000, 38)
['p1_mean_hp', 'p1_mean_spe', 'p1_mean_atk', 'p1_mean_def', 'p1_mean_sp', 'diff_hp', 'diff_spe', 'diff_atk', 'diff_def', 'diff_sp', 'hp_diff_mean', 'p1_move_damage_mean', 'p2_move_damage_mean', 'diff_move_damage_mean', 'p1_hp_advantage_mean', 'p1_n_pokemon_use', 'p2_n_pokemon_use', 'diff_final_schieramento', 'nr_pokemon_sconfitti_p1', 'nr_pokemon_sconfitti_p2', 'p1_pct_final_hp', 'p2_pct_final_hp', 'diff_final_hp', 'battle_duration', 'hp_loss_rate', 'early_hp_mean_diff', 'late_hp_mean_diff', 'hp_delta_trend', 'hp_advantage_trend', 'p1_hp_std', 'p2_hp_std', 'hp_delta_std', 'damage_per_turn', 'p1_bad_status_advantage', 'status_change_diff', 'p1_type_diversity', 'p1_avg_speed_stat_battaglia', 'p1_avg_high_speed_stat_battaglia']
Cross-validation accuracies: [0.845  0.828  0.8375 0.841  0.8365]
Mean CV accuracy: 83.76% (+/- 0.57%)
candidate len: 38
Training finale
‚è±Ô∏è Elaps

In [12]:
# Make predictions on the real test data
X_test = test_df[features]
print("Generating predictions on the test set...")
test_predictions = pipe.predict(X_test)

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'battle_id': test_df['battle_id'],
    'player_won': test_predictions
})

# Save submission CSV
submission_df.to_csv('submission.csv', index=False)
print("\n'submission.csv' file created successfully!")
display(submission_df.head())

Generating predictions on the test set...

'submission.csv' file created successfully!


Unnamed: 0,battle_id,player_won
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
