In [1]:
import json
import pandas as pd
import os

# --- Define the path to our data ---
COMPETITION_NAME = 'fds-pokemon-battles-prediction-2025'
DATA_PATH = os.path.join('../input', COMPETITION_NAME)

train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
test_file_path = os.path.join(DATA_PATH, 'test.jsonl')
train_data = []

# Number of timeline turns kept when pretty-printing a battle
PREVIEW_TURNS = 2


def truncate_timeline_for_display(battle: dict, max_turns: int = PREVIEW_TURNS) -> tuple[dict, bool]:
    """Build a shallow copy of ``battle`` whose timeline is cut to ``max_turns`` entries.

    Args:
        battle: One parsed battle record. A missing 'battle_timeline' key is
            treated as an empty timeline.
        max_turns: Maximum number of timeline entries to keep in the copy.

    Returns:
        A ``(preview, was_truncated)`` pair. ``preview`` is a shallow copy of
        ``battle`` with 'battle_timeline' replaced by its first ``max_turns``
        entries; ``was_truncated`` is True only when entries were dropped.
        The input dictionary itself is not modified.
    """
    timeline = battle.get('battle_timeline', [])
    preview = battle.copy()
    preview['battle_timeline'] = timeline[:max_turns]
    # Compare against the same limit used for slicing so the notice is
    # shown exactly when something was cut off.
    return preview, len(timeline) > max_turns


# Read the file line by line
print(f"Loading data from '{train_file_path}'...")
try:
    with open(train_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of letting json.loads fail on them
            if not line.strip():
                continue
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))

    print(f"Successfully loaded {len(train_data)} battles.")

    # Let's inspect the first battle to see its structure
    print("\n--- Structure of the first train battle: ---")
    if train_data:
        first_battle = train_data[0]

        # To keep the output clean, print a copy with a shortened timeline
        battle_for_display, was_truncated = truncate_timeline_for_display(first_battle)

        # Use json.dumps for pretty-printing the dictionary
        print(json.dumps(battle_for_display, indent=4))
        if was_truncated:
            print("    ...")
            print("    (battle_timeline has been truncated for display)")

except FileNotFoundError:
    # Best-effort: report the missing file rather than raising; train_data stays empty
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print("Please make sure you have added the competition data to this notebook.")

Loading data from '../input/fds-pokemon-battles-prediction-2025/train.jsonl'...
Successfully loaded 10000 battles.

--- Structure of the first train battle: ---
{
    "player_won": true,
    "p1_team_details": [
        {
            "name": "starmie",
            "level": 100,
            "types": [
                "psychic",
                "water"
            ],
            "base_hp": 60,
            "base_atk": 75,
            "base_def": 85,
            "base_spa": 100,
            "base_spd": 100,
            "base_spe": 115
        },
        {
            "name": "exeggutor",
            "level": 100,
            "types": [
                "grass",
                "psychic"
            ],
            "base_hp": 95,
            "base_atk": 95,
            "base_def": 85,
            "base_spa": 125,
            "base_spd": 125,
            "base_spe": 55
        },
        {
            "name": "chansey",
            "level": 100,
            "types": [
                "normal",

In [2]:
from tqdm.notebook import tqdm
import numpy as np

# enemy_def = moves that deal 1/2x damage to other Pokémon
# enemy_atk = moves that deal 2x damage to other Pokémon

def types_mult(team: dict) -> dict:
    """Placeholder for type-matchup features; not implemented yet.

    The body only builds a partial type chart (entries for "normal" and
    "fire") and does not use it or the ``team`` argument. Despite the
    ``dict`` return annotation, the function currently returns None.

    Meaning of the chart keys, for a Pokémon of the given type:
        no_effect: move types that deal 0x damage to it
        def_weak:  move types that deal 2x damage to it
        atk_weak:  move types that deal 1/2x damage to it
    """
    normal_entry = {"no_effect": ["ghost"], "def_weak": ["fighting"], "atk_weak": []}
    fire_entry = {
        "no_effect": [],
        "def_weak": ["water", "ground", "rock"],
        "atk_weak": ["fire", "grass", "bug"],
    }
    # Built but not consumed yet: the feature computation is still to be written
    types_dict = {"normal": normal_entry, "fire": fire_entry}
    return None

def static_features(battle: dict) -> dict:
    """Extract turn-independent features from a single battle record.

    The returned dictionary always contains the same 13 keys, so every
    battle contributes an identical schema regardless of which parts of the
    record are present:

    * ``p1_mean_<stat>``: mean of ``base_<stat>`` over 'p1_team_details'
    * ``p2_lead_<stat>``: ``base_<stat>`` of 'p2_lead_details'
    * ``spe_diff``: base speed of player 1's Pokémon named in the first
      timeline entry minus the base speed of player 2's lead

    ``<stat>`` is one of hp, spe, atk, def, spd, spa. Any feature whose
    inputs are missing (no team, no lead, empty timeline, or no team member
    matching the first-turn name) is left at zero.

    Args:
        battle: One parsed battle record.

    Returns:
        A flat mapping from feature name to numeric value.
    """
    stats = ["hp", "spe", "atk", "def", "spd", "spa"]

    # Pre-populate every key so the schema never depends on the data;
    # zero matches what a later fillna(0) would have produced for gaps.
    features = {f'p1_mean_{stat}': 0.0 for stat in stats}
    features.update({f'p2_lead_{stat}': 0 for stat in stats})
    features['spe_diff'] = 0.0

    # --- Player 1 Team Features ---
    # `or` also covers a key that is present but null
    p1_team = battle.get('p1_team_details') or []
    if p1_team:
        for stat in stats:
            features[f'p1_mean_{stat}'] = np.mean([p.get(f'base_{stat}', 0) for p in p1_team])

    # --- Player 2 Lead Features ---
    p2_lead = battle.get('p2_lead_details') or {}
    if p2_lead:
        for stat in stats:
            features[f'p2_lead_{stat}'] = p2_lead.get(f'base_{stat}', 0)

    # --- Lead matchup (from first turn) ---
    battle_timeline = battle.get('battle_timeline') or []
    if p1_team and p2_lead and battle_timeline:
        first_turn = battle_timeline[0]
        # Guard against a null state so .get('name') cannot raise AttributeError
        p1_state = first_turn.get('p1_pokemon_state') or {}
        p1_pokemon_name = p1_state.get('name')

        # Find the team member whose name matches the first-turn Pokémon
        p1_pokemon = next((p for p in p1_team if p.get('name') == p1_pokemon_name), None)

        if p1_pokemon:
            features['spe_diff'] = p1_pokemon.get('base_spe', 0) - features['p2_lead_spe']

    return features

def dynamic_features(battle: dict) -> dict:
    """Placeholder for timeline-based features; not implemented yet.

    The ``battle`` argument is ignored and, despite the ``dict`` return
    annotation, the function currently returns None.
    """
    return None

def create_simple_features(data: list[dict]) -> pd.DataFrame:
    """
    Turn a list of battle records into one feature row per battle.

    Each row holds the output of ``static_features`` plus the battle's
    'battle_id' and, when the record carries a label, 'player_won' cast to
    int. Missing values in the assembled frame are replaced with 0.
    """
    rows = []
    for record in tqdm(data, desc="Extracting features"):
        # Start from the static features, then attach identifier and label
        row = dict(static_features(record))
        row['battle_id'] = record.get('battle_id')

        # Only labelled records carry the target
        if 'player_won' in record:
            row['player_won'] = int(record['player_won'])

        rows.append(row)

    frame = pd.DataFrame(rows)
    return frame.fillna(0)

# Build the training feature table from the battles loaded earlier
print("Processing training data...")
train_df = create_simple_features(train_data)

# Parse the test file (one JSON object per line), then featurize it the same way
print("\nProcessing test data...")
test_data = []
with open(test_file_path, 'r') as test_file:
    test_data.extend(json.loads(raw_line) for raw_line in test_file)
test_df = create_simple_features(test_data)

# Show the first rows of the training features as a sanity check
print("\nTraining features preview:")
display(train_df.head())

Processing training data...


Extracting features:   0%|          | 0/10000 [00:00<?, ?it/s]


Processing test data...


Extracting features:   0%|          | 0/5000 [00:00<?, ?it/s]


Training features preview:


Unnamed: 0,p1_mean_hp,p1_mean_spe,p1_mean_atk,p1_mean_def,p1_mean_spd,p1_mean_spa,p2_lead_hp,p2_lead_spe,p2_lead_atk,p2_lead_def,p2_lead_spd,p2_lead_spa,spe_diff,battle_id,player_won
0,115.833333,80.0,72.5,63.333333,100.0,100.0,60,115,75,85,100,100,0,0,1
1,123.333333,61.666667,72.5,65.833333,90.0,90.0,55,120,50,45,135,135,-25,1,1
2,124.166667,65.833333,84.166667,71.666667,90.0,90.0,250,50,5,5,105,105,5,2,1
3,121.666667,75.833333,77.5,65.833333,103.333333,103.333333,75,110,100,95,70,70,0,3,1
4,114.166667,72.5,75.833333,79.166667,97.5,97.5,60,115,75,85,100,100,5,4,1


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold stratified split, shuffled with a fixed seed for reproducibility
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define our features (X) and target (y)
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]
X_train = train_df[features]
y_train = train_df['player_won']

# split() yields one (train_index, test_index) pair per fold; report the fold sizes.
# enumerate() already advances `count`, so no manual increment is needed.
for count, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    print(f'Fold:{count}, Train set: {len(train_index)}, Test set:{len(test_index)}')


# Select the same feature columns, in the same order, for the test set
X_test = test_df[features]

# Initialize the model and estimate its accuracy with cross-validation.
# NOTE: cross_val_score fits clones of the estimator, so `model` itself is
# still unfitted after this cell and must be fitted before predicting.
model = LogisticRegression(random_state=42, max_iter=1000)
score = cross_val_score(model, X_train, y_train, cv=kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {score.mean():.2f}')

print("Cross validation complete.")

Fold:0, Train set: 8000, Test set:2000
Fold:1, Train set: 8000, Test set:2000
Fold:2, Train set: 8000, Test set:2000
Fold:3, Train set: 8000, Test set:2000
Fold:4, Train set: 8000, Test set:2000
Scores for each fold are: [0.543  0.537  0.5525 0.5205 0.5375]
Average score: 0.54
Cross validation complete.


In [4]:
# Fit the logistic regression on the full training set
print("Training a simple Logistic Regression model...")
model.fit(X_train, y_train)
print("Model training complete.")


# Predict a label for every battle in the test set
print("Generating predictions on the test set...")
test_predictions = model.predict(X_test)

# Assemble the submission table: one row per test battle
submission_columns = {
    'battle_id': test_df['battle_id'],
    'player_won': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write the table to CSV without the index column
submission_df.to_csv('submission.csv', index=False)

print("\n'submission.csv' file created successfully!")
display(submission_df.head())

Training a simple Logistic Regression model...
Model training complete.
Generating predictions on the test set...

'submission.csv' file created successfully!


Unnamed: 0,battle_id,player_won
0,0,0
1,1,0
2,2,0
3,3,1
4,4,1
