# Chess Opening Recommender : Feature Engineering

## Overview
Transform raw game data (`user_games_df`, `elite_df`) into quantitative style features.  
We’ll compute per‑game metrics, then aggregate to per‑player summaries.

In [6]:
import numpy as np
import pandas as pd
import chess
from tqdm import tqdm
from pathlib import Path

In [7]:
DATA_DIR = Path("/Users/nicholasvega/Downloads/chess-opening-recommender/src/data")

### 2.1 Extract Style Features for the User

Compute per‑game features for the target user:

In [27]:
def extract_style_features(games_df: pd.DataFrame) -> pd.DataFrame:
    """
    Given a DataFrame with columns:
      - 'moves' (list of UCI strings or iterable)
      - 'result' (string: '1-0', '0-1', '1/2-1/2')
    Compute per-game style features:
      - ply_count: total number of plies (half-moves)
      - avg_trades: number of captures per game
      - first_queen_ply: ply index of first queen move (or ply_count+1 if never moved)
      - castled_early: bool, True if castled by ply 20
      - checks: number of checks delivered
      - result_score: numeric game result (1.0=win, 0.5=draw, 0.0=loss)
    Returns:
      A new DataFrame with these features appended to the original columns.
    """
    records = []
    score_map = {"1-0": 1.0, "1/2-1/2": 0.5, "½-½": 0.5, "0-1": 0.0}

    for _, row in tqdm(games_df.iterrows(), total=len(games_df), desc="Extracting features"):
        # Coerce moves to list
        raw_moves = row['moves']
        try:
            moves = list(raw_moves)
        except TypeError:
            moves = []
        result = row.get('result', '')
        board = chess.Board()

        trades = 0
        checks = 0
        first_queen = None
        castled_ply = None

        for ply, uci in enumerate(moves, start=1):
            try:
                move = chess.Move.from_uci(uci)
            except Exception:
                continue
            if board.is_capture(move):
                trades += 1
            board.push(move)
            # queen deployment
            if first_queen is None:
                piece = board.piece_at(move.to_square)
                if piece and piece.piece_type == chess.QUEEN:
                    first_queen = ply
            # detect castling by loss of castling rights
            if castled_ply is None:
                if (not board.has_kingside_castling_rights(chess.WHITE)
                    and not board.has_queenside_castling_rights(chess.WHITE)
                    and not board.has_kingside_castling_rights(chess.BLACK)
                    and not board.has_queenside_castling_rights(chess.BLACK)):
                    castled_ply = ply
            # checks
            if board.is_check():
                checks += 1

        ply_count = len(moves)
        first_q = first_queen or (ply_count + 1)
        castled_early = bool(castled_ply and castled_ply <= 20)
        result_score = score_map.get(result, 0.0)

        rec = row.to_dict()
        rec.update({
            'ply_count': ply_count,
            'avg_trades': trades,
            'first_queen_ply': first_q,
            'castled_early': castled_early,
            'checks': checks,
            'result_score': result_score
        })
        records.append(rec)

    return pd.DataFrame(records)

Example

In [28]:
import ast 

PATH_TO_USER_GAMES = DATA_DIR / "Chessanonymous1_games.parquet"
user_games_df = pd.read_parquet(PATH_TO_USER_GAMES)

user_features_df = extract_style_features(user_games_df)
display(user_features_df.head())

Extracting features: 100%|██████████| 300/300 [00:00<00:00, 2724.87it/s]


Unnamed: 0,white,black,result,eco,opening,utc_date,utc_time,time_control,moves,ply_count,avg_trades,first_queen_ply,castled_early,checks,result_score
0,Chessanonymous1,yasinka2016,0-1,D00,Queen's Pawn Game: Levitsky Attack,2025.07.22,03:05:34,180+0,"[d2d4, d7d5, c1g5, h7h6, g5h4, c7c6, c2c3, d8b...",100,17,8,False,4,0.0
1,Jaybeth,Chessanonymous1,0-1,B12,Caro-Kann Defense: Modern Variation,2025.07.20,17:37:33,180+0,"[e2e4, c7c6, d2d4, d7d5, b1d2, a7a6, e4e5, c8f...",108,16,23,False,4,0.0
2,Chessanonymous1,BrusnacK,0-1,A40,Horwitz Defense,2025.07.20,17:34:44,180+0,"[d2d4, e7e6, c1g5, d8g5, g1f3, g5d8, e2e3, d7d...",82,20,4,False,4,0.0
3,Chessanonymous1,Countryside,0-1,D00,Queen's Pawn Game: Levitsky Attack,2025.07.20,16:34:23,180+0,"[d2d4, d7d5, c1g5, b8c6, f2f4, c8f5, e2e3, d8d...",50,9,8,False,1,0.0
4,Countryside,Chessanonymous1,1-0,A00,Kádas Opening,2025.07.20,16:29:42,180+0,"[h2h4, d7d5, h4h5, c7c6, d2d4, c8f5, g1f3, b8d...",61,13,18,False,1,1.0


### 2.2 Summarize User Style

In [29]:


def summarize_player_features(features_df: pd.DataFrame) -> pd.Series:
    """
    Aggregate per-game feature DataFrame into a single style vector (mean of each numeric feature).
    Returns a pandas Series keyed by feature name.
    """
    summary = {
        'avg_moves':        features_df['ply_count'].mean(),
        'pct_long_games':   (features_df['ply_count'] > 80).mean(),
        'avg_trades':       features_df['avg_trades'].mean(),
        'avg_queen_move':   features_df['first_queen_ply'].mean(),
        'pct_castled_early':features_df['castled_early'].mean(),
        'avg_checks':       features_df['checks'].mean(),
        'win_rate':         features_df['result_score'].mean(),
        'pct_wins':         (features_df['result_score'] == 1.0).mean(),
        'pct_draws':        (features_df['result_score'] == 0.5).mean(),
        'pct_losses':       (features_df['result_score'] == 0.0).mean(),
    }
    return pd.Series(summary)


In [30]:
user_style_vector = summarize_player_features(user_features_df)
user_style_vector

avg_moves            80.790000
pct_long_games        0.443333
avg_trades           17.223333
avg_queen_move       18.346667
pct_castled_early     0.043333
avg_checks            5.433333
win_rate              0.503333
pct_wins              0.476667
pct_draws             0.053333
pct_losses            0.470000
dtype: float64

### 2.3 Extract Style features per user in elite df

In [31]:
elite_df = pd.read_parquet(DATA_DIR / "lichess_elite_2025-05.parquet")
elite_features_df = extract_style_features(elite_df)
elite_features_df

Extracting features: 100%|██████████| 500/500 [00:00<00:00, 2766.99it/s]


Unnamed: 0,white,black,result,eco,opening,utc_date,utc_time,time_control,moves,ply_count,avg_trades,first_queen_ply,castled_early,checks,result_score
0,eNErGyOFbEiNGbOT,Nikitosik-ai,1/2-1/2,A00,Clemenz Opening,2025.05.01,00:00:15,180+0,"[h2h3, e7e5, e2e4, g8f6, b1c3, f8b4, a2a3, b4a...",98,19,44,True,6,0.5
1,Chessanonymous1,Ariel_mlr,1-0,A45,Trompowsky Attack,2025.05.01,00:00:54,180+0,"[d2d4, g8f6, c1g5, d7d5, g5f6, e7f6, e2e3, f8d...",81,16,17,False,5,1.0
2,Kyreds_pet,OlympusCz,1-0,B90,"Sicilian Defense: Najdorf Variation, English A...",2025.05.01,00:00:45,180+0,"[e2e4, c7c5, g1f3, d7d6, d2d4, c5d4, f3d4, g8f...",191,25,17,True,14,1.0
3,rtahmass,Mettigel,0-1,C72,"Ruy Lopez: Morphy Defense, Modern Steinitz Def...",2025.05.01,00:01:09,180+0,"[e2e4, e7e5, g1f3, b8c6, f1b5, a7a6, b5a4, d7d...",30,3,20,False,2,0.0
4,CruelKen,tomlesspit,1/2-1/2,D38,"Queen's Gambit Declined: Ragozin Defense, Alek...",2025.05.01,00:01:12,180+2,"[g1f3, d7d5, d2d4, g8f6, c2c4, e7e6, b1c3, f8b...",54,15,9,False,6,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Ukraine-team-creator,Nikitosik-ai,1/2-1/2,A00,Hungarian Opening,2025.05.01,02:11:52,180+0,"[g2g3, d7d5, d2d4, g8f6, a2a3, e7e6, c2c4, d5c...",65,12,9,False,1,0.5
496,Elretornodelmore,chac_sparrow,0-1,B43,"Sicilian Defense: Kan Variation, Knight Variation",2025.05.01,02:11:49,180+0,"[e2e4, c7c5, g1f3, e7e6, d2d4, c5d4, f3d4, a7a...",53,12,10,False,1,0.0
497,colinbot,Fruity23,1/2-1/2,C65,Ruy Lopez: Berlin Defense,2025.05.01,02:11:39,180+2,"[e2e4, e7e5, g1f3, b8c6, f1b5, g8f6, d2d3, f8c...",80,17,14,False,2,0.5
498,mmc1975,ShramovIgor,1-0,B01,Scandinavian Defense: Gubinsky-Melts Defense,2025.05.01,02:11:29,180+2,"[e2e4, d7d5, e4d5, d8d5, b1c3, d5d6, d2d4, g8f...",69,17,4,True,1,1.0


### 2.4 Next Steps 

With these style vectors, we can:

1. **Cluster** the elite players by their feature vectors (e.g. K‑Means) to discover style archetypes.
2. **Compute distances** between the user’s vector and each elite player’s vector to find stylistic neighbors.