In [4]:
!unzip -q data_uci.pgn.zip
!unzip -q stockfish.csv.zip
!unzip -q sampleSubmission.csv.zip

replace data_uci.pgn? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace stockfish.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace sampleSubmission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [3]:

!pip install python-chess

import pandas as pd
import numpy as np
import chess.pgn
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings

# NOTE: blanket suppression - every warning raised after this line is hidden.
warnings.filterwarnings('ignore')

print("Parsing PGN file to extract Elo ratings...")

# One record per game that carries both rating tags. 'Event' is the 1-based
# position of the game inside the PGN file.
elodata = []
gameid = 1
# The context manager closes the PGN handle even if parsing raises.
with open('data_uci.pgn') as pgnfile:
    while True:
        # Only header tags are read below, so parse the headers alone instead
        # of building the full move tree of every game.
        headers = chess.pgn.read_headers(pgnfile)
        if headers is None:  # end of file
            break
        try:
            whiteelo = int(headers['WhiteElo'])
            blackelo = int(headers['BlackElo'])
            elodata.append({'Event': gameid, 'WhiteElo': whiteelo, 'BlackElo': blackelo})
        except (ValueError, KeyError):
            # A rating tag is missing or not an integer: leave this game out.
            # gameid still advances so later games keep their file position.
            pass
        gameid += 1

elodf = pd.DataFrame(elodata)
print(f"Successfully extracted ratings for {len(elodf)} games.")
print(elodf.head())


print("\nLoading stockfish data and engineering features...")
stockfish = pd.read_csv('stockfish.csv')


def _parse_move_scores(raw):
    """Convert one 'MoveScores' cell into a list of ints.

    The cell is split on whitespace and every token other than 'NA' is
    converted with int(). A cell that is not a string yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    return [int(token) for token in raw.split() if token != 'NA']


stockfish['movescores'] = stockfish['MoveScores'].apply(_parse_move_scores)

def calculate_loss(scores):
    """Return the mean size of the drops between consecutive scores.

    Every adjacent pair (before, after) contributes ``before - after`` when
    that difference is strictly positive; rises and ties are ignored.

    Returns 0 when there is no drop at all, which includes sequences with
    fewer than two entries.
    """
    drops = [
        before - after
        for before, after in zip(scores, scores[1:])
        if before - after > 0
    ]
    if not drops:
        return 0
    return np.mean(drops)

# Derive three per-game features from the parsed score lists:
#   avgloss    - mean positive drop between consecutive scores
#   stddev     - np.std of the scores (0 for an empty list)
#   gamelength - number of parsed scores
move_scores = stockfish['movescores']
stockfish['avgloss'] = move_scores.map(calculate_loss)
stockfish['stddev'] = move_scores.map(lambda seq: 0 if not seq else np.std(seq))
stockfish['gamelength'] = move_scores.map(len)

print("Feature engineering complete. Sample features:")
preview_columns = ['Event', 'avgloss', 'stddev', 'gamelength']
print(stockfish[preview_columns].head())


print("\nMerging features with Elo ratings...")
# Events up to this number form the training set; later events form the test
# set. Named once so the two filters below cannot drift apart.
LAST_TRAIN_EVENT = 25000
# Inner join (pandas default): a training row without a matching 'Event' in
# elodf is dropped.
traindata = stockfish[stockfish['Event'] <= LAST_TRAIN_EVENT].merge(elodf, on='Event')
testdata = stockfish[stockfish['Event'] > LAST_TRAIN_EVENT]

print(f"Training data shape: {traindata.shape}")
print(f"Testing data shape: {testdata.shape}")

# Both rating targets are predicted from the same feature columns.
features = ['avgloss', 'stddev', 'gamelength']
xtrain = traindata[features]
ywhite = traindata['WhiteElo']
yblack = traindata['BlackElo']
xtest = testdata[features]

# Train two separate regressors on the same features: one per rating column.
print("\nTraining LightGBM models...")


def _fit_regressor(target):
    """Fit a fresh LGBMRegressor (random_state=42) on xtrain against target."""
    model = lgb.LGBMRegressor(random_state=42)
    model.fit(xtrain, target)
    return model


modelwhite = _fit_regressor(ywhite)
modelblack = _fit_regressor(yblack)

print("Models trained successfully.")


print("\nMaking predictions on the test set...")
predictionswhite = modelwhite.predict(xtest)
predictionsblack = modelblack.predict(xtest)

# Round to the nearest integer before casting: a bare astype(int) truncates
# toward zero, which shifts every positive prediction down by up to one point.
submission = pd.DataFrame({
    'Event': testdata['Event'],
    'WhiteElo': np.rint(predictionswhite).astype(int),
    'BlackElo': np.rint(predictionsblack).astype(int)
})

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully!")
print(submission.head())

Parsing PGN file to extract Elo ratings...
Successfully extracted ratings for 25000 games.
   Event  WhiteElo  BlackElo
0      1      2354      2411
1      2      2523      2460
2      3      1915      1999
3      4      2446      2191
4      5      2168      2075

Loading stockfish data and engineering features...
Feature engineering complete. Sample features:
   Event     avgloss       stddev  gamelength
0      1    9.652174    26.431587          38
1      2    7.400000    11.539487          13
2      3  302.482759  2286.131735         106
3      4   15.641026    51.525093          77
4      5   27.058824   313.263595          49

Merging features with Elo ratings...
Training data shape: (25000, 8)
Testing data shape: (25000, 6)

Training LightGBM models...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 753
[LightGBM] [Info] Number of data po