In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv("/mnt/d/datasets/puzzles/lichess_db_puzzle.csv")
df = df[["FEN", "Moves", "Rating", "Themes"]]
df = df[df["Rating"] < 3000]
df["Bucket"] = df["Rating"] // 100
df

Unnamed: 0,FEN,Moves,Rating,Themes,Bucket
0,r6k/pp2r2p/4Rp1Q/3p4/8/1N1P2R1/PqP2bPP/7K b - ...,f2g3 e6e7 b2b1 b3c1 b1c1 h6c1,1995,crushing hangingPiece long middlegame,19
1,5rk1/1p3ppp/pq3b2/8/8/1P1Q1N2/P4PPP/3R2K1 w - ...,d3d6 f8d8 d6d8 f6d8,1452,advantage endgame short,14
2,8/4R3/1p2P3/p4r2/P6p/1P3Pk1/4K3/8 w - - 1 64,e7f7 f5e5 e2f1 e5e6,1276,advantage endgame rookEndgame short,12
3,r2qr1k1/b1p2ppp/pp4n1/P1P1p3/4P1n1/B2P2Pb/3NBP...,b6c5 e2g4 h3g4 d1g4,1099,advantage middlegame short,10
4,8/8/4k1p1/2KpP2p/5PP1/8/8/8 w - - 0 53,g4h5 g6h5 f4f5 e6e5 f5f6 e5f6,1556,crushing endgame long pawnEndgame,15
...,...,...,...,...,...
3764374,3r3k/p5pp/8/5R2/1BQ1p3/P3q3/Bb4PP/6K1 w - - 0 28,g1f1 d8d1 b4e1 e3e1,1025,mate mateIn2 middlegame short,10
3764375,5Q2/pp3R1P/1kpp4/4p3/2P1P3/3PP2P/Pr2q3/2K5 w -...,f7f2 b2c2 c1b1 e2d1,1674,endgame mate mateIn2 queensideAttack short,16
3764376,r3kb1r/ppp2ppp/2n5/3q3b/3P1B2/5N1P/PPP3P1/RN1Q...,c6d4 f1e1 e8d8 b1c3 d4f3 g2f3,2507,advantage long opening,25
3764377,r2q1rk1/4bppp/p1n1pn2/1p1pN3/2pP2b1/1PP1P3/PBQ...,g4f5 e5c6 f5c2 c6d8 c2b1 d8c6,2256,crushing hangingPiece long middlegame,22


In [3]:
from itertools import chain
unique_lists_in_items = df.Themes.str.split(" ").apply(lambda x: frozenset(x)).unique().tolist()
all_themes = set(chain(*unique_lists_in_items))
all_themes

{'advancedPawn',
 'advantage',
 'anastasiaMate',
 'arabianMate',
 'attackingF2F7',
 'attraction',
 'backRankMate',
 'bishopEndgame',
 'bodenMate',
 'capturingDefender',
 'castling',
 'clearance',
 'crushing',
 'defensiveMove',
 'deflection',
 'discoveredAttack',
 'doubleBishopMate',
 'doubleCheck',
 'dovetailMate',
 'enPassant',
 'endgame',
 'equality',
 'exposedKing',
 'fork',
 'hangingPiece',
 'hookMate',
 'interference',
 'intermezzo',
 'kingsideAttack',
 'knightEndgame',
 'long',
 'master',
 'masterVsMaster',
 'mate',
 'mateIn1',
 'mateIn2',
 'mateIn3',
 'mateIn4',
 'mateIn5',
 'middlegame',
 'oneMove',
 'opening',
 'pawnEndgame',
 'pin',
 'promotion',
 'queenEndgame',
 'queenRookEndgame',
 'queensideAttack',
 'quietMove',
 'rookEndgame',
 'sacrifice',
 'short',
 'skewer',
 'smotheredMate',
 'superGM',
 'trappedPiece',
 'underPromotion',
 'veryLong',
 'xRayAttack',
 'zugzwang'}

In [4]:
# sample at least 100 puzzles for each theme
rows = []
for theme in tqdm(all_themes):
    rows.append(df[df.Themes.str.contains(theme)].sample(100))
df_themes = pd.concat(rows)

# sample at least 100 puzzles for each rating bucket
df_ratings = df.groupby("Bucket").sample(100)

# remove duplicates
df_sampled = pd.concat([df_themes, df_ratings]).drop_duplicates()
df_sampled = df_sampled.sample(frac=1).reset_index(drop=True) # shuffle

100%|██████████| 60/60 [01:03<00:00,  1.05s/it]


In [5]:
df_downsampled = df_sampled.copy()

# try to reduce the number of rows by removing puzzles and checking the counts
for i in tqdm(range(df_downsampled.shape[0])):
    # remove index i
    df_without_i = df_downsampled.drop(df_downsampled.iloc[i].name)

    still_ok = True

    # check if the number of unique puzzles is at least 100
    for theme in all_themes:
        if len(df_without_i[df_without_i.Themes.str.contains(theme)]) < 100:
            still_ok = False
            break
    # check if the number of buckets is at least 100
    if df_without_i.groupby("Bucket").count().min()[0] < 100:
        still_ok = False
    
    if still_ok:
        df_downsampled = df_without_i

df_downsampled
# the error below is expected, run the next cell

 56%|█████▋    | 4897/8691 [15:04<11:40,  5.41it/s] 


IndexError: single positional indexer is out-of-bounds

In [6]:
sub_df = df_downsampled[["FEN", "Moves", "Rating", "Themes"]]
sub_df.to_csv("puzzles.csv", index=False, header=False)