# Chess Puzzle Collection and Preprocessing

### Obtaining PGN game data of randomly picked daily puzzles

In [35]:
import json
import pandas as pd

We use the `get_random_daily_puzzle` function from the `chessdotcom` client for the Chess.com API to fetch PGN game data for chess puzzles.

In [36]:
import json
import time
from chessdotcom import get_random_daily_puzzle

# Fetch settings, kept next to the loop so this cell is self-contained.
NUM_PUZZLES = 1000          # number of random puzzles to request
REQUEST_DELAY_SECONDS = 1   # pause between consecutive requests

puzzles = []

try:
    for i in range(NUM_PUZZLES):
        if i > 0:
            # Pause between requests only; there is nothing to wait for
            # after the final one.
            time.sleep(REQUEST_DELAY_SECONDS)
        response = get_random_daily_puzzle().json
        puzzles.append(response)
finally:
    # Runs on success and on failure alike: if a request raises (or the cell
    # is interrupted) part-way through this long loop, the puzzles collected
    # so far are still written to disk, and the error then propagates as usual.
    # Nothing is written when no puzzle was fetched, so an early failure does
    # not overwrite an existing puzzles.json with an empty list.
    if puzzles:
        with open("puzzles.json", "w") as file:
            json.dump(puzzles, file, indent=4)
        print(f"Fetched and saved {len(puzzles)} puzzles.")


Fetched and saved 1000 puzzles.


Cleaning puzzle data by removing unnecessary fields.

In [37]:
# Keys that are dropped from every puzzle record
fields_to_remove = ['comments', 'url', 'publish_time', 'image']

# Strip those keys from each record's 'puzzle' dict, in place.
# A key that is not present is simply skipped.
for puzzle in (entry['puzzle'] for entry in puzzles):
    for field in fields_to_remove:
        if field in puzzle:
            del puzzle[field]

# Save the stripped-down records
with open("puzzles_cleaned.json", "w") as cleaned_file:
    json.dump(puzzles, cleaned_file, indent=4)

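Removing duplicates: puzzles that share the same title are treated as duplicates, and only the first occurrence of each title is kept.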

In [38]:
# Map each title to the first puzzle entry seen with that title.
# setdefault() leaves an existing key untouched, so later duplicates are
# ignored; dicts keep insertion order, so the original ordering is preserved.
first_entry_by_title = {}
for puzzle_entry in puzzles:
    first_entry_by_title.setdefault(puzzle_entry['puzzle']['title'], puzzle_entry)

# Titles encountered, and the de-duplicated entries in first-seen order
unique_titles = set(first_entry_by_title)
unique_puzzles = list(first_entry_by_title.values())

# Save the de-duplicated puzzles to their own JSON file
with open("unique_puzzles.json", "w") as unique_file:
    json.dump(unique_puzzles, unique_file, indent=4)