In [5]:
import os
import chess.pgn # might need to pip install 'chess'
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, count, col, regexp_extract
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
import zstandard as zstd
import io


In [6]:
file_path = '/sfs/gpfs/tardis/home/zrc3hc/Chess/lichess_db_standard_rated_2024-08.pgn.zst'
# png = 'portable game notation' which is standard formate for chess games.

In [8]:
# Checking if file can be accessed 

if os.path.exists(file_path):
    print("File exists and can be accessed.")
else:
    print("File does not exist or cannot be accessed.")

File exists and can be accessed.


In [9]:
# Decompress and read the PGN file
with open(file_path, 'rb') as compressed_file:
    dctx = zstd.ZstdDecompressor()
    
    # Decompress into a memory buffer
    with dctx.stream_reader(compressed_file) as reader:
        decompressed_stream = io.TextIOWrapper(reader, encoding='utf-8')
        
        # Read the first few lines
        for _ in range(20):
            line = decompressed_stream.readline()
            if line:
                print(line.strip())
            else:
                break

[Event "Rated Bullet game"]
[Site "https://lichess.org/nQ1xYNSF"]
[Date "2024.08.01"]
[Round "-"]
[White "kingskreamer"]
[Black "mysteryvabs"]
[Result "1-0"]
[UTCDate "2024.08.01"]
[UTCTime "00:00:09"]
[WhiteElo "2148"]
[BlackElo "2155"]
[WhiteRatingDiff "+6"]
[BlackRatingDiff "-6"]
[ECO "B10"]
[Opening "Caro-Kann Defense: Accelerated Panov Attack"]
[TimeControl "60+0"]
[Termination "Time forfeit"]

1. e4 { [%clk 0:01:00] } 1... c6 { [%clk 0:01:00] } 2. c4 { [%clk 0:00:59] } 2... d5 { [%clk 0:01:00] } 3. cxd5 { [%clk 0:00:59] } 3... cxd5 { [%clk 0:01:00] } 4. exd5 { [%clk 0:00:58] } 4... Qxd5 { [%clk 0:00:59] } 5. Nc3 { [%clk 0:00:58] } 5... Qd8 { [%clk 0:00:59] } 6. Bc4 { [%clk 0:00:58] } 6... Nf6 { [%clk 0:00:59] } 7. Qb3 { [%clk 0:00:57] } 7... e6 { [%clk 0:00:58] } 8. Nf3 { [%clk 0:00:57] } 8... Nc6 { [%clk 0:00:57] } 9. Bb5 { [%clk 0:00:55] } 9... Bd7 { [%clk 0:00:57] } 10. O-O { [%clk 0:00:54] } 10... Rc8 { [%clk 0:00:56] } 11. Re1 { [%clk 0:00:52] } 11... a6 { [%clk 0:00:56] } 1

### PGN Headers

- **[Event "Rated Bullet game"]** - The event is a rated bullet chess game.
- **[Site "https://lichess.org/nQ1xYNSF"]** - The game URL.
- **[Date "2024.08.01"]** - The date when the game occurred.
- **[Round "-"]** - Likely indicates if repeated games were played.
- **[White "kingskreamer"]** - The white player.
- **[Black "mysteryvabs"]** - The black player.
- **[Result "1-0"]** - The result of the game (White won).
- **[UTCDate "2024.08.01"]** - The date of the game in UTC.
- **[UTCTime "00:00:09"]** - The time of the game in UTC.
- **[WhiteElo "2148"]** - White player's Elo rating before the game.
- **[BlackElo "2155"]** - Black player's Elo rating before the game.
- **[WhiteRatingDiff "+6"]** - Changes in White's Elo after the game (+6).
- **[BlackRatingDiff "-6"]** - Changes in Black's Elo after the game (-6).
- **[ECO "B10"]** - 'B10' refers to a specific opening (Caro-Kann Defense).
- **[Opening "Caro-Kann Defense: Accelerated Panov Attack"]** - The specific opening played.
- **[TimeControl "60+0"]** - The time control was 60 seconds per player with 0 seconds increment.
- **[Termination "Time forfeit"]** - The game ended because one player ran out of time.


### 2. The Moves
This part records the actual moves played in the game. Chess moves are written using standard algebraic notation, and the clocks of both players are tracked after each move (using the `[%clk]` annotation).

For example:

- **1. e4 { [%clk 0:01:00] }**: White plays pawn to e4, and their clock is at 1 minute.
- **1... c6 { [%clk 0:01:00] }**: Black responds with c6 (a pawn move), and their clock remains at 1 minute.
- **2. c4 { [%clk 0:00:59] }**: White moves pawn to c4, with their clock now at 59 seconds.
- **3. cxd5 { [%clk 0:00:59] }**: White captures the pawn on d5, clock at 59 seconds.
- **5. Nc3 { [%clk 0:00:58] }**: White develops their knight to c3, clock at 58 seconds.

The moves continue with annotations showing how much time each player has left after their moves.


In [34]:
def zip_pgn_to_json(pgn_file_zipped, limit=10):
    games_list = []
    game_count = 0  # counter to keep track of how many games have been parsed
    
    # Open the compressed file and set up Zstandard decompression
    with open(pgn_file_zipped, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor()
        
        # Decompress the stream in memory
        with dctx.stream_reader(compressed_file) as reader:
            decompressed_stream = io.TextIOWrapper(reader, encoding='utf-8')
            
            # Read the PGN games from the decompressed stream
            while True:
                game = chess.pgn.read_game(decompressed_stream)
                if game is None:
                    break
                
                # Convert the game to dictionary, selecting specific headers
                game_info = {
                    "Event": game.headers.get("Event", ""),
                    "Date": game.headers.get("Date", ""),
                    "Result": game.headers.get("Result", ""),
                    "WhiteElo": game.headers.get("WhiteElo", ""),
                    "BlackElo": game.headers.get("BlackElo", ""),
                    "Moves": [move.uci() for move in game.mainline_moves()]
                }
                
                games_list.append(game_info)
                game_count += 1
                
                # Stop when the limit is reached
                if game_count >= limit:
                    break

    return games_list


In [35]:
games_json = zip_pgn_to_json(file_path, limit = 1000)


In [36]:
# Showing first account

print(json.dumps(games_json[0], indent=4))


{
    "Event": "Rated Bullet game",
    "Date": "2024.08.01",
    "Result": "1-0",
    "WhiteElo": "2148",
    "BlackElo": "2155",
    "Moves": [
        "e2e4",
        "c7c6",
        "c2c4",
        "d7d5",
        "c4d5",
        "c6d5",
        "e4d5",
        "d8d5",
        "b1c3",
        "d5d8",
        "f1c4",
        "g8f6",
        "d1b3",
        "e7e6",
        "g1f3",
        "b8c6",
        "c4b5",
        "c8d7",
        "e1g1",
        "a8c8",
        "f1e1",
        "a7a6",
        "b5a4",
        "b7b5",
        "c3b5",
        "a6b5",
        "a4b5",
        "f8e7",
        "f3e5",
        "e8g8",
        "e5d7",
        "d8d7",
        "b3f3",
        "d7c7",
        "b5e2",
        "c6e5",
        "f3g3",
        "e5g6",
        "f2f4",
        "c7f4",
        "g3f4",
        "g6f4",
        "d2d3",
        "f4e2",
        "e1e2",
        "c8c7",
        "c1e3",
        "f8c8",
        "a1f1",
        "f6d5",
        "e3d4",
        "c7c1",
        "g1f2",
      

In [37]:
spark = SparkSession.builder \
    .appName("PGN Game Distribution") \
    .getOrCreate()



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/23 13:05:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [41]:
schema = StructType([
    StructField("Event", StringType(), True),
    StructField("Site", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Result", StringType(), True),
    StructField("WhiteElo", StringType(), True),  
    StructField("BlackElo", StringType(), True),  
    StructField("Moves", ArrayType(StringType()), True)  
])


In [39]:
df = spark.createDataFrame(games_json, schema=schema)


In [40]:
df.show(5, truncate=10) 

[Stage 0:>                                                          (0 + 1) / 1]

+----------+----+----------+------+--------+--------+----------+
|     Event|Site|      Date|Result|WhiteElo|BlackElo|     Moves|
+----------+----+----------+------+--------+--------+----------+
|Rated B...|null|2024.08.01|   1-0|    2148|    2155|[e2e4, ...|
|Rated B...|null|2024.08.01|   1-0|    1103|    1106|[e2e4, ...|
|Rated B...|null|2024.08.01|   0-1|     674|     629|[e2e4, ...|
|Rated B...|null|2024.08.01|   0-1|    2459|    2556|[e2e4, ...|
|Rated B...|null|2024.08.01|   0-1|    1527|    1500|[e2e4, ...|
+----------+----+----------+------+--------+--------+----------+
only showing top 5 rows



                                                                                

In [42]:
event_distribution = df.groupBy("Event").count()


In [43]:
event_distribution.show(truncate=False)


+---------------------------------------------------------------+-----+
|Event                                                          |count|
+---------------------------------------------------------------+-----+
|Classical swiss https://lichess.org/swiss/bRGB9CDW             |6    |
|Rated Blitz tournament https://lichess.org/tournament/x2P6n5ZA |25   |
|Rated Blitz tournament https://lichess.org/tournament/MIDvhT7D |20   |
|Rated Bullet tournament https://lichess.org/tournament/CReo1mLT|1    |
|Rated Classical game                                           |5    |
|Rated Blitz tournament https://lichess.org/tournament/9ItBGKQi |3    |
|Rated Blitz tournament https://lichess.org/tournament/3YFuR01B |34   |
|Rated Rapid tournament https://lichess.org/tournament/lPeiflQY |2    |
|Rated Bullet tournament https://lichess.org/tournament/FjIuM2tM|3    |
|Blitz swiss https://lichess.org/swiss/YqQwqdKB                 |4    |
|Rated Blitz game                                               

Some of the events are specified by tournaments, but I want to treat each related tournament as the same. Below is cleaning up the hyperlinks

In [44]:
from pyspark.sql.functions import regexp_replace

# Remove the hyperlinks using a regular expression
df_cleaned = df.withColumn("Event", regexp_replace("Event", r' http.*$', ''))

# Now group by the cleaned 'Event' column and count occurrences
event_distribution_cleaned = df_cleaned.groupBy("Event").count()

# Show the distribution of cleaned events
event_distribution_cleaned.show(truncate=False)

+-----------------------+-----+
|Event                  |count|
+-----------------------+-----+
|Rated Bullet tournament|22   |
|Rated Classical game   |5    |
|Rapid swiss            |3    |
|Rated Blitz tournament |93   |
|Rated Rapid tournament |6    |
|Rated Blitz game       |421  |
|Classical swiss        |6    |
|Rated UltraBullet game |3    |
|Rated Rapid game       |115  |
|Blitz swiss            |10   |
|Rated Bullet game      |316  |
+-----------------------+-----+



In [16]:
# Getting png file that only contains Rated Blitz Game


In [46]:
def zipped_pgn_to_json_filtered(pgn_file, event_filter="Rated Blitz game", limit=10):
    games_list = []
    game_count = 0  # Counter to keep track of how many games have been parsed
    
    # Open the compressed file and set up Zstandard decompression
    with open(pgn_file, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor()
        
        # Decompress the stream in memory
        with dctx.stream_reader(compressed_file) as reader:
            decompressed_stream = io.TextIOWrapper(reader, encoding='utf-8')
            
            # Read the PGN games from the decompressed stream
            while True:
                game = chess.pgn.read_game(decompressed_stream)
                if game is None:
                    break
                
                # Filter games by the 'Event' header
                if game.headers.get("Event") != event_filter:
                    continue  # Skip this game if it doesn't match the event
                
                # Convert the game to a dictionary, selecting specific headers
                game_info = {
                    "Event": game.headers.get("Event", ""),
                    "Site": game.headers.get("Site", ""),
                    "Date": game.headers.get("Date", ""),
                    "Result": game.headers.get("Result", ""),
                    "WhiteElo": game.headers.get("WhiteElo", ""),
                    "BlackElo": game.headers.get("BlackElo", ""),
                    "Moves": [move.uci() for move in game.mainline_moves()]  # UCI formatted moves
                }
                
                games_list.append(game_info)
                game_count += 1
                
                # Stop when the limit is reached
                if game_count >= limit:
                    break

    return games_list

In [47]:
filtered_games_json = zipped_pgn_to_json_filtered(file_path, event_filter="Rated Blitz game", limit=1100)
print(json.dumps(filtered_games_json[0], indent=4))


{
    "Event": "Rated Blitz game",
    "Site": "https://lichess.org/wKUV4KuU",
    "Date": "2024.08.01",
    "Result": "1-0",
    "WhiteElo": "2055",
    "BlackElo": "2069",
    "Moves": [
        "d2d4",
        "e7e6",
        "e2e4",
        "d7d5",
        "e4d5",
        "e6d5",
        "g1f3",
        "f8d6",
        "f1d3",
        "g8f6",
        "e1g1",
        "e8g8",
        "b2b3",
        "c8g4",
        "c2c4",
        "c7c6",
        "c4c5",
        "d6c7",
        "c1e3",
        "f8e8",
        "b1d2",
        "b8d7",
        "d1c2",
        "d7f8",
        "h2h3",
        "g4h5",
        "g2g4",
        "h5g6",
        "g4g5",
        "f6h5",
        "d3g6",
        "h7g6",
        "h3h4",
        "d8d7",
        "f1e1",
        "d7g4",
        "g1f1",
        "e8e3",
        "f2e3",
        "h5g3",
        "f1f2",
        "a8e8",
        "e1g1",
        "g4h3",
        "g1g2",
        "f8e6",
        "a1g1",
        "g3f5",
        "f3e5",
        "c7e5",
        "d4

In [54]:
df_blitz = spark.createDataFrame(filtered_games_json, schema=schema)
df_blitz.show(10, truncate=20) 


+----------------+--------------------+----------+------+--------+--------+--------------------+
|           Event|                Site|      Date|Result|WhiteElo|BlackElo|               Moves|
+----------------+--------------------+----------+------+--------+--------+--------------------+
|Rated Blitz game|https://lichess.o...|2024.08.01|   1-0|    2055|    2069|[d2d4, e7e6, e2e4...|
|Rated Blitz game|https://lichess.o...|2024.08.01|   0-1|     886|     947|[e2e4, e7e5, g1f3...|
|Rated Blitz game|https://lichess.o...|2024.08.01|   1-0|    1721|    1698|[e2e4, e7e5, f2f4...|
|Rated Blitz game|https://lichess.o...|2024.08.01|   0-1|    1920|    1922|[c2c4, e7e5, b1c3...|
|Rated Blitz game|https://lichess.o...|2024.08.01|   0-1|    1764|    1791|[e2e4, e7e5, g1f3...|
|Rated Blitz game|https://lichess.o...|2024.08.01|   1-0|    1664|    1652|[e2e4, e7e5, f1c4...|
|Rated Blitz game|https://lichess.o...|2024.08.01|   0-1|    2124|    2252|[e2e4, c7c5, b1c3...|
|Rated Blitz game|https://lich

In [20]:
df_blitz_count = df_blitz.count()
print(df_blitz_count)

1100


In [26]:
# Creating Average Elo Column

In [35]:
df_blitz = df_blitz.withColumn("WhiteElo", col("WhiteElo").cast("int")) \
                   .withColumn("BlackElo", col("BlackElo").cast("int")) \
                   .withColumn("AverageElo", (col("WhiteElo") + col("BlackElo")) / 2)

In [37]:
df_blitz.show(5, truncate=10)


+----------+----------+----------+------+--------+--------+----------+----------+
|     Event|      Site|      Date|Result|WhiteElo|BlackElo|     Moves|AverageElo|
+----------+----------+----------+------+--------+--------+----------+----------+
|Rated B...|https:/...|2024.08.01|   1-0|    2055|    2069|[d2d4, ...|    2062.0|
|Rated B...|https:/...|2024.08.01|   0-1|     886|     947|[e2e4, ...|     916.5|
|Rated B...|https:/...|2024.08.01|   1-0|    1721|    1698|[e2e4, ...|    1709.5|
|Rated B...|https:/...|2024.08.01|   0-1|    1920|    1922|[c2c4, ...|    1921.0|
|Rated B...|https:/...|2024.08.01|   0-1|    1764|    1791|[e2e4, ...|    1777.5|
+----------+----------+----------+------+--------+--------+----------+----------+
only showing top 5 rows



In [41]:
# Find Distribution of AverageElo

from pyspark.ml.feature import Bucketizer
splits = [0, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, float('Inf')]
bucketizer = Bucketizer(splits=splits, inputCol="AverageElo", outputCol="EloBin")
df_binned = bucketizer.transform(df_blitz)
df_binned.show(1)

+----------------+--------------------+----------+------+--------+--------+--------------------+----------+------+
|           Event|                Site|      Date|Result|WhiteElo|BlackElo|               Moves|AverageElo|EloBin|
+----------------+--------------------+----------+------+--------+--------+--------------------+----------+------+
|Rated Blitz game|https://lichess.o...|2024.08.01|   1-0|    2055|    2069|[d2d4, e7e6, e2e4...|    2062.0|  11.0|
+----------------+--------------------+----------+------+--------+--------+--------------------+----------+------+
only showing top 1 row



In [44]:
df_binned.groupBy("EloBin").agg(count("*").alias("count")).orderBy("EloBin").show(truncate=False)


+------+-----+
|EloBin|count|
+------+-----+
|0.0   |51   |
|1.0   |43   |
|2.0   |61   |
|3.0   |67   |
|4.0   |80   |
|5.0   |97   |
|6.0   |106  |
|7.0   |107  |
|8.0   |114  |
|9.0   |109  |
|10.0  |99   |
|11.0  |66   |
|12.0  |45   |
|13.0  |55   |
+------+-----+

