In [1]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## Data Preprocessing

In [2]:
file_path = 'combined_saved_games.csv/part-00000-ba49024c-4aa1-44ec-baae-98be97d3b553-c000.csv'

spark = SparkSession.builder \
    .appName("benchmarkmodel") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.memory", "64g") \
    .config("spark.executor.memory", "64g") \
    .config("spark.sql.shuffle.partitions", "100") \
    .getOrCreate()

spark.conf.set("spark.sql.codegen.wholeStage", "false")

spark.conf.set("spark.sql.codegen.maxFields", "2000") 

# spark = SparkSession.builder.appName("benchmarkmodel").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = spark.read.csv(file_path, header = True, inferSchema = True)


spark = SparkSession.builder.appName("benchmarkmodel").getOrCreate()

df = spark.read.csv(file_path, header = True, inferSchema = True)



/opt/conda/lib/python3.7/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/03 19:51:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/03 19:51:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

In [3]:
df.show(2)

+----+-------+---------+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|Move|game_id|next_move|result| a1| b1| c1| d1| e1| f1| g1| h1| a2| b2| c2| d2| e2| f2| g2| h2| a3| b3| c3| d3| e3| f3| g3| h3| a4| b4| c4| d4| e4| f4| g4| h4| a5| b5| c5| d5| e5| f5| g5| h5| a6| b6| c6| d6| e6| f6| g6| h6| a7| b7| c7| d7| e7| f7| g7| h7| a8| b8| c8| d8| e8| f8| g8| h8|
+----+-------+---------+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|  83|  15380|     h7g8|     1|  0|  0|  0|  0|  0|  0|  0|  5| -5|  0|  0|  0|  0|  1|  0| 10|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0| 

In [4]:
df.count()

10018



# **Chess Piece Values**

| **Chess Piece**     | **Value** |
|----------------------|-----------|
| White Rook 1         | `5`       |
| White Rook 2         | `5`       |
| White Knight 1       | `3`       |
| White Knight 2       | `3`       |
| White Bishop 1       | `3`       |
| White Bishop 2       | `3`       |
| White Queen          | `9`       |
| White King           | `10`      |
| White Pawn 1â€“8       | `1`       |

**Note:** Black pieces have the same values as white pieces but are negative. 

**Note:** If the result is labeled 1, that means white won. If black won, result is labeled 0.


In [5]:
# Split data into training and testing sets
train, test = df.randomSplit([0.9, 0.1], seed=42)

# Prepare features and label
board_spots = [col for col in df.columns if col not in ['Move', 'game_id', 'next_move', 'result']]
vector_assembler = VectorAssembler(inputCols=board_spots, outputCol="features")

# Transform data
train = vector_assembler.transform(train)
test = vector_assembler.transform(test)

# # Ensure the 'result' column is numeric and rename it to 'label'
# train = train.withColumnRenamed("result", "label")
# test = test.withColumnRenamed("result", "label")

## Benchmark Model: Logistic Regression

In [6]:
logistic_regression = LogisticRegression(featuresCol="features", labelCol="result", maxIter=1000)
lr_model = logistic_regression.fit(train)
predictions = lr_model.transform(test)
predictions.select("features", "result", "prediction", "probability").show(5)

                                                                                

+--------------------+------+----------+--------------------+
|            features|result|prediction|         probability|
+--------------------+------+----------+--------------------+
|(64,[0,4,5,6,7,9,...|     0|       1.0|[0.40431716519803...|
|(64,[0,3,5,6,7,8,...|     1|       1.0|[0.42697934966022...|
|(64,[0,2,6,7,8,9,...|     0|       0.0|[0.82664399980251...|
|(64,[0,2,5,7,8,9,...|     1|       1.0|[0.47374769140373...|
|(64,[0,3,6,7,8,9,...|     0|       1.0|[0.38246962742278...|
+--------------------+------+----------+--------------------+
only showing top 5 rows



In [7]:
# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="result", rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7084113980627702


## Intermediate Model: Random Forest

In [8]:
# Set up the Random Forest model
rf = RandomForestClassifier(labelCol="result", featuresCol="features", numTrees=100, maxDepth=10, seed=42)

# Train the model
model = rf.fit(train)

# Make predictions
predictions = model.transform(test)
predictions.select("features", "result", "prediction", "probability").show(5)

                                                                                

+--------------------+------+----------+--------------------+
|            features|result|prediction|         probability|
+--------------------+------+----------+--------------------+
|(64,[0,4,5,6,7,9,...|     0|       1.0|[0.45203580374873...|
|(64,[0,3,5,6,7,8,...|     1|       1.0|[0.34934689371371...|
|(64,[0,2,6,7,8,9,...|     0|       1.0|[0.47374637222721...|
|(64,[0,2,5,7,8,9,...|     1|       1.0|[0.44429477749447...|
|(64,[0,3,6,7,8,9,...|     0|       1.0|[0.37105811187865...|
+--------------------+------+----------+--------------------+
only showing top 5 rows



In [9]:
# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="result", rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7643486034066835


## Intermediate Model: Multi Layer Perceptron

In [10]:
# Define the layers for the neural network
# Layers: [input layer size, hidden layer size(s), output layer size]
layers = [len(board_spots), 64, 32, 2]  # two hidden layers (64 and 32 nodes) and binary output

# Set up the Multilayer Perceptron Classifier
mlp = MultilayerPerceptronClassifier(labelCol="result", featuresCol="features", layers=layers, maxIter=100, blockSize=128, seed=42)

# Train the model
model = mlp.fit(train)

# Make predictions
predictions = model.transform(test)
predictions.select("features", "result", "prediction", "probability").show(5)

+--------------------+------+----------+--------------------+
|            features|result|prediction|         probability|
+--------------------+------+----------+--------------------+
|(64,[0,4,5,6,7,9,...|     0|       1.0|[0.00193470020062...|
|(64,[0,3,5,6,7,8,...|     1|       0.0|[0.99999987808343...|
|(64,[0,2,6,7,8,9,...|     0|       0.0|[0.99998542852464...|
|(64,[0,2,5,7,8,9,...|     1|       1.0|[9.63420204171194...|
|(64,[0,3,6,7,8,9,...|     0|       0.0|[0.99755168569054...|
+--------------------+------+----------+--------------------+
only showing top 5 rows



In [11]:
# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="result", rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8188347231916535


## Champion Model: Gradient Boosting

In [12]:
GBT = GBTClassifier(featuresCol="features", labelCol="result", maxIter=200)
GBT_model = GBT.fit(train)

# Predictions on Validation Set
predictions_GBT = GBT_model.transform(test)
predictions_GBT.select("features", "result", "prediction", "probability").show(5)

+--------------------+------+----------+--------------------+
|            features|result|prediction|         probability|
+--------------------+------+----------+--------------------+
|(64,[0,4,5,6,7,9,...|     0|       0.0|[0.62109765050878...|
|(64,[0,3,5,6,7,8,...|     1|       1.0|[0.17347849337837...|
|(64,[0,2,6,7,8,9,...|     0|       1.0|[0.28916173699203...|
|(64,[0,2,5,7,8,9,...|     1|       1.0|[0.45405109181348...|
|(64,[0,3,6,7,8,9,...|     0|       1.0|[0.25885933362354...|
+--------------------+------+----------+--------------------+
only showing top 5 rows



In [13]:
# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="result", rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions_GBT)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8857805860671296


In [14]:
## EXTRA ANALYSIS HERE