In [1]:
# Import Libraries #
import findspark
findspark.init()
from pyspark.sql.functions import split, col, array_contains, translate, round, size, when, udf, lit, mean, count, format_number
from pyspark.sql.types import TimestampType, MapType, IntegerType, StringType, ArrayType, FloatType, StructField, StructType
from pyspark.sql import SparkSession
from helper import *

# Create (or reuse) the Spark session for this notebook, running on YARN.
spark = (
    SparkSession.builder
    .appName('test')
    .master("yarn")
    .getOrCreate()
)

# Switch off Spark's own logging from this point on.
spark.sparkContext.setLogLevel("OFF")

# Column layout of the CSV part files, in file order:
# Event, White, Black, Result, WhiteElo, BlackElo, Opening, TimeControl,
# Termination, Moves, Eval, UTCTimestamp
column_types = [
    ("Event", StringType()),
    ("White", StringType()),
    ("Black", StringType()),
    ("Result", StringType()),
    ("WhiteElo", IntegerType()),
    ("BlackElo", IntegerType()),
    ("Opening", StringType()),
    ("TimeControl", StringType()),
    ("Termination", StringType()),
    ("Moves", StringType()),
    ("Eval", StringType()),
    ("UTCTimestamp", TimestampType()),
]

# Every field is declared nullable.
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in column_types]
)

# Read all part files of the dataset using the explicit schema above.
df = spark.read.csv("hdfs://namenode:9000/chess_2016_dataset/output/part*", schema=schema)

2023-03-31 11:01:59,957 INFO spark.SparkContext: Running Spark version 3.3.2
2023-03-31 11:02:00,359 INFO resource.ResourceUtils: No custom resources configured for spark.driver.
2023-03-31 11:02:00,360 INFO spark.SparkContext: Submitted application: test
2023-03-31 11:02:00,394 INFO resource.ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
2023-03-31 11:02:00,418 INFO resource.ResourceProfile: Limiting resource is cpus at 1 tasks per executor
2023-03-31 11:02:00,421 INFO resource.ResourceProfileManager: Added ResourceProfile id: 0
2023-03-31 11:02:00,517 INFO spark.SecurityManager: Changing view acls to: ubuntu
2023-03-31 11:02:00,518 INFO spark.SecurityManager: Changing modify acls to: ubuntu
2023-03-31 11:02:00,519 INFO spark.SecurityManage

In [None]:
# Bare last expression: renders the SparkSession created in the setup cell as this cell's output.
spark

##### Check Shape of Data

In [None]:
# Report the dataset size as (rows, columns); df.count() scans the data.
n_rows = df.count()
n_cols = len(df.columns)
print("shape: ", (n_rows, n_cols))

##### Convert columns to appropriate types

In [None]:
# Re-bind df to the output of convert_types (not defined in this notebook;
# presumably provided by `from helper import *` — which columns it casts is not visible here).
# NOTE(review): this overwrites df, so re-running the cell feeds the already
# converted frame back into convert_types — confirm that is safe.
df = convert_types(df)
# Show the (column name, dtype) pairs after conversion.
df.dtypes

In [None]:
# Threshold passed (as a literal column) to both blunder finders.
# NOTE(review): presumably the evaluation swing that counts as a blunder — confirm in helper.
eval_difference = 3.0

# Keep only games whose first Eval entry is present, then attach one blunder
# column per side using the helper functions.
eval_games = (
    df.where(col("Eval")[0].isNotNull())
    .withColumn("WhiteBlunders", find_white_blunders(col("Eval"), lit(eval_difference)))
    .withColumn("BlackBlunders", find_black_blunders(col("Eval"), lit(eval_difference)))
)

# Columns shown in the preview table, in display order.
preview_columns = [
    "TimeControl",
    "White", "WhiteElo", "WhiteBlunders",
    "Black", "BlackElo", "BlackBlunders",
    "Result", "Termination",
]

# Top 10 games ordered by WhiteBlunders, then BlackBlunders, both descending.
(
    eval_games.select(*preview_columns)
    .orderBy(col("WhiteBlunders").desc(), col("BlackBlunders").desc())
    .limit(10)
    .toPandas()
    .head(10)
)

##### Plot Most Blundered Game

In [None]:
# Pass the blunder-annotated games to plot_eval_game (not defined in this notebook;
# presumably from helper). NOTE(review): which game gets plotted is decided inside
# that function — confirm it matches the "most blundered game" heading.
plot_eval_game(eval_games)

### Group By
- TimeControl ~ (60, 120, 180, 600), etc.
- Elo brackets ~ ([1200, 1400], [1500, 1700], [2000, 2200]), etc.
### TODO (Erlend): continue the group-by work here

In [None]:
def _mean_blunders_by_time_control(games, blunder_column):
    """Average ``blunder_column`` per TimeControl, together with the group size.

    Args:
        games: Spark DataFrame containing "TimeControl" and ``blunder_column``.
        blunder_column: name of the per-game blunder count column to average.

    Returns:
        Spark DataFrame with columns "TimeControl", "avg(<blunder_column>)"
        (rounded to one decimal, still numeric) and "count(TimeControl)".
    """
    # `round` is pyspark.sql.functions.round (imported in the first cell), not the builtin.
    # Unlike format_number, it keeps the column numeric, so a later
    # orderBy(...) ranks by value instead of comparing strings ("9.5" > "10.2").
    return games.groupBy("TimeControl").agg(
        round(mean(blunder_column), 1).alias(f"avg({blunder_column})"),
        count("TimeControl"),
    )


time_control_white_blunders_averages = _mean_blunders_by_time_control(eval_games, "WhiteBlunders")
time_control_black_blunders_averages = _mean_blunders_by_time_control(eval_games, "BlackBlunders")

# Time controls with more than 10000 games, ranked by average white blunders.
# NOTE(review): the black-blunder cell filters on > 100000 games; confirm which
# threshold is intended.
# limit(10) caps the Spark result; pandas .head() then displays the first 5 rows.
(
    time_control_white_blunders_averages
    .where(col("count(TimeControl)") > 10000)
    .orderBy(col("avg(WhiteBlunders)").desc())
    .limit(10)
    .toPandas()
    .head()
)

In [None]:
# Rank time controls by the avg(BlackBlunders) column (descending), keeping only
# groups with more than 100000 games.
by_black_average_desc = col("avg(BlackBlunders)").desc()
has_enough_games = col("count(TimeControl)") > 100000

# limit(10) caps the Spark result; pandas .head() then displays the first 5 rows.
(
    time_control_black_blunders_averages
    .orderBy(by_black_average_desc)
    .where(has_enough_games)
    .limit(10)
    .toPandas()
    .head()
)

In [None]:
# Fetch the Moves value of the game ranked first by WhiteBlunders, then
# BlackBlunders (both descending).
top_game = (
    eval_games
    .orderBy(col("WhiteBlunders").desc(), col("BlackBlunders").desc())
    .select("Moves")
    .first()  # returns None on an empty frame instead of raising IndexError via [0][0]
)
most_blundered_moves = top_game[0] if top_game is not None else []
a = most_blundered_moves  # original variable name, kept in case a later cell still reads it

# Strip single and double quote characters from every move before printing.
# NOTE(review): assumes Moves is a sequence of strings at this point (after convert_types) — confirm.
print([move.replace("'", "").replace('"', "") for move in most_blundered_moves])