In [1]:
# Import Libraries #
import findspark
findspark.init()
from pyspark.sql.functions import split, col, array_contains, translate, round, size, when, udf, lit, mean, count, format_number
from pyspark.sql.types import TimestampType, MapType, IntegerType, StringType, ArrayType, FloatType, StructField, StructType
from pyspark.sql import SparkSession
from helper import *
import math

# Start (or reuse) the Spark session for this notebook, submitting to YARN.
spark = (
    SparkSession.builder
    .appName('test')
    .master("yarn")
    .getOrCreate()
)

# Explicit schema for the CSV files; every column is declared nullable.
# Column order: Event, White, Black, Result, WhiteElo, BlackElo, Opening,
# TimeControl, Termination, Moves, Eval, UTCTimestamp.
# Moves and Eval are read as plain strings at this stage.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("Event", StringType()),
        ("White", StringType()),
        ("Black", StringType()),
        ("Result", StringType()),
        ("WhiteElo", IntegerType()),
        ("BlackElo", IntegerType()),
        ("Opening", StringType()),
        ("TimeControl", StringType()),
        ("Termination", StringType()),
        ("Moves", StringType()),
        ("Eval", StringType()),
        ("UTCTimestamp", TimestampType()),
    ]
])

# Load every part file of the dataset from HDFS using the schema above.
df = spark.read.csv(
    "hdfs://namenode:9000/chess_2016_dataset/output/part*",
    schema=schema,
)

2023-03-30 12:51:32,534 INFO spark.SparkContext: Running Spark version 3.3.2
2023-03-30 12:51:33,251 INFO resource.ResourceUtils: No custom resources configured for spark.driver.
2023-03-30 12:51:33,252 INFO spark.SparkContext: Submitted application: test
2023-03-30 12:51:33,307 INFO resource.ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
2023-03-30 12:51:33,346 INFO resource.ResourceProfile: Limiting resource is cpus at 1 tasks per executor
2023-03-30 12:51:33,354 INFO resource.ResourceProfileManager: Added ResourceProfile id: 0
2023-03-30 12:51:33,458 INFO spark.SecurityManager: Changing view acls to: ubuntu
2023-03-30 12:51:33,459 INFO spark.SecurityManager: Changing modify acls to: ubuntu
2023-03-30 12:51:33,460 INFO spark.SecurityManage

In [2]:
# Echo the session object so the notebook renders it as this cell's output.
spark

##### Check Shape of Data

In [3]:
# Report the dataset size as (rows, columns); df.count() triggers a full Spark job.
row_count = df.count()
column_count = len(df.columns)
print("shape: ", (row_count, column_count))

2023-03-30 12:52:13,150 INFO datasources.FileSourceStrategy: Pushed Filters: 
2023-03-30 12:52:13,153 INFO datasources.FileSourceStrategy: Post-Scan Filters: 
2023-03-30 12:52:13,156 INFO datasources.FileSourceStrategy: Output Data Schema: struct<>
2023-03-30 12:52:14,295 INFO codegen.CodeGenerator: Code generated in 332.648369 ms
2023-03-30 12:52:14,387 INFO memory.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 485.1 KiB, free 365.8 MiB)
2023-03-30 12:52:14,541 INFO memory.MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 53.7 KiB, free 365.8 MiB)
2023-03-30 12:52:14,545 INFO storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on namenode:39703 (size: 53.7 KiB, free: 366.2 MiB)
2023-03-30 12:52:14,561 INFO spark.SparkContext: Created broadcast 0 from count at NativeMethodAccessorImpl.java:0
2023-03-30 12:52:14,615 INFO execution.FileSourceScanExec: Planning scan with bin packing, max size: 134217728 bytes, open cost is 

##### Convert columns to appropriate types

In [4]:
# Replace df with the type-converted frame. convert_types is not defined in this
# notebook (presumably it comes from `helper` via the star import — confirm).
# Per the dtypes shown below, Moves becomes array<string> and Eval array<float>.
# NOTE(review): rebinding `df` makes this cell non-idempotent; re-running it
# presumably applies the conversion to already-converted columns — verify.
df = convert_types(df)
# Show the resulting column types as the cell output.
df.dtypes

[('Event', 'string'),
 ('White', 'string'),
 ('Black', 'string'),
 ('Result', 'string'),
 ('WhiteElo', 'int'),
 ('BlackElo', 'int'),
 ('Opening', 'string'),
 ('TimeControl', 'string'),
 ('Termination', 'string'),
 ('Moves', 'array<string>'),
 ('Eval', 'array<float>'),
 ('UTCTimestamp', 'timestamp')]

In [5]:
# Threshold handed to both blunder finders (presumably the evaluation swing
# that counts as a blunder — confirm against the helper implementation).
eval_difference = 3.0

# Keep only games whose first Eval entry is non-null, then attach the
# per-side blunder counts computed from the Eval column.
eval_games = (
    df.where(col("Eval")[0].isNotNull())
    .withColumn("WhiteBlunders", find_white_blunders(col("Eval"), lit(eval_difference)))
    .withColumn("BlackBlunders", find_black_blunders(col("Eval"), lit(eval_difference)))
)

# Ten games with the most white blunders, ties broken by black blunders.
(
    eval_games
    .select(
        "TimeControl",
        "White", "WhiteElo", "WhiteBlunders",
        "Black", "BlackElo", "BlackBlunders",
        "Result", "Termination",
    )
    .orderBy(col("WhiteBlunders").desc(), col("BlackBlunders").desc())
    .limit(10)
    .toPandas()
    .head(10)
)

2023-03-30 12:52:33,872 INFO datasources.FileSourceStrategy: Pushed Filters: IsNotNull(Eval)
2023-03-30 12:52:33,873 INFO datasources.FileSourceStrategy: Post-Scan Filters: isnotnull(Eval#10),isnotnull(cast(split(translate(Eval#10, ', ), ,, -1) as array<float>)[0])
2023-03-30 12:52:33,875 INFO datasources.FileSourceStrategy: Output Data Schema: struct<White: string, Black: string, Result: string, WhiteElo: int, BlackElo: int ... 6 more fields>
2023-03-30 12:52:33,975 INFO codegen.CodeGenerator: Code generated in 44.71062 ms
2023-03-30 12:52:34,035 INFO codegen.CodeGenerator: Code generated in 40.750894 ms
2023-03-30 12:52:34,225 INFO codegen.CodeGenerator: Code generated in 140.514741 ms
2023-03-30 12:52:34,250 INFO memory.MemoryStore: Block broadcast_3 stored as values in memory (estimated size 485.1 KiB, free 365.3 MiB)
2023-03-30 12:52:34,282 INFO memory.MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 53.7 KiB, free 365.2 MiB)
2023-03-30 12:52:34,283 

Unnamed: 0,TimeControl,White,WhiteElo,WhiteBlunders,Black,BlackElo,BlackBlunders,Result,Termination
0,300+5,dynamo21,1649,19,viviero,1705,14,1/2-1/2,Normal
1,30+0,broskall,2290,16,BitChess,2401,17,1-0,Time forfeit
2,900+0,ItCouldBeWorse,1917,16,Misak-Hay,1793,10,0-1,Normal
3,30+0,Melee,2007,16,miniond,2171,9,0-1,Time forfeit
4,180+0,JoseFranc,1814,15,SSUM,1906,15,0-1,Time forfeit
5,30+0,daBALYAN8,1853,15,Beetlebug,1674,12,1-0,Time forfeit
6,300+0,TulatovOleg,1962,15,bart2008,1555,8,0-1,Normal
7,240+2,antifragile,1428,15,AlexanderSupertramp,1393,4,0-1,Time forfeit
8,30+0,xxFUDOxx,1431,15,Bkmzbvfhr,1802,3,0-1,Time forfeit
9,60+0,mrkile,1635,14,chessrok2000,1340,12,1-0,Time forfeit


##### Plot Most Blundered Game

In [10]:
# Plot using the evaluated-games frame; plot_eval_game is not defined in this
# notebook (presumably it comes from `helper` via the star import — confirm).
# NOTE(review): the recorded run of this cell logged a collect in helper.py and
# then ended with "Py4JError: functions does not exist in the JVM"; the cause
# is not visible from this notebook and needs checking in the helper.
plot_eval_game(eval_games)

07,810 INFO scheduler.TaskSetManager: Finished task 10.0 in stage 11.0 (TID 95) in 4204 ms on datanode1 (executor 1) (13/16)
2023-03-30 13:12:11,273 INFO scheduler.TaskSetManager: Starting task 15.0 in stage 11.0 (TID 98) (datanode3, executor 2, partition 15, RACK_LOCAL, 4939 bytes) taskResourceAssignments Map()
2023-03-30 13:12:11,276 INFO scheduler.TaskSetManager: Finished task 11.0 in stage 11.0 (TID 96) in 4402 ms on datanode3 (executor 2) (14/16)
2023-03-30 13:12:12,466 INFO scheduler.TaskSetManager: Finished task 14.0 in stage 11.0 (TID 97) in 4664 ms on datanode1 (executor 1) (15/16)
2023-03-30 13:12:15,117 INFO scheduler.TaskSetManager: Finished task 15.0 in stage 11.0 (TID 98) in 3845 ms on datanode3 (executor 2) (16/16)
2023-03-30 13:12:15,118 INFO cluster.YarnScheduler: Removed TaskSet 11.0, whose tasks have all completed, from pool 
2023-03-30 13:12:15,120 INFO scheduler.DAGScheduler: ResultStage 11 (collect at /home/ubuntu/chess_project/helper.py:45) finished in 39.874 s
2

Py4JError: functions does not exist in the JVM

### Group By
- Time control, e.g. 60, 120, 180, 600, etc.
- Elo brackets, e.g. [1200, 1400], [1500, 1700], [2000, 2200], etc.
### TODO (Erlend): continue the group-by analysis here

In [7]:
# Average blunders per time control, together with the number of games in each
# group. The averages are rounded with round() so they stay numeric:
# format_number() returns a string, and ordering by that string ranks values
# lexicographically (e.g. "9.5" above "10.2") instead of by magnitude.
# Column names are kept as "avg(...)" / "count(TimeControl)" so that cells
# referring to them keep working.
time_control_white_blunders_averages = (
    eval_games
    .groupBy("TimeControl")
    .agg(
        round(mean("WhiteBlunders"), 1).alias("avg(WhiteBlunders)"),
        count("TimeControl").alias("count(TimeControl)"),
    )
)
time_control_black_blunders_averages = (
    eval_games
    .groupBy("TimeControl")
    .agg(
        round(mean("BlackBlunders"), 1).alias("avg(BlackBlunders)"),
        count("TimeControl").alias("count(TimeControl)"),
    )
)

# Time controls with more than 10000 games, highest white-blunder average
# first. Ten rows are collected; head() displays the first five of them.
(
    time_control_white_blunders_averages
    .where(col("count(TimeControl)") > 10000)
    .orderBy(col("avg(WhiteBlunders)").desc())
    .limit(10)
    .toPandas()
    .head()
)

2023-03-30 12:53:20,031 INFO datasources.FileSourceStrategy: Pushed Filters: IsNotNull(Eval)
2023-03-30 12:53:20,032 INFO datasources.FileSourceStrategy: Post-Scan Filters: isnotnull(Eval#10),isnotnull(cast(split(translate(Eval#10, ', ), ,, -1) as array<float>)[0])
2023-03-30 12:53:20,033 INFO datasources.FileSourceStrategy: Output Data Schema: struct<TimeControl: string, Eval: string>
2023-03-30 12:53:20,365 INFO codegen.CodeGenerator: Code generated in 204.021701 ms
2023-03-30 12:53:20,452 INFO codegen.CodeGenerator: Code generated in 68.115475 ms
2023-03-30 12:53:20,462 INFO memory.MemoryStore: Block broadcast_5 stored as values in memory (estimated size 485.1 KiB, free 364.7 MiB)
2023-03-30 12:53:20,494 INFO memory.MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 53.7 KiB, free 364.6 MiB)
2023-03-30 12:53:20,497 INFO storage.BlockManagerInfo: Added broadcast_5_piece0 in memory on namenode:39703 (size: 53.7 KiB, free: 366.1 MiB)
2023-03-30 12:53:20,503

Unnamed: 0,TimeControl,avg(WhiteBlunders),count(TimeControl)


In [8]:
# Time controls with more than 100000 games, highest black-blunder average
# first. The sort key is cast to double defensively: if the average column holds
# format_number() text, ordering it directly would be lexicographic
# (e.g. "9.5" above "10.2"); on an already numeric column the cast is a no-op.
# Ten rows are collected; head() displays the first five of them.
(
    time_control_black_blunders_averages
    .where(col("count(TimeControl)") > 100000)
    .orderBy(col("avg(BlackBlunders)").cast("double").desc())
    .limit(10)
    .toPandas()
    .head()
)

2023-03-30 12:54:02,390 INFO datasources.FileSourceStrategy: Pushed Filters: IsNotNull(Eval)
2023-03-30 12:54:02,391 INFO datasources.FileSourceStrategy: Post-Scan Filters: isnotnull(Eval#10),isnotnull(cast(split(translate(Eval#10, ', ), ,, -1) as array<float>)[0])
2023-03-30 12:54:02,392 INFO datasources.FileSourceStrategy: Output Data Schema: struct<TimeControl: string, Eval: string>
2023-03-30 12:54:02,572 INFO storage.BlockManagerInfo: Removed broadcast_4_piece0 on namenode:39703 in memory (size: 20.1 KiB, free: 366.1 MiB)
2023-03-30 12:54:02,580 INFO memory.MemoryStore: Block broadcast_8 stored as values in memory (estimated size 485.1 KiB, free 364.1 MiB)
2023-03-30 12:54:02,595 INFO storage.BlockManagerInfo: Removed broadcast_4_piece0 on datanode1:35681 in memory (size: 20.1 KiB, free: 366.1 MiB)
2023-03-30 12:54:02,612 INFO storage.BlockManagerInfo: Removed broadcast_4_piece0 on datanode3:42711 in memory (size: 20.1 KiB, free: 366.1 MiB)
2023-03-30 12:54:02,622 INFO memory.Memo

Unnamed: 0,TimeControl,avg(BlackBlunders),count(TimeControl)


In [9]:
# Moves of the game with the most white blunders (ties broken by black blunders).
a = (
    eval_games
    .select("Moves")
    .orderBy(col("WhiteBlunders").desc(), col("BlackBlunders").desc())
    .limit(1)
    .take(1)[0][0]
)

# Print the move list with single and double quote characters removed from each move.
print([
    move.replace("'", "").replace('"', "").strip("'")
    for move in a
])

2023-03-30 12:54:40,999 INFO datasources.FileSourceStrategy: Pushed Filters: IsNotNull(Eval)
2023-03-30 12:54:41,001 INFO datasources.FileSourceStrategy: Post-Scan Filters: isnotnull(Eval#10),isnotnull(cast(split(translate(Eval#10, ', ), ,, -1) as array<float>)[0])
2023-03-30 12:54:41,001 INFO datasources.FileSourceStrategy: Output Data Schema: struct<Moves: string, Eval: string>
2023-03-30 12:54:41,069 INFO codegen.CodeGenerator: Code generated in 43.037777 ms
2023-03-30 12:54:41,126 INFO codegen.CodeGenerator: Code generated in 42.395952 ms
2023-03-30 12:54:41,226 INFO codegen.CodeGenerator: Code generated in 78.985482 ms
2023-03-30 12:54:41,233 INFO memory.MemoryStore: Block broadcast_11 stored as values in memory (estimated size 485.1 KiB, free 363.6 MiB)
2023-03-30 12:54:41,260 INFO memory.MemoryStore: Block broadcast_11_piece0 stored as bytes in memory (estimated size 53.7 KiB, free 363.5 MiB)
2023-03-30 12:54:41,262 INFO storage.BlockManagerInfo: Added broadcast_11_piece0 in mem