In [11]:
# Import Libraries #
import findspark
findspark.init()
from pyspark.sql.functions import split, col, array_contains, translate, round, size, when, udf, lit, mean, count, format_number, collect_list
from pyspark.sql.types import TimestampType, MapType, IntegerType, StringType, ArrayType, FloatType, StructField, StructType
from pyspark.sql import SparkSession
from helper import *

In [12]:
# Create (or reuse) the SparkSession for this notebook: application name 'test',
# YARN as the master, 9 executor instances with 1G of memory each.
spark = (
    SparkSession.builder
    .appName('test')
    .master("yarn")
    .config("spark.executor.instances", 9)
    .config("spark.executor.memory", "1G")
    .getOrCreate()
)

In [13]:
## MUTE OUTPUT FROM SPARK
# Reach the JVM-side log4j package through the session's py4j gateway and switch
# the "org" and "akka" loggers off, then set the driver log level option to OFF.
logger = spark._jvm.org.apache.log4j
for noisy_logger in ("org", "akka"):
    logger.LogManager.getLogger(noisy_logger).setLevel(logger.Level.OFF)
spark.conf.set("spark.driver.log.level", "OFF")

In [14]:
# Echo the active SparkSession so its summary is rendered as the cell output.
spark

In [15]:
# Column layout of the CSV part files, in file order:
# Event, White, Black, Result, WhiteElo, BlackElo, Opening, TimeControl,
# Termination, Moves, Eval, UTCTimestamp
# Every field is nullable. Moves and Eval are read as plain strings here.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Event", StringType()),
        ("White", StringType()),
        ("Black", StringType()),
        ("Result", StringType()),
        ("WhiteElo", IntegerType()),
        ("BlackElo", IntegerType()),
        ("Opening", StringType()),
        ("TimeControl", StringType()),
        ("Termination", StringType()),
        ("Moves", StringType()),
        ("Eval", StringType()),
        ("UTCTimestamp", TimestampType()),
    ]
])
df = spark.read.csv(
    "hdfs://namenode:9000/chess_2016_dataset/output/part*",
    schema=schema,
)

##### Check Shape of Data

In [16]:
# Row count requires a full Spark job; the column count is read from the schema.
n_rows = df.count()
n_cols = len(df.columns)
print("shape: ", (n_rows, n_cols))



shape:  (3113065, 12)


                                                                                

##### Convert columns to appropriate types

In [17]:
# convert_types is presumably provided by `from helper import *`. Judging by the
# dtypes displayed for this cell, Moves becomes array<string> and Eval becomes
# array<float> — TODO confirm against the helper module.
# NOTE(review): df is rebound to the converted frame, so re-running this cell
# passes the already converted frame back into convert_types.
df = convert_types(df)
df.dtypes

[('Event', 'string'),
 ('White', 'string'),
 ('Black', 'string'),
 ('Result', 'string'),
 ('WhiteElo', 'int'),
 ('BlackElo', 'int'),
 ('Opening', 'string'),
 ('TimeControl', 'string'),
 ('Termination', 'string'),
 ('Moves', 'array<string>'),
 ('Eval', 'array<float>'),
 ('UTCTimestamp', 'timestamp')]

In [18]:
# Threshold handed to the blunder helpers; presumably the evaluation swing that
# counts as a blunder — TODO confirm against find_white_blunders / find_black_blunders.
eval_difference = 3.0
blunder_threshold = lit(eval_difference)

# Keep only games whose first Eval entry is present, then attach one blunder
# column per side.
eval_games = (
    df.filter(col("Eval")[0].isNotNull())
    .withColumn("WhiteBlunders", find_white_blunders(col("Eval"), blunder_threshold))
    .withColumn("BlackBlunders", find_black_blunders(col("Eval"), blunder_threshold))
)

# Ten games ranked by White's blunders, then by Black's blunders.
summary_columns = [
    "TimeControl", "White", "WhiteElo", "WhiteBlunders",
    "Black", "BlackElo", "BlackBlunders", "Result", "Termination",
]
(
    eval_games
    .select(*summary_columns)
    .orderBy(col("WhiteBlunders").desc(), col("BlackBlunders").desc())
    .limit(10)
    .toPandas()
    .head(10)
)

                                                                                

Unnamed: 0,TimeControl,White,WhiteElo,WhiteBlunders,Black,BlackElo,BlackBlunders,Result,Termination
0,300+5,dynamo21,1649,19,viviero,1705,14,1/2-1/2,Normal
1,30+0,broskall,2290,16,BitChess,2401,17,1-0,Time forfeit
2,900+0,ItCouldBeWorse,1917,16,Misak-Hay,1793,10,0-1,Normal
3,30+0,Melee,2007,16,miniond,2171,9,0-1,Time forfeit
4,180+0,JoseFranc,1814,15,SSUM,1906,15,0-1,Time forfeit
5,30+0,daBALYAN8,1853,15,Beetlebug,1674,12,1-0,Time forfeit
6,300+0,TulatovOleg,1962,15,bart2008,1555,8,0-1,Normal
7,240+2,antifragile,1428,15,AlexanderSupertramp,1393,4,0-1,Time forfeit
8,30+0,xxFUDOxx,1431,15,Bkmzbvfhr,1802,3,0-1,Time forfeit
9,60+0,mrkile,1635,14,chessrok2000,1340,12,1-0,Time forfeit


##### Plot Most Blundered Game

In [19]:
# plot_eval_game is presumably provided by `from helper import *`; what exactly it
# draws cannot be seen from this notebook — TODO confirm against the helper module.
# NOTE(review): the recorded run of this cell was stopped with a KeyboardInterrupt
# while a Spark job was still in progress, so no plot is stored in the output.
plot_eval_game(eval_games)

ERROR:root:KeyboardInterrupt while sending command.===>           (16 + 4) / 20]
Traceback (most recent call last):
  File "/home/ubuntu/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ubuntu/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

KeyboardInterrupt: 

                                                                                

### Group By
- Time control ~ (60, 120, 180, 600) etc...
- Elo brackets ~ ([1200, 1400], [1500, 1700], [2000, 2200]) etc...

#### Time Control Grouping

In [None]:
# Average number of blunders per time control, computed separately for White and
# for Black, together with the number of games in each time control.
#
# The averages are rounded with round() rather than format_number():
# format_number() returns a *string* column, so a later
# orderBy(col("avg(...)").desc()) would sort lexicographically ("9.5" > "10.2").
# round() keeps the column numeric while still showing one decimal place.
# The column names "avg(...)" and "count(TimeControl)" are kept because the
# following cells refer to them.
time_control_white_blunders_averages = (
    eval_games
    .groupBy("TimeControl")
    .agg(mean("WhiteBlunders"), count("TimeControl"))
    .withColumn("avg(WhiteBlunders)", round("avg(WhiteBlunders)", 1))
)

time_control_black_blunders_averages = (
    eval_games
    .groupBy("TimeControl")
    .agg(mean("BlackBlunders"), count("TimeControl"))
    .withColumn("avg(BlackBlunders)", round("avg(BlackBlunders)", 1))
)


In [None]:
# Time controls with more than 1000 games, ranked by White's average blunders.
# At most ten rows are fetched from Spark; pandas' head() then displays the first five.
(
    time_control_white_blunders_averages
    .where(col("count(TimeControl)") > 1000)
    .orderBy(col("avg(WhiteBlunders)").desc())
    .limit(10)
    .toPandas()
    .head()
)

In [None]:
# Time controls with more than 1000 games, ranked by Black's average blunders.
# At most ten rows are fetched from Spark; pandas' head() then displays the first five.
(
    time_control_black_blunders_averages
    .where(col("count(TimeControl)") > 1000)
    .orderBy(col("avg(BlackBlunders)").desc())
    .limit(10)
    .toPandas()
    .head()
)

#### Elo Brackets Grouping

Start by creating a new Spark DataFrame column called "EloBracket", which we will later use to group and aggregate by. When grouping the players into Elo brackets we want ranges that make sense, so that no single bracket contains 80% of the player base while others contain only a small fraction. In other words, we want the players to be distributed as evenly as possible across the brackets.

In [None]:
# Start by getting all the White Elo values in the dataframe.
# collect_list gathers the whole column into one array that is returned to the driver.
elo_list = eval_games.select(collect_list("WhiteElo")).first()[0]

# A single set_theme call replaces the former set_theme("ticks") + set_style("darkgrid")
# pair, where the second call simply overrode the style chosen by the first.
sns.set_theme(style="darkgrid")

# sns.distplot is deprecated; histplot with stat="density" and kde=True is its
# replacement and keeps the density-normalised histogram with a KDE overlay.
ax = sns.histplot(elo_list, kde=True, stat="density", color="green", bins=20)
ax.set(
    title="Distribution of White Elo (games with evaluations)",
    xlabel="White Elo",
    ylabel="Density",
);

"EloBracket" column should be of type String and contain values in the format: "<1500", "1500-1750", "1751-2000", ">2000"

In [None]:
# Label every game with a rating bracket derived from White's Elo.
#
# The top bracket has its own explicit condition instead of a catch-all otherwise():
# with a catch-all, rows whose WhiteElo is null (the column is nullable) or not
# positive would be labelled ">2000" and distort that bracket's statistics.
# Rows that match no condition now get a null EloBracket.
white_elo = col("WhiteElo")
eval_games = eval_games.withColumn(
    "EloBracket",
    when((white_elo > 0) & (white_elo < 1500), lit("<1500"))
    .when((white_elo >= 1500) & (white_elo <= 1750), lit("1500-1750"))
    .when((white_elo > 1750) & (white_elo <= 2000), lit("1751-2000"))
    .when(white_elo > 2000, lit(">2000")),
)

In [None]:
# Average number of blunders per Elo bracket, computed separately for White and
# for Black, together with the number of games in each bracket.
#
# The averages are rounded with round() rather than format_number():
# format_number() returns a *string* column, so a later
# orderBy(col("avg(...)").desc()) would sort lexicographically ("9.5" > "10.2").
# round() keeps the column numeric while still showing one decimal place.
# The column names "avg(...)" and "count(EloBracket)" are unchanged.
#
# NOTE(review): EloBracket is derived from WhiteElo only, so Black's blunders are
# grouped by the bracket of their opponent — confirm this is intended.
elo_bracket_white_blunders_averages = (
    eval_games
    .groupBy("EloBracket")
    .agg(mean("WhiteBlunders"), count("EloBracket"))
    .withColumn("avg(WhiteBlunders)", round("avg(WhiteBlunders)", 1))
)

elo_bracket_black_blunders_averages = (
    eval_games
    .groupBy("EloBracket")
    .agg(mean("BlackBlunders"), count("EloBracket"))
    .withColumn("avg(BlackBlunders)", round("avg(BlackBlunders)", 1))
)

In [None]:
# Elo brackets ranked by White's average blunders; at most ten rows are fetched
# and pandas' head() displays the first five.
(
    elo_bracket_white_blunders_averages
    .orderBy(col("avg(WhiteBlunders)").desc())
    .limit(10)
    .toPandas()
    .head()
)

In [None]:
# Elo brackets ranked by Black's average blunders; at most ten rows are fetched
# and pandas' head() displays the first five.
(
    elo_bracket_black_blunders_averages
    .orderBy(col("avg(BlackBlunders)").desc())
    .limit(10)
    .toPandas()
    .head()
)

In [None]:
# Move list of the game ranked first by White's blunders, then by Black's blunders.
a = (
    eval_games
    .select("Moves")
    .orderBy(col("WhiteBlunders").desc(), col("BlackBlunders").desc())
    .limit(1)
    .take(1)[0][0]
)
# Print the moves with every single and double quote character removed.
print([move.replace("'", "").replace('"', "") for move in a])

## Heavy Computation

### Move List to FEN notation
Take the list of moves as input and build a FEN string.


In [None]:
# Fetch a single game and display its move list as a sample input.
rows = eval_games.select("Moves").take(1)
moves = rows[0]["Moves"]
moves

### First moves
This can be combined with Elo and result to see which first moves are common among strong players.

In [None]:
# Sample 100 games and keep, for each one, the result, both ratings and the first
# two entries of its move list.
rows = eval_games.select("Result", "WhiteElo", "BlackElo", "Moves").take(100)

first_moves = [
    [game.Result, game.WhiteElo, game.BlackElo, game.Moves[:2]]
    for game in rows
]

first_moves