In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F

In [2]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [3]:
def show(df, limit = 5):
    """Return the first ``limit`` rows of a Spark DataFrame as a pandas DataFrame.

    The row cap is applied with ``df.limit`` *before* ``toPandas`` so that only
    ``limit`` rows are collected, instead of converting the whole dataset and
    discarding most of it afterwards.

    Args:
        df: Object exposing ``limit(n)`` and ``toPandas()`` (a Spark DataFrame).
        limit: Maximum number of rows to return; defaults to 5. A negative
            value keeps the pandas ``head`` meaning ("all but the last
            ``-limit`` rows"), which requires collecting everything.

    Returns:
        A pandas DataFrame holding at most ``limit`` rows.
    """
    if limit < 0:
        # Spark's limit() has no "all but the last n" form; fall back to pandas.
        return df.toPandas().head(limit)
    return df.limit(limit).toPandas()

# Events

In [8]:
# Read the Glory events CSV; the first line of the file supplies the column names.
events_csv_path = r"C:\Development\ultimateNakMuay\data\raw\wiki_events_glory.csv"
events = spark.read.csv(events_csv_path, header=True)

In [10]:
events = (
    events
    # Parse dates written in "MMMM d, yyyy" form into a date column.
    .withColumn("date", F.to_date(F.col("date"), "MMMM d, yyyy"))
    # Strip commas from the attendance string, then cast it to an integer.
    .withColumn(
        "attendance",
        F.regexp_replace(F.col("attendance"), ",", "").cast(T.IntegerType()),
    )
    # Turn the em-dash placeholder into null; every other location is kept as is.
    .withColumn(
        "location",
        F.when(F.col("location") != "—", F.col("location")),
    )
)

In [13]:
# Break "location" into city / state / country.
# The string is trimmed and split on a comma together with any whitespace around
# it, so a value such as "Arnhem, Netherlands" yields "Netherlands" rather than
# " Netherlands" (the country clean-up below keeps spaces and would not remove it).
events = (
    events
    .withColumn("location", F.split(F.trim(F.col("location")), r"\s*,\s*"))
    # City is only filled for three-part locations; otherwise it stays null.
    .withColumn(
        "city",
        F.when(F.size(F.col("location")) == 3, F.element_at(F.col("location"), 1)),
    )
    # State is the middle part of a three-part location, otherwise the first part.
    # NOTE(review): a one-part location therefore ends up with the same value in
    # both "state" and "country" - confirm this is intended.
    .withColumn(
        "state",
        F.when(F.size(F.col("location")) == 3, F.element_at(F.col("location"), 2))
        .otherwise(F.element_at(F.col("location"), 1)),
    )
    # Country is always the last part, stripped of anything but letters, digits and spaces.
    .withColumn("country", F.element_at(F.col("location"), -1))
    .withColumn("country", F.regexp_replace(F.col("country"), "[^a-zA-Z0-9 ]", ""))
    .drop("location")
)

In [17]:
# Give the "#" column a usable name, then lower-case every column name.
events = events.withColumnRenamed("#", "event_num")
# A descriptive loop variable is used instead of `_`: assigning `_` at the top
# level of a notebook cell shadows IPython's "last output" variable.
for column_name in events.columns:
    events = events.withColumnRenamed(column_name, column_name.lower())

# Final column order of the cleaned events table.
cols = ["event_num", "event", "date", "venue", "city", "state", "country", "attendance"]

events = events.select(*cols)

In [18]:
# Preview the events that have a non-null attendance value.
show(events.where(F.col("attendance").isNotNull()))

Unnamed: 0,event_num,event,date,venue,city,state,country,attendance
0,82,Glory: Collision 2,2019-12-21,GelreDome,,Arnhem,Netherlands,31000
1,49,Glory 41: Holland,2017-05-20,Brabanthallen,,Den Bosch,Netherlands,5000
2,48,Glory 40: Copenhagen,2017-04-29,Forum Copenhagen,,Copenhagen,Denmark,2500
3,47,Glory 39: Brussels,2017-03-25,Vorst National,,Brussels,Belgium,1500
4,43,Glory 36: Oberhausen,2016-12-10,König Pilsener Arena,,Oberhausen,Germany,13000


# Results

In [4]:
# Read the Glory results CSV; the first line of the file supplies the column names.
results_csv_path = r"C:\Development\ultimateNakMuay\data\raw\wiki_results_glory.csv"
results = spark.read.csv(results_csv_path, header=True)

In [7]:
# Fill null "weight_class" values from "weight", then drop the "weight" column.
results = results.withColumn(
    "weight_class",
    F.coalesce(F.col("weight_class"), F.col("weight")),
).drop("weight")

In [9]:
def remove_poisoned_rows(df):
    """Drop rows whose ``time`` value contains neither ":" nor ".".

    A row is treated as "poisoned" when its ``time`` column is non-null and
    contains neither ":" nor "." (rows with a null ``time`` make the filter
    condition null and are therefore not flagged). The distinct values of the
    first seven columns of the poisoned rows form the match keys, and every
    row of ``df`` equal to one of those keys on all seven columns is removed
    with a left-anti join.

    Columns are compared directly with null-safe equality instead of through a
    SHA-256 of ``concat_ws("|", ...)``. ``concat_ws`` skips nulls and "|" may
    occur inside a value, so different rows such as ("a", None, "b") and
    ("a", "b", None) used to share one key and a clean row could be removed
    by mistake.

    Args:
        df: Spark DataFrame with a ``time`` column; its first seven columns
            are used as the match key.

    Returns:
        ``df`` without the poisoned rows, with the same columns in the same order.
    """
    from functools import reduce

    key_names = df.columns[:7]

    has_separator = F.col("time").contains(":") | F.col("time").contains(".")
    poisoned = (
        df.filter(~has_separator)
        .select(*key_names)
        .distinct()
        .alias("poisoned")
    )

    # Both sides derive from the same DataFrame, so they are aliased and the
    # columns are referenced by qualified name to keep the self-join unambiguous.
    source = df.alias("source")
    same_key = reduce(
        lambda combined, condition: combined & condition,
        [
            F.col(f"source.`{name}`").eqNullSafe(F.col(f"poisoned.`{name}`"))
            for name in key_names
        ],
    )

    return source.join(poisoned, same_key, how="left_anti")

In [11]:
# Remove the rows flagged by remove_poisoned_rows (a "time" value with neither ":" nor ".").
results = remove_poisoned_rows(results)

In [15]:
results = (
    results
    # Replace "." with ":" so there is a single separator to split on.
    .withColumn("time", F.regexp_replace(F.col("time"), "\\.", ":"))
    .withColumn("time_parts", F.split(F.col("time"), ":"))
    # First part * 60 + second part (the parts are treated as minutes and seconds).
    .withColumn(
        "time",
        F.element_at(F.col("time_parts"), 1) * 60 + F.element_at(F.col("time_parts"), 2),
    )
    .drop("time_parts")
)

In [17]:
# Give the numeric columns explicit types: "time" as double, "round" as integer.
results = (
    results
    .withColumn("time", F.col("time").cast(T.DoubleType()))
    .withColumn("round", F.col("round").cast(T.IntegerType()))
)

In [18]:
# Preview the first rows of the cleaned results (default limit of show()).
show(results)

Unnamed: 0,weight_class,winner,loser,method,round,time,notes,fight_card,event_name,link
0,Middleweight 85 kg,Jason Wilnis (c),Israel Adesanya,Decision (Unanimous),5.0,180.0,For the Glory Middleweight Championship,Main card,Glory 37: Los Angeles,https://en.wikipedia.org//wiki/2017_in_Glory
1,Welterweight 77 kg,Yoann Kongolo,Karim Benmansour,KO (Punches),3.0,177.0,Welterweight Contender Tournament Final,Main card,Glory 37: Los Angeles,https://en.wikipedia.org//wiki/2017_in_Glory
2,Heavyweight 120 kg,Guto Inocente,D'Angelo Marshall,Decision (Extra Round),4.0,180.0,,Main card,Glory 37: Los Angeles,https://en.wikipedia.org//wiki/2017_in_Glory
3,Welterweight 77 kg,Karim Benmansour,Alan Scheinson,Decision (Split),3.0,180.0,Welterweight Contender Tournament Semi-Finals,Main card,Glory 37: Los Angeles,https://en.wikipedia.org//wiki/2017_in_Glory
4,Welterweight 77 kg,Yoann Kongolo,Konstantin Khuzin,Decision (Unanimous),3.0,180.0,Welterweight Contender Tournament Semi-Finals,Main card,Glory 37: Los Angeles,https://en.wikipedia.org//wiki/2017_in_Glory
