In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F

In [3]:
def show(df, limit = 5):
    """Return the first ``limit`` rows of a Spark DataFrame as a pandas DataFrame.

    The row limit is applied in Spark *before* converting to pandas, so only
    ``limit`` rows are collected to the driver instead of the whole dataset.

    Args:
        df: Object exposing ``limit(n)`` and ``toPandas()`` (a Spark DataFrame).
        limit: Number of rows to return. ``None`` or a negative value keeps
            the pandas ``head`` semantics (``None`` -> every row, ``-n`` ->
            all but the last ``n`` rows); that path collects the full frame.

    Returns:
        A pandas DataFrame holding at most ``limit`` rows.
    """
    if limit is None or limit < 0:
        # Spark's limit() has no equivalent for these cases; fall back to the
        # pandas behaviour so existing callers keep getting the same result.
        return df.toPandas().head(limit)
    return df.limit(limit).toPandas()

In [4]:
# Reuse the active session if there is one, otherwise start a local session
# that uses all available cores ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [5]:
# Load the processed fight results; the first line of the file is the header.
# No schema is supplied, so the columns are read without type inference and
# cast explicitly in later cells.
# NOTE(review): absolute, machine-specific path - this cell only runs where
# this exact directory exists.
results = (
    spark.read
    .option("header", True)
    .csv(r"C:\Development\ultimateNakMuay\data\processed\wiki_fc_ufc.csv")
)

In [6]:
def remove_poisoned_rows(df):
    """Drop every row that shares its leading columns with a malformed-time row.

    A row is treated as malformed when its ``time`` value contains neither
    ":" nor ".". The first seven columns of each row are fingerprinted with
    SHA-256; every row of ``df`` whose fingerprint matches that of a malformed
    row is removed with a left-anti join.

    Rows whose ``time`` is NULL are never flagged: the predicate evaluates to
    NULL for them, and ``filter`` only keeps rows where it is true.

    A temporary column named ``poison`` is used internally; an existing column
    with that name would be overwritten and then dropped.

    Args:
        df: Spark DataFrame containing a column named ``time``.

    Returns:
        ``df`` without the rows matching a malformed row's fingerprint.
    """
    key_columns = df.columns[:7]

    # Hash a JSON encoding of the key columns rather than a "|"-joined string:
    # concat_ws silently skips NULLs and does not escape the delimiter, so
    # distinct rows such as ("a", NULL, "b") / ("a", "b", NULL) or
    # ("a|b", "c") / ("a", "b|c") would share a hash and could be removed
    # by mistake. JSON keeps the field names and escapes the values.
    fingerprint = F.sha2(
        F.to_json(F.struct(*[F.col(name) for name in key_columns])),
        256,
    )

    has_separator = F.col("time").contains(":") | F.col("time").contains(".")

    # Compute the fingerprint once and reuse it on both sides of the join.
    keyed = df.withColumn("poison", fingerprint)
    bad_keys = keyed.filter(~has_separator).select("poison").distinct()

    return keyed.join(bad_keys, ["poison"], how="left_anti").drop("poison")

In [7]:
# Remove the rows flagged by remove_poisoned_rows (rows matching an entry whose
# "time" contains neither ":" nor ".") and rebind `results` to the filtered frame.
results = remove_poisoned_rows(results)

In [8]:
# Turn the "time" text into a single number: "." is normalised to ":", the
# value is split on ":", and the result is first_part * 60 + second_part
# (i.e. seconds, assuming the text is minutes:seconds - TODO confirm).
# NOTE(review): this cell rewrites "time" in place, so running it a second
# time would re-split the already converted values; run it once per load.
results = (
    results
    .withColumn("time", F.regexp_replace(F.col("time"), "\\.", ":"))
    .withColumn("time_parts", F.split(F.col("time"), ":"))
    .withColumn(
        "time",
        F.element_at(F.col("time_parts"), 1) * 60
        + F.element_at(F.col("time_parts"), 2),
    )
    .drop("time_parts")  # helper column is only needed for the computation
)

In [9]:
# Give the numeric columns explicit types: "time" as double, "round" as integer.
results = (
    results
    .withColumn("time", F.col("time").cast("double"))
    .withColumn("round", F.col("round").cast("int"))
)

In [10]:
# Discard the "notes" column from the cleaned results.
results = results.drop("notes")

In [11]:
# Preview the cleaned frame as a pandas DataFrame (show() returns 5 rows by default).
show(results)

Unnamed: 0,weight_class,winner,loser,method,round,time,fight_card,event_name,event_id
0,Heavyweight,Sergei Pavlovich,Curtis Blaydes,TKO (punches),1.0,188.0,Main card,UFC Fight Night: Pavlovich vs. Blaydes,58cb1f65abe0d87d636582d45edbb984fe4c92044d67d3...
1,Middleweight,Bruno Silva,Brad Tavares,TKO (knee and punch),1.0,215.0,Main card,UFC Fight Night: Pavlovich vs. Blaydes,58cb1f65abe0d87d636582d45edbb984fe4c92044d67d3...
2,Lightweight,Bobby Green,Jared Gordon,No Contest (accidental clash of heads),1.0,275.0,Main card,UFC Fight Night: Pavlovich vs. Blaydes,58cb1f65abe0d87d636582d45edbb984fe4c92044d67d3...
3,Women's Flyweight,Iasmin Lucindo,Brogan Walker-Sanchez,"Decision (unanimous) (30–27, 30–27, 30–27)",3.0,300.0,Main card,UFC Fight Night: Pavlovich vs. Blaydes,58cb1f65abe0d87d636582d45edbb984fe4c92044d67d3...
4,Welterweight,Jeremiah Wells,Matthew Semelsberger,"Decision (split) (30–27, 28–29, 30–27)",3.0,300.0,Main card,UFC Fight Night: Pavlovich vs. Blaydes,58cb1f65abe0d87d636582d45edbb984fe4c92044d67d3...
