In [190]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [2]:
# Start (or reuse) a Spark session with the master set to "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [31]:
def show(df, limit=5):
    """Return the first ``limit`` rows of a Spark DataFrame as a pandas object.

    The row limit is applied on the Spark side *before* the conversion, so only
    ``limit`` rows are collected to the driver instead of materialising the
    whole dataset in pandas and slicing it afterwards.

    Args:
        df: Spark DataFrame (anything exposing ``limit`` and ``toPandas``).
        limit: Number of rows to return. A negative value keeps the pandas
            ``head`` meaning ("all but the last ``-limit`` rows"), which
            requires collecting the full frame.

    Returns:
        The result of ``toPandas()`` on the truncated frame.
    """
    if limit < 0:
        # Spark has no equivalent of a negative head(); fall back to pandas.
        return df.toPandas().head(limit)
    return df.limit(limit).toPandas()

# Events

In [None]:
# Load the raw events CSV; the first line of the file supplies the column names.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on a machine that has the file at this exact location.
events = spark.read.option("header", True).csv(
    r"C:\Development\ultimateNakMuay\data\raw\wiki_events_bellator.csv"
)

In [56]:
# Lower-case every column name in one pass.
events = events.toDF(*(name.lower() for name in events.columns))

In [57]:
# Schema cleaning: parse "date" with the pattern "MMMM d, yyyy" and cast
# "attendance" to an integer (values that cannot be cast become null).
events = (
    events
    .withColumn("date", F.to_date(F.col("date"), "MMMM d, yyyy"))
    .withColumn("attendance", F.col("attendance").cast(T.IntegerType()))
)

# Split location into city, state, country.
# The input is trimmed and the split pattern swallows whitespace around each
# comma, so a value written as "City, State, Country" yields "State" rather
# than " State". Splitting on a bare "," would keep that leading space.
#
# Mapping by number of parts:
#   exactly 3 parts -> city = part 1, state = part 2
#   anything else   -> city is null, state = part 1
#   country is always the last part, stripped of every character that is not
#   an ASCII letter, digit or space (e.g. "U.S." becomes "US").
# NOTE(review): a two-part value such as "London, England" therefore ends up
# with state="London" and no city -- confirm that this is intended.
events = (
    events
    .withColumn("location", F.split(F.trim(F.col("location")), r"\s*,\s*"))
    .withColumn(
        "city",
        F.when(F.size(F.col("location")) == 3, F.element_at(F.col("location"), 1)),
    )
    .withColumn(
        "state",
        F.when(F.size(F.col("location")) == 3, F.element_at(F.col("location"), 2))
        .otherwise(F.element_at(F.col("location"), 1)),
    )
    .withColumn(
        "country",
        # Trim again in case removing punctuation leaves padding behind.
        F.trim(
            F.regexp_replace(F.element_at(F.col("location"), -1), "[^a-zA-Z0-9 ]", "")
        ),
    )
    .drop("location")
)

In [58]:
# Preview the first rows whose attendance is not null.
show(events.where(F.col("attendance").isNotNull()))

Unnamed: 0,#,event,date,venue,attendance,city,state,country
0,288,Bellator 284,2022-08-12,Sanford Pentagon,2900,Sioux Falls,South Dakota,US
1,287,Bellator 283,2022-07-22,Emerald Queen Casino and Hotel,1012,Tacoma,Washington,US
2,286,Bellator 282,2022-06-24,Mohegan Sun Arena,10000,Uncasville,Connecticut,US
3,285,Bellator 281,2022-05-13,SSE Arena,8779,,London,England
4,284,Bellator 280,2022-05-06,AccorHotels Arena,13131,,Paris,France


# Results

In [167]:
# Load the raw results CSV; the first line of the file supplies the column names.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on a machine that has the file at this exact location.
results = spark.read.option("header", True).csv(
    r"C:\Development\ultimateNakMuay\data\raw\wiki_results_bellator.csv"
)

In [168]:
def hash_rows(col_list):
    """Build a SHA-256 fingerprint column over the given columns.

    ``concat_ws`` skips null inputs, so the rows ("a", null, "b") and
    ("a", "b", null) would be concatenated to the same string and receive the
    same hash. To make the position of a null part of the fingerprint, every
    value is cast to string and nulls are replaced by a placeholder before
    the values are joined with "|".

    Args:
        col_list: Iterable of column names (``str``) or ``Column`` objects.

    Returns:
        A ``Column`` holding the hex SHA-256 digest of the joined values.
    """
    null_marker = F.lit("__NULL__")
    normalised = [
        F.coalesce(
            (F.col(item) if isinstance(item, str) else item).cast("string"),
            null_marker,
        )
        for item in col_list
    ]
    return F.sha2(F.concat_ws("|", *normalised), 256)

In [169]:
# Remove poisoned rows (step 1: find them).
# A row is selected as poisoned when its "time" value contains neither ":"
# nor ".". Rows with a null "time" are not selected, because the predicate
# evaluates to null for them.

# Columns used to fingerprint a row: the first seven columns of `results`.
test_cols = [F.col(name) for name in results.columns[:7]]

# Distinct fingerprint-column combinations of the poisoned rows, each tagged
# with its hash in a "poison" column.
incorrect_data = (
    results
    .where(~F.col("time").contains(":") & ~F.col("time").contains("."))
    .select(*test_cols)
    .distinct()
    .withColumn("poison", hash_rows(test_cols))
)

In [170]:
# Remove poisoned rows (step 2): hash the same columns on every row and keep
# only rows whose hash has no match in incorrect_data (left anti join).
results = (
    results
    .withColumn("poison", hash_rows(test_cols))
    .join(incorrect_data, on="poison", how="left_anti")
    .drop("poison")
)

In [171]:
# Turn "time" into a single number: replace "." with ":", split on ":", then
# compute first part * 60 + second part (seconds, if the raw value is
# minutes:seconds -- presumably it is; verify against the raw data).
# NOTE(review): this overwrites "time" in place, so running the cell a second
# time would re-split the already converted value.
results = (
    results
    .withColumn("time", F.regexp_replace("time", "\\.", ":"))
    .withColumn("time_parts", F.split("time", ":"))
    .withColumn(
        "time",
        F.element_at("time_parts", 1) * 60 + F.element_at("time_parts", 2),
    )
    .drop("time_parts")
)

In [172]:
# Give the numeric columns explicit types: "time" as double, "round" as int.
results = (
    results
    .withColumn("time", F.col("time").cast("double"))
    .withColumn("round", F.col("round").cast("int"))
)

In [173]:
# Discard the "notes" column from the results frame.
results = results.drop("notes")

In [192]:
# Type check: `DataFrame` here is pyspark.sql.DataFrame, while toPandas()
# returns a pandas object, so this evaluates to False. One row is enough to
# determine the type, so the full frame is not collected to the driver.
isinstance(results.limit(1).toPandas(), DataFrame)

False