In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import os
import re

In [2]:
# Reuse the active session if there is one; otherwise start a local session
# with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [3]:
def find_csv_files(folder_path):
    """Recursively collect the paths of all ``.csv`` files under a folder.

    Args:
        folder_path: Directory to walk (sub-directories included).

    Returns:
        A list of file paths, each built by joining the containing directory
        with the file name, in the order ``os.walk`` yields them. The
        extension check is case-sensitive.
    """
    return [
        os.path.join(directory, name)
        for directory, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.csv')
    ]

In [4]:
def show(df, limit=10):
    """Return the first ``limit`` rows of a Spark DataFrame as a pandas DataFrame.

    The row limit is applied in Spark *before* converting, so only ``limit``
    rows are collected to the driver instead of the whole DataFrame.

    Args:
        df: Spark DataFrame to preview.
        limit: Maximum number of rows to return (default 10).

    Returns:
        A pandas DataFrame with at most ``limit`` rows.
    """
    return df.limit(limit).toPandas()

In [5]:
def roman_to_int(roman):
    """Replace standalone Roman numerals I-X in a string with Arabic digits.

    Only whole-word numerals from ``I`` to ``X`` are rewritten (for example
    ``"Warriors IV"`` becomes ``"Warriors 4"``); numerals embedded in longer
    words and values above ``X`` are left untouched.

    Args:
        roman: Text that may contain Roman numerals, or ``None``.

    Returns:
        The text with each matched numeral replaced by its decimal value, or
        ``None`` when ``roman`` is ``None``.
    """
    # Spark hands null column values to Python UDFs as None; re.sub would
    # raise TypeError on that, so pass nulls straight through.
    if roman is None:
        return None

    roman_mapping = {
        'I': 1,
        'II': 2,
        'III': 3,
        'IV': 4,
        'V': 5,
        'VI': 6,
        'VII': 7,
        'VIII': 8,
        'IX': 9,
        'X': 10
    }
    # The \b anchors restrict matches to whole words; every string the pattern
    # can match is a key of roman_mapping.
    pattern = r'\b(I{1,3}|IV|V|VI{0,3}|IX|X)\b'

    def replace(match):
        return str(roman_mapping[match.group(0)])

    return re.sub(pattern, replace, roman)

In [6]:
# Root directory holding the processed CSV exports. Set the NAKMUAY_DATA_DIR
# environment variable to point the notebook at another location; the original
# hard-coded path stays as the default.
folder = os.environ.get(
    "NAKMUAY_DATA_DIR",
    r"C:\Development\ultimateNakMuay\data\processed",
)
# Every .csv file found anywhere below that directory.
files = find_csv_files(folder)

In [7]:
# Skip the first two discovered CSVs.
# NOTE(review): presumably these are files that do not belong to the
# organizations loaded below, and the choice depends on os.walk's listing
# order — confirm.
# Recompute from the directory scan instead of slicing `files` in place, so
# re-running this cell does not drop two more files each time.
files = find_csv_files(folder)[2:]

In [8]:
# Keys of the organizations whose CSV exports are loaded.
organizations = ["ufc", "onefc", "bellator", "glory"]

# Rebind the name to {org: {"event": DataFrame, "result": DataFrame}}.
# For each org and each kind, the first path in `files` containing both
# substrings is read; if none matches, the [0] lookup raises IndexError.
organizations = {
    org: {
        kind: spark.read.csv(
            [path for path in files if kind in path and org in path][0],
            header=True,
        )
        for kind in ("event", "result")
    }
    for org in organizations
}

In [9]:
def get_data(org):
    """Look up the loaded DataFrames for one organization.

    Args:
        org: Key into the module-level ``organizations`` dict (e.g. "onefc").

    Returns:
        A ``(event, result)`` tuple of the DataFrames stored for ``org``.

    Raises:
        KeyError: If ``org`` is not a key of ``organizations``.
    """
    frames = organizations[org]
    return frames["event"], frames["result"]

In [39]:
# Select the "onefc" event and result frames; the following cells transform
# and join these two names.
event, result = get_data("onefc")

In [40]:
# Derive the helper join keys on the event frame. Order matters: ejc1 is taken
# from the event name *before* the "ONE:" prefix is expanded; ejc2/ejc3/ejc4
# are computed from the expanded name.
event = (
    event
    # ejc1: text before the first ":" of the original event name.
    .withColumn("ejc1", F.element_at(F.split(F.col("event"), ":"), 1))
    # Expand the short "ONE:" prefix to "ONE Championship:".
    .withColumn("event", F.regexp_replace(F.col("event"), "ONE:", "ONE Championship:"))
    # ejc4: event name with standalone Roman numerals rewritten as digits.
    .withColumn("ejc4", F.udf(roman_to_int, T.StringType())(F.col("event")))
    # ejc2: lower-cased, trimmed text after the last ":".
    .withColumn("ejc2", F.lower(F.trim(F.element_at(F.split(F.col("event"), ":"), -1))))
    # ejc3: "ONE Hero Series <month name>" for Hero Series events, else null.
    .withColumn(
        "ejc3",
        F.when(
            F.col("event").contains("Hero Series"),
            F.concat(
                F.regexp_extract(F.col("event"), r"^(ONE Hero Series)", 1),
                F.lit(" "),
                F.date_format(F.col("date"), "MMMM"),
            ),
        ).otherwise(F.lit(None)),
    )
)

In [41]:
# Normalise the result-side event names and derive the rjc1 join key.
result = (
    result
    # Expand "Road to ONE:" to "Road to ONE Championship:".
    .withColumn(
        "event_name",
        F.regexp_replace(F.col("event_name"), "Road to ONE:", "Road to ONE Championship:"),
    )
    # rjc1: lower-cased, trimmed text after the last ":" of the event name.
    .withColumn(
        "rjc1",
        F.lower(F.trim(F.element_at(F.split(F.col("event_name"), ":"), -1))),
    )
)

In [42]:
# Hand-maintained (pattern, replacement) pairs for result event names.
# Each pair is applied with regexp_replace, so the first element is treated as
# a regular expression and every occurrence inside event_name is replaced.
manual_event_name_corrections = [
    ("2011 in ONE Championship", "ONE Fighting Championship 1: Champion vs. Champion"),
    ("Road to ONE Championship: Night of Warriors 17", "Road to ONE 8: Night of Warriors"),
]

# Unpack into named variables instead of `_`: `_` conventionally marks an
# unused value, and in IPython it is also the last-output shortcut.
for wrong_name, corrected_name in manual_event_name_corrections:
    result = result.withColumn(
        "event_name",
        F.regexp_replace(F.col("event_name"), wrong_name, corrected_name),
    )

In [43]:
# An event row pairs with a result row when ANY of these equalities holds:
# full event name, pre-colon prefix (ejc1), post-colon suffix (ejc2 vs rjc1),
# or the Roman-numeral-converted name (ejc4).
join_conditions = (
    (event["event"] == result["event_name"])
    | (event["ejc1"] == result["event_name"])
    | (event["ejc2"] == result["rjc1"])
    | (event["ejc4"] == result["event_name"])
)

# Debug frame: distinct event keys full-outer-joined to distinct result names,
# so unmatched rows from either side remain visible.
dbg = (
    event.select("event", "ejc1", "ejc2", "ejc3", "ejc4")
    .distinct()
    .join(
        result.select("event_name", "rjc1").distinct(),
        on=join_conditions,
        how="full_outer",
    )
)

In [44]:
# Hero Series events (ejc3 is set) that found no result row in dbg.
hsdf = dbg.filter(F.col("ejc3").isNotNull() & F.col("event_name").isNull())

# Remove those rows from dbg, together with rows that have no ejc3 but whose
# result name mentions "Hero Series".
# contains() evaluates to NULL when event_name is NULL, which made the negated
# predicate NULL and silently dropped rows where both ejc3 and event_name are
# NULL. Coalescing to False keeps those rows in dbg.
dbg = dbg.subtract(hsdf).filter(
    ~(
        F.col("ejc3").isNull()
        & F.coalesce(F.col("event_name").contains("Hero Series"), F.lit(False))
    )
)

# Keep only the event-side columns so the rows can be joined again.
hsdf = hsdf.select("event", "ejc1", "ejc2", "ejc3", "ejc4")

# Second attempt for the Hero Series rows: match on the derived ejc3 name.
hsdf = hsdf.join(result.select("event_name", "rjc1").distinct(), hsdf.ejc3 == result.event_name, "left")

# Put the re-joined Hero Series rows back into the debug frame.
dbg = dbg.unionByName(hsdf)

In [45]:
# Attach results to every event; a pair matches when any clause of
# join_conditions holds. Events without a match keep null result columns.
df = event.join(result, on=join_conditions, how="left")

# Rows with an ejc3 key that found no result on the first pass.
hsdf = df.filter(F.col("ejc3").isNotNull() & F.col("event_name").isNull())
# NOTE(review): subtract() is EXCEPT DISTINCT, so besides removing the hsdf
# rows it also collapses exact duplicate rows in df — confirm that is intended.
df = df.subtract(hsdf)
# Keep only the event-side columns so the rows can be joined again.
hsdf = hsdf.select(*event.columns)

# Second attempt for those rows: match the derived ejc3 name against event_name.
hsdf = hsdf.join(result, hsdf.ejc3 == result.event_name, "left")

# Put the re-joined rows back alongside the remaining rows.
df = df.unionByName(hsdf)

In [57]:
# Replace every en dash ("–") in event_num with the text "cancelled".
df = df.withColumn(
    "event_num",
    F.regexp_replace(df["event_num"], "–", "cancelled"),
)

In [58]:
# Preview the distinct events that still have no matching result row,
# ordered by event name.
show(
    df.filter(F.col("event_name").isNull())
    .select("event_num", "event", "date")
    .distinct()
    .orderBy("event")
)

Unnamed: 0,event_num,event,date
0,cancelled,ONE Championship: Battle for the Ages,2020-06-05
1,cancelled,ONE Championship: Dreams,2020-05-01
2,cancelled,ONE Championship: Dynasty of Champions (Shanghai),2015-09-17
3,cancelled,ONE Championship: Heart of Heroes,2020-06-26
4,cancelled,ONE Championship: Hope,2020-04-17
5,cancelled,ONE Championship: Infinity 1,2020-04-10
6,cancelled,ONE Championship: Infinity 2,2020-06-19
7,cancelled,ONE Championship: Inspiration,2020-05-08
8,cancelled,ONE Championship: Legendary Warriors,2020-06-19
9,cancelled,ONE Championship: Strength,2020-04-24
