In [17]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import os

In [18]:
# Reuse the active Spark session if there is one; otherwise start a session
# whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [19]:
def find_csv_files(folder_path):
    """Recursively list every file under ``folder_path`` whose name ends in '.csv'.

    Args:
        folder_path: Directory to walk (sub-directories are included).

    Returns:
        A list of paths, each built by joining the containing directory with the
        file name, in the order ``os.walk`` yields them. The suffix check is
        case-sensitive. The list is empty when nothing matches.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.csv')
    ]

In [20]:
def show(df, limit=10):
    """Return the first ``limit`` rows of a Spark DataFrame as a pandas DataFrame.

    The row limit is applied in Spark *before* converting, so only ``limit``
    rows are collected to the driver instead of the whole DataFrame.

    Args:
        df: Object exposing ``limit(n)`` and ``toPandas()`` (a Spark DataFrame).
        limit: Maximum number of rows to return. Defaults to 10.

    Returns:
        A pandas DataFrame with at most ``limit`` rows.
    """
    # NOTE(review): for an unordered DataFrame Spark does not guarantee which
    # rows come first; add an explicit orderBy upstream if that matters.
    return df.limit(limit).toPandas()

In [21]:
# Location of the processed CSV exports. The original Windows path remains the
# default; set ULTIMATE_NAKMUAY_DATA_DIR to run the notebook on another machine
# without editing this cell.
folder = os.environ.get(
    "ULTIMATE_NAKMUAY_DATA_DIR",
    r"C:\Development\ultimateNakMuay\data\processed",
)
files = find_csv_files(folder)

# Fail here with a clear message rather than later with a bare IndexError when
# the per-organization files are looked up.
if not files:
    raise FileNotFoundError(f"No .csv files found under {folder!r}")

In [22]:
# Skip the first two paths returned by the directory walk.
# NOTE(review): the reason for dropping exactly two entries is not visible here,
# and os.walk ordering is filesystem-dependent - confirm which files are meant.
# Recomputed from `folder` (instead of slicing `files` in place) so that
# re-running this cell does not keep trimming two more paths each time.
files = find_csv_files(folder)[2:]

In [33]:
def _read_first_match(kind, org):
    """Read, with a header row, the first path in ``files`` containing both ``kind`` and ``org``.

    Matching is a plain substring test against the full path. An IndexError is
    raised when no path matches.
    """
    matches = [path for path in files if kind in path and org in path]
    return spark.read.csv(matches[0], header=True)


# Map each organization key to its "event" and "result" Spark DataFrames.
organizations = {
    org: {
        "event": _read_first_match("event", org),
        "result": _read_first_match("result", org),
    }
    for org in ["ufc", "onefc", "bellator", "glory"]
}

In [34]:
def get_data(org):
    """Return the ``(event, result)`` pair stored under ``org`` in ``organizations``.

    Args:
        org: Key into the module-level ``organizations`` mapping.

    Returns:
        A 2-tuple: the entry's "event" value followed by its "result" value.

    Raises:
        KeyError: If ``org`` is not a key of ``organizations``.
    """
    frames = organizations[org]
    return frames["event"], frames["result"]

In [35]:
# Work on the Bellator tables: `event` holds the event frame, `result` the
# result frame.
event, result = get_data(org="bellator")

In [41]:
# Cast the "#" column to integer; values that cannot be parsed as an integer
# become null.
#
# The former trailing `.fillna("cancelled", subset=["#"])` was removed: Spark
# ignores a string fill value for non-string columns, so on this integer column
# it never changed anything. The nulls are kept on purpose - a later cell
# selects the rows where "#" is null and labels them "cancelled" there.
event = event.withColumn("#", F.col("#").cast(T.IntegerType()))

In [59]:
# Two alternative join keys derived from the event title:
#   ejc1 = "Bellator " + the piece after the last "/" (the whole title if no "/")
#   ejc2 = the piece before the first ":" (the whole title if no ":")
make_ejc1 = F.concat(
    F.lit("Bellator "),
    F.element_at(F.split(F.col("event"), "/"), -1),
)
make_ejc2 = F.element_at(
    F.split(F.col("event"), ":"),
    1,
)
event = (
    event
    .withColumn("ejc1", make_ejc1)
    .withColumn("ejc2", make_ejc2)
)

In [60]:
# Split the result's event name on "/" and keep both ends as join keys:
# rjc1 is the last piece, rjc2 the first piece.
event_name_parts = F.split(F.col("event_name"), "/")
result = (
    result
    .withColumn("rjc1", F.element_at(event_name_parts, -1))
    .withColumn("rjc2", F.element_at(event_name_parts, 1))
)

In [62]:
# An event row matches a result row when ANY of these equalities holds:
#   - the event title equals the result's event name
#   - the derived key ejc1 equals rjc1
#   - the derived key ejc2 equals the result's event name
#   - the event title equals rjc2
join_conditions = (
    (event["event"] == result["event_name"])
    | (event["ejc1"] == result["rjc1"])
    | (event["ejc2"] == result["event_name"])
    | (event["event"] == result["rjc2"])
)

# Left join: every event row is kept, including events with no matching result.
df = event.join(result, on=join_conditions, how="left")

In [64]:
# Rows whose "#" is null are labelled "cancelled"; all other rows keep their
# number.
#
# The two groups are separated with complementary filters rather than
# `df.subtract(...)`: subtract is an EXCEPT DISTINCT, which would also collapse
# duplicate rows among the kept events and costs a full set-difference shuffle.
cancelled_events = (
    df.filter(F.col("#").isNull())
    .withColumn("#", F.lit("cancelled"))
)

# Cast "#" to string explicitly so both sides of the union have the same type,
# instead of relying on Spark's implicit type widening in the union.
df = (
    df.filter(F.col("#").isNotNull())
    .withColumn("#", F.col("#").cast(T.StringType()))
    .unionByName(cancelled_events)
)

In [66]:
# Give the "#" column a descriptive name.
df = df.withColumnRenamed(existing="#", new="event_num")

In [67]:
# Preview the first 10 rows of the joined table.
show(df, limit=10)

Unnamed: 0,event_num,event,date,venue,attendance,city,state,country,ejc1,ejc2,...,winner,loser,method,round,time,fight_card,event_name,link,rjc1,rjc2
0,304,Bellator 298,2023-08-11,Sanford Pentagon,,Sioux Falls,South Dakota,US,Bellator Bellator 298,Bellator 298,...,Sullivan Cauley,Cedric Savage,,,,Preliminary card (Youtube/Pluto TV),Bellator 298,https://en.wikipedia.org//wiki/Bellator_298,Bellator 298,Bellator 298
1,303,Bellator MMA x Rizin 2,2023-07-30,Saitama Super Arena,,,Saitama,Japan,Bellator Bellator MMA x Rizin 2,Bellator MMA x Rizin 2,...,A. J. McKee,Patricky Pitbull,,,,Main card,Bellator MMA x Rizin 2,https://en.wikipedia.org//wiki/Bellator_MMA_x_...,Bellator MMA x Rizin 2,Bellator MMA x Rizin 2
2,302,Bellator 297,2023-06-16,Wintrust Arena,,Chicago,Illinois,US,Bellator Bellator 297,Bellator 297,...,Ramazan Kuramagomedov,Jaleel Willis,KO (knee and punches),1.0,84.0,Preliminary card (Youtube/Pluto TV),Bellator 297,https://en.wikipedia.org//wiki/Bellator_297,Bellator 297,Bellator 297
3,283,Bellator 279,2022-04-23,Neal S. Blaisdell Arena,6516.0,Honolulu,Hawaii,US,Bellator Bellator 279,Bellator 279,...,Patchy Mix,Kyoji Horiguchi,"Decision (unanimous) (48–47, 48–47, 48–47)",5.0,300.0,Main card,Bellator 279,https://en.wikipedia.org//wiki/Bellator_279,Bellator 279,Bellator 279
4,265,Bellator 261,2021-06-25,Mohegan Sun Arena,9112.0,Uncasville,Connecticut,US,Bellator Bellator 261,Bellator 261,...,John de Jesus,John Macapá,"Decision (split) (28–29, 29–28, 29–28)",3.0,300.0,Preliminary card (Youtube/Pluto TV),Bellator 261,https://en.wikipedia.org//wiki/Bellator_261,Bellator 261,Bellator 261
5,262,Bellator 258,2021-05-07,Mohegan Sun Arena,0.0,Uncasville,Connecticut,US,Bellator Bellator 258,Bellator 258,...,Anthony Johnson,José Augusto Azevedo,KO (punch),2.0,90.0,Main card,Bellator 258,https://en.wikipedia.org//wiki/Bellator_258,Bellator 258,Bellator 258
6,238,Bellator 235,2019-12-20,Neal S. Blaisdell Arena,,Honolulu,Hawaii,US,Bellator Bellator 235,Bellator 235,...,Toby Misech,Erik Perez,KO (punches),1.0,54.0,Main card,Bellator 235,https://en.wikipedia.org//wiki/Bellator_MMA_in...,Bellator 235,Bellator 235
7,194,Bellator 194,2018-02-16,Mohegan Sun Arena,,Uncasville,Connecticut,US,Bellator Bellator 194,Bellator 194,...,Marcus Surin,Dean Hancock,Submission (arm-triangle choke),2.0,290.0,Preliminary card (Paramount Network.com),Bellator 194,https://en.wikipedia.org//wiki/Bellator_MMA_in...,Bellator 194,Bellator 194
8,182,Bellator 182,2017-08-25,Turning Stone Resort & Casino,4390.0,Verona,New York,US,Bellator Bellator 182,Bellator 182,...,Joey Davis,Justin Roswell,TKO (punches),1.0,95.0,Preliminary card (Spike.com),Bellator 182,https://en.wikipedia.org//wiki/Bellator_MMA_in...,Bellator 182,Bellator 182
9,181,Bellator 181,2017-07-14,WinStar World Casino,1247.0,Thackerville,Oklahoma,US,Bellator Bellator 181,Bellator 181,...,Derek Campos,Brandon Girtz,TKO (doctor stoppage),2.0,300.0,Main card,Bellator 181,https://en.wikipedia.org//wiki/Bellator_MMA_in...,Bellator 181,Bellator 181
