In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F

In [5]:
def show(df, limit = 5):
    """Return the first ``limit`` rows of a Spark DataFrame as a pandas DataFrame.

    The row limit is applied on the Spark side *before* converting to pandas,
    so only ``limit`` rows are collected to the driver instead of the whole
    dataset.

    Parameters
    ----------
    df : object exposing ``limit(n)`` and ``toPandas()`` (e.g. a Spark DataFrame)
    limit : int, default 5
        Number of rows to preview. Negative values keep the pandas
        ``head(-n)`` meaning ("all but the last n rows"), which requires
        collecting the full frame.

    Returns
    -------
    pandas.DataFrame
    """
    if limit < 0:
        # Spark's limit() rejects negative values; fall back to pandas semantics.
        return df.toPandas().head(limit)
    return df.limit(limit).toPandas()

In [3]:
# Create (or reuse) a Spark session with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [47]:
# Load the processed UFC events CSV; the first row supplies the column names.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
events = spark.read.option("header", True).csv(
    r"C:\Development\ultimateNakMuay\data\processed\wiki_events_ufc.csv"
)

In [48]:
events = (
    events
    # Parse the textual date column into a proper date.
    .withColumn("date", F.to_date("date"))
    # Drop thousands separators ("17,628" -> "17628"), then cast to integer.
    .withColumn("attendance", F.regexp_replace("attendance", ",", "").cast("int"))
    # An em dash is used as a placeholder for "no location"; turn it into NULL.
    .withColumn(
        "location",
        F.when(F.col("location") == "—", F.lit(None)).otherwise(F.col("location")),
    )
)

In [49]:
# Normalise locations that end in "U.S" (missing final period) to "U.S.".
# The pattern escapes the dot and is anchored to the end of the string, so only
# the trailing literal "U.S" is rewritten. An unescaped, unanchored "U.S" would
# also match any "U<any char>S" sequence elsewhere in the location.
# Strings already ending in "U.S." do not match and are left untouched;
# NULL locations stay NULL.
events = events.withColumn(
    "location",
    F.regexp_replace(F.col("location"), r"U\.S$", "U.S."),
)

In [50]:
# Distinct (venue, location) pairs taken from events that have a location.
venue_map = (
    events
    .select("venue", "location")
    .where(F.col("location").isNotNull())
    .distinct()
)

In [57]:
# Rename the "State Farm Arena" entry located in Hidalgo, Texas to "Payne Arena",
# then expose the mapped location under the name "location_filled".
# NOTE(review): presumably this avoids a venue-name clash with another
# "State Farm Arena" — confirm. Only venue_map is renamed here, not `events`.
venue_map = (
    venue_map
    .withColumn(
        "venue",
        F.when(
            (F.col("venue") == "State Farm Arena")
            & (F.col("location") == "Hidalgo, Texas, U.S."),
            F.lit("Payne Arena"),
        ).otherwise(F.col("venue")),
    )
    .withColumnRenamed("location", "location_filled")
)


In [59]:
# Fill in missing event locations from the venue -> location mapping.
# An event's own location is kept when present; the mapped value is used only
# when it is NULL. Overwriting unconditionally would relocate events whose
# venue name matches a mapping row that belongs to a different place.
# NOTE(review): a venue that maps to more than one location in venue_map will
# duplicate event rows in this join — confirm venue_map is unique on "venue".
events = (
    events
    .join(venue_map, on=["venue"], how="left")
    .withColumn("location", F.coalesce(F.col("location"), F.col("location_filled")))
    .drop("location_filled")
)

In [62]:
# Break "City, State, Country" into separate columns.
# The split pattern swallows whitespace around each comma (and the whole string
# is trimmed first), so the parts carry no leading/trailing spaces. Splitting
# on a bare "," leaves values such as " Florida" and " US".
events = (
    events
    .withColumn("location", F.split(F.trim(F.col("location")), r"\s*,\s*"))
    # city is only populated for three-part locations; otherwise it stays NULL.
    .withColumn(
        "city",
        F.when(F.size(F.col("location")) == 3, F.element_at(F.col("location"), 1)),
    )
    # Three-part locations take the middle element as state; any other shape
    # falls back to the first element.
    # NOTE(review): for two-part locations this puts the first part in "state"
    # and leaves "city" NULL — confirm that is intended.
    .withColumn(
        "state",
        F.when(
            F.size(F.col("location")) == 3, F.element_at(F.col("location"), 2)
        ).otherwise(F.element_at(F.col("location"), 1)),
    )
    # country is always the last element, with punctuation removed ("U.S." -> "US").
    .withColumn("country", F.element_at(F.col("location"), -1))
    .withColumn("country", F.regexp_replace(F.col("country"), "[^a-zA-Z0-9 ]", ""))
    .drop("location")
)

In [63]:
# Preview the first rows after splitting location into city/state/country.
show(events, limit=5)

Unnamed: 0,venue,event_num,event,date,attendance,event_id,city,state,country
0,VyStar Veterans Memorial Arena,652,UFC on ABC: Emmett vs. Topuria,2023-06-24,,f796c71c34c1c47ec8d9ac26c5f015d2f0c36f4c1a6e1d...,Jacksonville,Florida,US
1,UFC Apex,651,UFC on ESPN: Vettori vs. Cannonier,2023-06-17,,b9d858650af3b4603d1f73dd9d46d337644d942d3b3c62...,Las Vegas,Nevada,US
2,Rogers Arena,650,UFC 289: Nunes vs. Aldana,2023-06-10,17628.0,a8b3628f1a194459323a285a9e55aaa74035164ad6967a...,Vancouver,British Columbia,Canada
3,UFC Apex,649,UFC on ESPN: Kara-France vs. Albazi,2023-06-03,,a8b4dda55b9d768282cdbfddcf6edd34b8539d638d9cad...,Las Vegas,Nevada,US
4,UFC Apex,648,UFC Fight Night: Dern vs. Hill,2023-05-20,,d8efd546a501350524b7b0987463472ddc45083f561753...,Las Vegas,Nevada,US


In [64]:
# Final column order for the events table.
cols = [
    "event_num",
    "event",
    "date",
    "venue",
    "city",
    "state",
    "country",
    "attendance",
    "event_id",
]

# Keep only these columns, in this order.
events = events.select(cols)

In [65]:
# Preview the first rows of the reordered events table.
show(events, limit=5)

Unnamed: 0,event_num,event,date,venue,city,state,country,attendance,event_id
0,652,UFC on ABC: Emmett vs. Topuria,2023-06-24,VyStar Veterans Memorial Arena,Jacksonville,Florida,US,,f796c71c34c1c47ec8d9ac26c5f015d2f0c36f4c1a6e1d...
1,651,UFC on ESPN: Vettori vs. Cannonier,2023-06-17,UFC Apex,Las Vegas,Nevada,US,,b9d858650af3b4603d1f73dd9d46d337644d942d3b3c62...
2,650,UFC 289: Nunes vs. Aldana,2023-06-10,Rogers Arena,Vancouver,British Columbia,Canada,17628.0,a8b3628f1a194459323a285a9e55aaa74035164ad6967a...
3,649,UFC on ESPN: Kara-France vs. Albazi,2023-06-03,UFC Apex,Las Vegas,Nevada,US,,a8b4dda55b9d768282cdbfddcf6edd34b8539d638d9cad...
4,648,UFC Fight Night: Dern vs. Hill,2023-05-20,UFC Apex,Las Vegas,Nevada,US,,d8efd546a501350524b7b0987463472ddc45083f561753...
