In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import os

In [2]:
def find_csv_files(folder_path):
    """Recursively collect the paths of all ``.csv`` files under a folder.

    Args:
        folder_path: Directory to walk; its subdirectories are visited too.

    Returns:
        A list of paths, each built by joining the directory reported by
        ``os.walk`` with the file name, in the order ``os.walk`` yields them.
        The suffix check is case-sensitive, so ``.CSV`` files are skipped.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.csv')
    ]

In [3]:
# Root directory that is searched for raw CSV files (absolute, machine-specific path).
folder = r"C:\Development\ultimateNakMuay\data\raw"
# Every .csv path found under `folder`, subdirectories included.
files = find_csv_files(folder)

In [4]:
# Split the discovered paths into two groups. The substring test runs against
# the whole path (directory names included), and a path containing both words
# lands in both lists.
events = list(filter(lambda path: "event" in path, files))
results = list(filter(lambda path: "result" in path, files))

In [5]:
# This file sits under data\processed rather than the raw folder that was
# scanned, so it is added to the results paths by hand. The membership check
# keeps the cell idempotent: re-running it must not queue the same path twice,
# which would later load a duplicate DataFrame.
extra_results_path = r"C:\Development\ultimateNakMuay\data\processed\wiki_fc_ufc.csv"
if extra_results_path not in results:
    results.append(extra_results_path)

In [6]:
# Get the Spark session for this notebook, creating one against the
# "local[*]" master if none exists yet.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [7]:
# Load each events CSV into its own DataFrame, using the first row as header.
# NOTE: this rebinds `events` from a list of paths to a list of DataFrames, so
# re-running this cell on its own would hand DataFrames to spark.read.csv.
events = [
    spark.read.csv(csv_path, header=True)
    for csv_path in events
]

In [8]:
# List each events frame's position next to its column names so the differing
# headers can be compared side by side.
for i, df in enumerate(events):
    print(f"{i} {df.columns}")

0 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance']
1 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance']
2 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance']
3 ['#', 'Event', 'Date', 'Venue', 'City']
4 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance', 'Ref.']


In [9]:
# Load each results CSV into its own DataFrame, using the first row as header.
# NOTE: this rebinds `results` from a list of paths to a list of DataFrames, so
# re-running this cell on its own would hand DataFrames to spark.read.csv.
results = [
    spark.read.csv(csv_path, header=True)
    for csv_path in results
]

In [10]:
# List each results frame's position next to its column names so the differing
# headers can be compared side by side.
for i, df in enumerate(results):
    print(f"{i} {df.columns}")

0 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'notes', 'fight_card', 'event_name', 'link']
1 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'notes', 'fight_card', 'event_name', 'link', 'weight']
2 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'notes', 'fight_card', 'event_name', 'event']
3 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'notes', 'fight_card', 'event_name', 'event_id']


In [15]:
# Inspect the events frame at position 4 -- the one whose header includes a
# 'Ref.' column. toPandas() collects every row of the frame to the driver.
# NOTE(review): the index depends on the order in which the CSV paths were found.
events[4].toPandas()

Unnamed: 0,#,Event,Date,Venue,Location,Attendance,Ref.
0,652,UFC on ABC: Emmett vs. Topuria,"Jun 24, 2023",VyStar Veterans Memorial Arena,"Jacksonville, Florida, U.S.",,
1,651,UFC on ESPN: Vettori vs. Cannonier,"Jun 17, 2023",UFC Apex,"Las Vegas, Nevada, U.S.",—,[24]
2,650,UFC 289: Nunes vs. Aldana,"Jun 10, 2023",Rogers Arena,"Vancouver, British Columbia, Canada",17628,[25]
3,649,UFC on ESPN: Kara-France vs. Albazi,"Jun 3, 2023",UFC Apex,"Las Vegas, Nevada, U.S.",—,[26]
4,648,UFC Fight Night: Dern vs. Hill,"May 20, 2023",[27],,,
...,...,...,...,...,...,...,...
674,,UFC 291: Poirier vs. Gaethje 2,"Jul 29, 2023",Delta Center,"Salt Lake City, Utah, U.S.",,[19]
675,,UFC Fight Night: Aspinall vs. Tybura,"Jul 22, 2023",The O2 Arena,"London, England, UK",,[20]
676,,UFC Fight Night: Holm vs. Bueno Silva,"Jul 15, 2023",UFC Apex,"Las Vegas, Nevada, U.S.",,[21]
677,,UFC 290: Volkanovski vs. Rodríguez,"Jul 8, 2023",T-Mobile Arena,[22],,


In [11]:
# Preview up to five rows of the results frame at position 1 that have a value
# in its extra `weight` column. The limit is applied in Spark, before
# toPandas(), so only the previewed rows are collected to the driver instead
# of every matching row.
(
    results[1]
    .filter(F.col("weight").isNotNull())
    .limit(5)
    .toPandas()
)

Unnamed: 0,weight_class,winner,loser,method,round,time,notes,fight_card,event_name,link,weight
0,,Kento Haraguchi,Serhiy Adamchuk,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Featherweight 65 kg
1,,Kaito,Stoyan Koprivlenski,Decision (Split),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Lightweight 70 kg
2,,Petpanomrung Kiatmuu9,Kosei Yamada,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Featherweight 65 kg
3,,Taiju Shiratori,Ilias Banniss,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Featherweight 65 kg
4,,Chadd Collins,Hiroki Kasahara,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Catchweight 63.5 kg


Glory: coalesce the `weight` and `weight_class` columns

In [15]:
# describe() only builds a summary DataFrame; with no action such as .show(),
# the cell displays just that frame's schema (every column typed as string).
events[2].describe()

DataFrame[summary: string, #: string, Event: string, Date: string, Venue: string, Location: string, Attendance: string]

In [16]:
# describe() only builds a summary DataFrame; with no action such as .show(),
# the cell displays just that frame's schema (every column typed as string).
results[0].describe()

DataFrame[summary: string, weight_class: string, winner: string, loser: string, method: string, round: string, time: string, notes: string, fight_card: string, event_name: string, link: string]

TODO: cast schema — every column in the frames above is typed as a string