In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import os

In [2]:
def find_csv_files(folder_path):
    """Recursively collect the paths of CSV files below ``folder_path``.

    Args:
        folder_path: Directory that is walked with ``os.walk``.

    Returns:
        list[str]: One path per file whose name ends with ``'.csv'``
        (case-sensitive match), built by joining the containing directory
        with the file name. Paths appear in ``os.walk`` traversal order;
        the list is empty when nothing matches.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith('.csv')
    ]

In [3]:
# Directory that is searched for input CSVs (the project's data\raw folder).
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# as-is where this directory exists; consider a configurable data directory.
folder = r"C:\Development\ultimateNakMuay\data\raw"
# Recursively gather the path of every *.csv file found under `folder`.
files = find_csv_files(folder)

In [4]:
# Split the discovered CSV paths by substring: a path goes to `events` when it
# contains "event" and to `results` when it contains "result". The test is
# applied to the whole path (directories included), not only the file name.
events = list(filter(lambda csv_path: "event" in csv_path, files))
results = list(filter(lambda csv_path: "result" in csv_path, files))

In [5]:
# wiki_fc_ufc.csv lives under data\processed rather than data\raw, so the scan
# of the raw folder cannot pick it up; add it to the results paths by hand.
# The membership guard keeps this cell idempotent: re-running it does not
# append the same path a second time (which would load the file twice later).
# NOTE(review): absolute, machine-specific path — confirm it exists where this runs.
ufc_results_csv = r"C:\Development\ultimateNakMuay\data\processed\wiki_fc_ufc.csv"
if ufc_results_csv not in results:
    results.append(ufc_results_csv)

In [6]:
# Start (or reuse) a Spark session in local mode; the "local[*]" master uses as
# many worker threads as the machine has logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [7]:
# Load every event CSV into a Spark DataFrame, taking column names from the
# first row. No schema is supplied, so the reader's default column types apply.
# NOTE: `events` is rebound here from a list of paths to a list of DataFrames,
# so re-running this cell on its own would hand DataFrames (not paths) to the
# reader; re-run the cell that builds the path lists first.
events = list(map(lambda csv_path: spark.read.csv(csv_path, header=True), events))

In [8]:
# Print each events DataFrame's list index next to its column names so the
# schemas of the different sources can be compared side by side.
for i, df in enumerate(events):
    print(i, df.columns)

0 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance']
1 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance']
2 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance']
3 ['#', 'Event', 'Date', 'Venue', 'City']
4 ['#', 'Event', 'Date', 'Venue', 'Location', 'Attendance', 'Ref.']


In [9]:
# Load every results CSV into a Spark DataFrame, taking column names from the
# first row. No schema is supplied, so the reader's default column types apply.
# NOTE: `results` is rebound here from a list of paths to a list of DataFrames,
# so re-running this cell on its own would hand DataFrames (not paths) to the
# reader; re-run the cells that build the path list first.
results = list(map(lambda csv_path: spark.read.csv(csv_path, header=True), results))

In [10]:
# Print each results DataFrame's list index next to its column names so the
# schemas of the different sources can be compared side by side.
for i, df in enumerate(results):
    print(i, df.columns)

0 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'notes', 'fight_card', 'event_name', 'link']
1 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'notes', 'fight_card', 'event_name', 'link', 'weight']
2 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'notes', 'fight_card', 'event_name', 'event']
3 ['weight_class', 'winner', 'loser', 'method', 'round', 'time', 'fight_card', 'event_name', 'event_id']


In [11]:
# Pull the events DataFrame at index 3 into pandas to eyeball its rows.
# NOTE(review): the index is positional and depends on the order in which the
# CSV paths were discovered — confirm it still points at the intended source.
events[3].toPandas()

Unnamed: 0,#,Event,Date,Venue,City
0,90,THAI FIGHT Luk Luang Phor Sothorn,"June 18, 2023",Community Dome at Wat Saman Rattanaram Hospital,"Chachoengsao province, Thailand"
1,89,THAI FIGHT 100 Years Rajabhat Korat,"May 21, 2023",Nakhon Ratchasima Rajabhat University,"Nakhon Ratchasima, Thailand"
2,88,THAI FIGHT Rome 2,"April 22, 2023",Palacesaroni,"Rome, Italy"
3,87,THAI FIGHT KonLakPathum 2,"February 5, 2023",Jumbo Market,"Pathum Thani, Thailand"
4,86,THAI FIGHT Luang Phor Ruay,"February 5, 2023",Courtyard beside Wat Cherng Khao,"Saraburi, Thailand"
...,...,...,...,...,...
86,5,THAI FIGHT Extreme 2011: Hong Kong,"July 17, 2011",AsiaWorld Arena,"Hong Kong, SAR, China"
87,4,THAI FIGHT Extreme 2011: France,"May 14, 2011",Palm Beach Cannes,"Cannes, France"
88,3,THAI FIGHT 2010: Competition for the Champions...,"December 6, 2010",80th Birthday Stadium,"Nakhon Ratchasima, Thailand"
89,2,THAI FIGHT 2010: 2nd Round,"October 23, 2010",Indoor Stadium Huamark,"Bangkok, Thailand"


In [11]:
# Preview the first five rows of results[1] whose `weight` column is populated.
# toPandas() materialises every matching row before head(5) trims the preview.
(
    results[1]
    .filter(F.col("weight").isNotNull())
    .toPandas()
    .head(5)
)

Unnamed: 0,weight_class,winner,loser,method,round,time,notes,fight_card,event_name,link,weight
0,,Kento Haraguchi,Serhiy Adamchuk,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Featherweight 65 kg
1,,Kaito,Stoyan Koprivlenski,Decision (Split),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Lightweight 70 kg
2,,Petpanomrung Kiatmuu9,Kosei Yamada,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Featherweight 65 kg
3,,Taiju Shiratori,Ilias Banniss,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Featherweight 65 kg
4,,Chadd Collins,Hiroki Kasahara,Decision (Unanimous),3,3:00,,Main card,Glory Rivals 4,https://en.wikipedia.org//wiki/2022_in_Glory,Catchweight 63.5 kg


glory: coalesce `weight` and `weight_class` — in the rows previewed above, `weight` is populated while `weight_class` is empty

In [15]:
# describe() builds a new DataFrame of summary statistics; with no action such
# as .show() called on it, the cell only displays that DataFrame's repr
# (column names and types), not the statistics themselves.
# NOTE(review): the summary frame's columns are all strings, so this repr is
# presumably not a view of the source column types — use printSchema() or
# .dtypes on events[2] if checking the schema is the goal.
events[2].describe()

DataFrame[summary: string, #: string, Event: string, Date: string, Venue: string, Location: string, Attendance: string]

In [16]:
# describe() builds a new DataFrame of summary statistics; with no action such
# as .show() called on it, the cell only displays that DataFrame's repr
# (column names and types), not the statistics themselves.
# NOTE(review): the summary frame's columns are all strings, so this repr is
# presumably not a view of the source column types — use printSchema() or
# .dtypes on results[0] if checking the schema is the goal.
results[0].describe()

DataFrame[summary: string, weight_class: string, winner: string, loser: string, method: string, round: string, time: string, notes: string, fight_card: string, event_name: string, link: string]

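cast schema: the CSVs are read with only `header=True` (no schema supplied), so the columns still need to be cast to proper types (e.g. dates, numbers)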