In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Todo:
Read url links from scraper_one_results

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_ONE_Championship_events#Events"

soup = BeautifulSoup(requests.get(url).text,"html.parser")

table = soup.find_all("table", class_ = "sortable")[0]
table_rows = table.find_all("tr")[1:]

anchors = [_.find_all("a") for _ in table_rows if len(_.find_all("a"))>0]
cols = ["link", "name"]

links_df = pd.DataFrame([(anchor[0].get("href"), anchor[0].text) for anchor in anchors], columns=cols)


In [3]:
clean_link = lambda x: f"https://en.wikipedia.org/{x.split('#')[0]}"
links_df["link_clean"] = links_df["link"].apply(clean_link)
links = links_df["link_clean"].value_counts().reset_index().iloc[:,0]
links = links.to_list()
urls = list(set([link for link in links if "one_" in link.lower()]))

# Todo:
add function to extract event card from df

In [4]:
def cleanResults(result):
    event_name = result["event"]
    df = result["df"]
    df.columns = [_[1] for _ in df.columns]

    split_by_card = [(i, df.iloc[i,0]) for i in df[df.eq(df.iloc[:, 0], axis=0).all(axis=1)].index]
    if 0 not in [_[0] for _ in split_by_card]:
        split_by_card = [(0, "Main card"), *split_by_card]

    for i in range(len(split_by_card)-1):
        split_by_card[i] = (split_by_card[i][0], split_by_card[i+1][0], split_by_card[i][1])

    split_by_card[-1] = (split_by_card[-1][0], len(df), split_by_card[-1][1])

    sdfs = []
    for start, end, card in split_by_card:
        sdf = df.iloc[start:end, :]
        sdf = sdf.assign(fight_card = card)
        sdfs.append(sdf)
    
    df = pd.concat(sdfs).drop([i[0] for i in split_by_card[1:]]).reset_index(drop=True)

    cols2rename = {
        x: x.lower().replace(' ', '_') for x in df.columns
    }

    cols2rename = {
        **cols2rename,
        **{
            "Unnamed: 1_level_1": "winner",
            "Unnamed: 3_level_1": "loser",
        },
    }

    df.rename(columns=cols2rename, inplace=True)

    df.drop(columns=["unnamed:_2_level_1"], axis=1, inplace=True)

    df = df.assign(event_name = event_name)

    return df

In [11]:
def getData(url):
    dfs = []
    soup = BeautifulSoup(requests.get(url).text)
    header_patterns = ["one championship:", "road to one", "one on", "one fighting championship", "hero series", "warrior series"]
    pattern_match = lambda x: len([_ for _ in header_patterns if _ in x.lower()]) > 0

    # event_headers = soup.find_all(lambda tag: tag.name == "h2" and "one" in tag.text.lower() and "cancelled" not in tag.text.lower())
    event_headers = [header for header in soup.find_all(lambda tag: tag.name == "h2" and "cancelled" not in tag.text.lower()) if pattern_match(header.text)]

    if not event_headers:
        event_headers = soup.find_all(lambda tag: tag.name == "h1" and "one" in tag.text.lower() and "cancelled" not in tag.text.lower())
    
    table_classes = [_.get("class") for _ in event_headers[0].find_all_next("table")]

    table_class = "wikitable" if "toccolours" not in [x for xs in list(filter(lambda x: x!=None, table_classes)) for x in xs] else "toccolours"
    
    for event in event_headers:

        event_name = event.text.replace("[edit]", "")
        tables = event.find_next("table", class_ = table_class)
        if tables != None:
            df = pd.read_html(str(tables))
        else:
            df = pd.read_html(str(event.find_next("table", class_ = "wikitable")))
        
        df = df[0]
        df = cleanResults(result={"event": event_name, "df": df})
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

# Todo:
Convert section to pyspark

In [6]:
[url for url in urls if "2020" in url]

['https://en.wikipedia.org//wiki/2020_in_ONE_Championship']

In [12]:
getData('https://en.wikipedia.org//wiki/2020_in_ONE_Championship')['event_name'].unique()

array(['ONE Championship: A New Tomorrow',
       'ONE Championship: Fire & Fury',
       "ONE Championship: Warrior's Code", 'ONE Warrior Series 10',
       'ONE Championship: King of the Jungle', 'ONE Hero Series 13',
       'ONE Hero Series 14', 'ONE Championship: No Surrender',
       'ONE Championship: No Surrender 2',
       'ONE Championship: No Surrender 3',
       'ONE Championship: A New Breed',
       'Road to ONE 3: Tokyo Fight Night',
       'ONE Championship: A New Breed 2',
       'ONE Championship: A New Breed 3',
       'ONE Championship: Reign of Dynasties',
       'ONE Championship: Reign of Dynasties 2',
       'ONE Championship: Inside the Matrix',
       'ONE Championship: Inside the Matrix 2',
       'ONE Championship: Inside the Matrix 3',
       'ONE Championship: Inside the Matrix 4',
       'Road to ONE 4: Fair Fight 13', 'ONE Championship: Big Bang',
       'ONE Championship: Big Bang 2', 'Road to ONE 5: WSS',
       'ONE Championship: Collision Course',
   

In [25]:
df = pd.concat([getData(url) for url in urls], ignore_index=True)

In [30]:
df

Unnamed: 0,weight_class,winner,loser,method,round,time,notes,fight_card,event_name,event
0,Middleweight,Reinier de Ridder (c),Vitaly Bigdash,Technical Submission (inverted triangle choke),1,3:29,[a],Main card,ONE 159,
1,Women's Atomweight Muay Thai,Janet Todd,Lara Fernandez,Decision (unanimous),5,3:00,[b],Main card,ONE 159,
2,Bantamweight Muay Thai,Muangthai P.K.Saenchai,Vladimir Kuzmin,Decision (split),3,3:00,,Main card,ONE 159,
3,Strawweight Kickboxing,Zhang Peimian,Aslanbek Zikreev,Decision (unanimous),3,3:00,,Main card,ONE 159,
4,Featherweight Muay Thai,Jamal Yusupov,Jo Nattawut,Decision (unanimous),3,3:00,,Main card,ONE 159,
...,...,...,...,...,...,...,...,...,...,...
2269,Women's Flyweight,Irina Mazepa,Ana Julaton,Decision (unanimous),3,5:00,,Main card,ONE Championship: Spirit of Champions,
2270,Women's Strawweight,Angela Lee,Lena Tkhorevska,Submission (rear-naked choke),2,3:26,,Main card,ONE Championship: Spirit of Champions,
2271,Featherweight,Bruno Pucci,Anthony Engelen,Submission (rear-naked choke),2,2:19,,Main card,ONE Championship: Spirit of Champions,
2272,Flyweight,Alex Silva,Ruel Catalan,Submission (kneebar),1,3:53,,Main card,ONE Championship: Spirit of Champions,


In [52]:
raw_df = pd.read_csv(r"C:\Development\ultimateNakMuay\data\raw\wiki_events_onefc.csv")

In [100]:
event_exists = lambda x: (
    (x.replace("[edit]", "") in raw_df["Event"].to_list()) or
    (x.replace("[edit]", "").replace(" Championship", "") in raw_df["Event"].to_list()) or
    (x.split(":")[-1].replace("[edit]", "").strip() in [_.split(":")[-1].strip() for _ in raw_df["Event"].to_list()]) or
    (x.split(":")[0].replace("[edit]", "").strip() in [_.split(":")[0].strip() for _ in raw_df["Event"].to_list()])
)

In [101]:
set([event for event in df["event_name"].to_list() if event_exists(event) == False])

set()

In [99]:
raw_df[raw_df["Event"].str.contains("ONE Fight Night 8")]

Unnamed: 0,#,Event,Date,Venue,Location,Attendance
17,236,ONE Fight Night 8: Superlek vs. Williams,"March 25, 2023",Singapore Indoor Stadium,"Kallang, Singapore",
