In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_Bellator_MMA_events"

soup = BeautifulSoup(
    requests.get(url).text,
    "html.parser"
)

In [3]:
table = soup.find_all("table", class_ = "sortable")[0]
table_rows = table.find_all("tr")[1:]
anchors = [_.find_all("a") for _ in table_rows if len(_.find_all("a"))>0]
cols = ["link", "name"]

In [4]:
links_df = pd.DataFrame([(anchor[0].get("href"), anchor[0].text) for anchor in anchors], columns=cols)
clean_link = lambda x: f"https://en.wikipedia.org/{x.split('#')[0]}"
links_df["link_clean"] = links_df["link"].apply(clean_link)
links = links_df["link_clean"].value_counts().reset_index().iloc[:,0]
links = links.to_list()

In [5]:
def cleanResults(result):
    event_name = result["event"]
    df = result["df"]
    df.columns = [_[1] for _ in df.columns]

    split_by_card = [(i, df.iloc[i,0]) for i in df[df.eq(df.iloc[:, 0], axis=0).all(axis=1)].index]
    if 0 not in [_[0] for _ in split_by_card]:
        split_by_card = [(0, "Main card"), *split_by_card]

    for i in range(len(split_by_card)-1):
        split_by_card[i] = (split_by_card[i][0], split_by_card[i+1][0], split_by_card[i][1])

    split_by_card[-1] = (split_by_card[-1][0], len(df), split_by_card[-1][1])

    sdfs = []
    for start, end, card in split_by_card:
        sdf = df.iloc[start:end, :]
        sdf = sdf.assign(fight_card = card)
        sdfs.append(sdf)
    
    df = pd.concat(sdfs).drop([i[0] for i in split_by_card[1:]]).reset_index(drop=True)

    cols2rename = {
        x: x.lower().replace(' ', '_') for x in df.columns
    }

    cols2rename = {
        **cols2rename,
        **{
            "Unnamed: 1_level_1": "winner",
            "Unnamed: 3_level_1": "loser",
        },
    }

    df.rename(columns=cols2rename, inplace=True)

    df.drop(columns=["unnamed:_2_level_1"], axis=1, inplace=True)

    df = df.assign(event_name = event_name)

    return df

In [18]:
def getData(url):
    dfs = []
    soup = BeautifulSoup(requests.get(url).text)
    header_patterns = ["bellator"]
    pattern_match = lambda x: len([_ for _ in header_patterns if _ in x.lower()]) > 0

    # event_headers = soup.find_all(lambda tag: tag.name == "h2" and "one" in tag.text.lower() and "cancelled" not in tag.text.lower())
    event_headers = [
        header for header in soup.find_all(
        lambda tag: 
        tag.name == "h2" and "cancelled" not in tag.text.lower() and "tournament" not in tag.text.lower()
        ) if pattern_match(header.text)
        ]

    if not event_headers:
        event_headers = soup.find_all(lambda tag: tag.name == "h1" and "bellator" in tag.text.lower() and "cancelled" not in tag.text.lower())
    
    table_classes = [_.get("class") for _ in event_headers[0].find_all_next("table")]

    table_class = "wikitable" if "toccolours" not in [x for xs in list(filter(lambda x: x!=None, table_classes)) for x in xs] else "toccolours"
    
    for event in event_headers:

        event_name = event.text.replace("[edit]", "")
        tables = event.find_next("table", class_ = table_class)
        if tables != None:
            df = pd.read_html(str(tables))
        else:
            df = pd.read_html(str(event.find_next("table", class_ = "wikitable")))
        
        df = df[0]
        df = cleanResults(result={"event": event_name, "df": df})
        df = df.assign(link = url)
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

In [20]:
dfs = []
failed_links = []

for link in links:
    try:
        dfs.append(getData(link))
    except:
        failed_links.append(link)

In [23]:
df = pd.concat(dfs, ignore_index=True)

In [28]:
links_df = links_df.drop("link", axis=1).rename(columns={"link_clean": "link"})

In [32]:
df.head()

Unnamed: 0,weight_class,winner,loser,method,round,time,notes,fight_card,event_name,link
0,Heavyweight,Ryan Bader,Fedor Emelianenko,TKO (punches),1,0:35,[a],Main card,Bellator 214,https://en.wikipedia.org//wiki/Bellator_MMA_in...
1,Featherweight,Henry Corrales,Aaron Pico,KO (punches),1,1:07,,Main card,Bellator 214,https://en.wikipedia.org//wiki/Bellator_MMA_in...
2,Heavyweight,Jake Hager,J.W. Kiser,Submission (arm-triangle choke),1,2:09,,Main card,Bellator 214,https://en.wikipedia.org//wiki/Bellator_MMA_in...
3,Bantamweight,Juan Archuleta,Ricky Bandejas,"Decision (unanimous) (29–28, 29–28, 29–28)",3,5:00,,Main card,Bellator 214,https://en.wikipedia.org//wiki/Bellator_MMA_in...
4,Featherweight,Adel Altamimi,Brandon McMahan,Submission (armbar),1,1:16,,Main card,Bellator 214,https://en.wikipedia.org//wiki/Bellator_MMA_in...


In [34]:
links_df.merge(df, on="link")

Unnamed: 0,name,link,weight_class,winner,loser,method,round,time,notes,fight_card,event_name
0,Bellator 299,https://en.wikipedia.org//wiki/Bellator_299,Middleweight,Johnny Eblen (c),Fabian Edwards,,,,[a],Main card,Bellator 299
1,Bellator 299,https://en.wikipedia.org//wiki/Bellator_299,Featherweight,Aaron Pico,Pedro Carvalho,,,,,Main card,Bellator 299
2,Bellator 299,https://en.wikipedia.org//wiki/Bellator_299,Featherweight,Brian Moore,Otto Rodrigues,,,,,Main card,Bellator 299
3,Bellator 299,https://en.wikipedia.org//wiki/Bellator_299,Featherweight,Mads Burnell,Daniel Weichel,,,,,Main card,Bellator 299
4,Bellator 299,https://en.wikipedia.org//wiki/Bellator_299,Women's Featherweight,Sinead Kavanagh,Sara Collins,,,,,Main card,Bellator 299
...,...,...,...,...,...,...,...,...,...,...,...
60145,Bellator 1,https://en.wikipedia.org//wiki/Bellator_1,Featherweight,Fabio Mello,Sami Aziz,Submission (armbar),3,1:58,,Non-tournament bouts,Bellator 12
60146,Bellator 1,https://en.wikipedia.org//wiki/Bellator_1,Welterweight,Sérgio Moraes,Josh Martin,Submission (triangle choke),1,4:21,,Non-tournament bouts,Bellator 12
60147,Bellator 1,https://en.wikipedia.org//wiki/Bellator_1,Women's bout (125 lb),Rosi Sexton,Valerie Coolbaugh,Submission (armbar),1,3:40,,Non-tournament bouts,Bellator 12
60148,Bellator 1,https://en.wikipedia.org//wiki/Bellator_1,Catchweight (151.5 lbs),Luis Palomino,Troy Gerhart,Decision (unanimous),3,5:00,,Non-tournament bouts,Bellator 12
