In [33]:
import pickle
from typing import Dict, List, Tuple
import sys
import requests
from bs4 import BeautifulSoup

In [46]:
import os
from pathlib import Path

# Root directory for every file this notebook reads or writes.
# NOTE(review): this is a hardcoded absolute Windows path; the notebook will
# only run unchanged on a machine with this exact directory layout.
BASE_PATH = Path("D:/code/frankie_edgar_stan_zone") / "data"
# Pickle caches of scraped URLs (read and written by UFCLinks).
FIGHT_LINKS_PICKLE = BASE_PATH / "fight_links.pickle"
PAST_EVENT_LINKS_PICKLE = BASE_PATH / "past_event_links.pickle"
# The two fighter-related pickles below are not referenced in the visible cells.
PAST_FIGHTER_LINKS_PICKLE = BASE_PATH / "past_fighter_links.pickle"
SCRAPED_FIGHTER_DATA_DICT_PICKLE = BASE_PATH / "scraped_fighter_data_dict.pickle"
# CSV outputs of FightDataScraper: a temporary file for newly scraped fights
# and the accumulated file holding every fight scraped so far.
NEW_FIGHTS_DATA_PATH = BASE_PATH / "new_fight_data.csv"
TOTAL_FIGHTS_DATA_PATH = BASE_PATH / "raw_total_fight_data.csv"
# The remaining paths are not referenced in the visible cells.
PREPROCESSED_DATA_PATH = BASE_PATH / "preprocessed_data.csv"
FIGHTER_DETAILS_DATA_PATH = BASE_PATH / "raw_fighter_details.csv"
UFC_DATA_PATH = BASE_PATH / "data.csv"


In [42]:
def make_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download ``url`` and parse the response body with ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the whole scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response text.

    Redirects are not followed, and the HTTP status code is not checked, so an
    error page is parsed like any other page.
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Non-ASCII characters are replaced with "?" before parsing.
    plain_text = response.text.encode("ascii", "replace")
    return BeautifulSoup(plain_text, "html.parser")

In [43]:
def print_progress(
    iteration: int,
    total: int,
    prefix: str = "",
    suffix: str = "",
    decimals: int = 1,
    bar_length: int = 50,
) -> None:
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        bar_length  - Optional  : character length of bar (Int)

    The line is rewritten in place using a carriage return; a newline is
    emitted once ``iteration == total``. A ``total`` of zero is rendered as a
    full bar instead of raising ZeroDivisionError.
    """
    if total:
        percent_value = 100 * (iteration / float(total))
        filled_length = int(round(bar_length * iteration / float(total)))
    else:
        # Nothing to iterate over: show the bar as already complete.
        percent_value = 100.0
        filled_length = bar_length

    # Honour the documented ``decimals`` argument rather than a fixed precision.
    percents = f"{percent_value:.{decimals}f}"
    bar = f'{"█" * filled_length}{"-" * (bar_length - filled_length)}'

    sys.stdout.write(f"\r{prefix} |{bar}| {percents}% {suffix}")

    if iteration == total:
        sys.stdout.write("\n")
    sys.stdout.flush()


In [58]:


class UFCLinks:
    """Collect event and fight URLs and cache them in pickle files.

    Constructing an instance immediately downloads the events index page and
    rewrites the past-event pickle (see ``_get_updated_event_links``).

    Attributes:
        all_events_url: URL of the page listing all completed events.
        PAST_EVENT_LINKS_PICKLE_PATH: pickle holding the event URLs seen on the
            previous run.
        FIGHT_LINKS_PICKLE_PATH: pickle holding ``{event_url: [fight_url, ...]}``.
        new_event_links: event URLs not present in the past-event pickle.
        all_event_links: every event URL found on the index page.
    """

    def __init__(
        self, all_events_url="http://ufcstats.com/statistics/events/completed?page=all"
    ):
        self.all_events_url = all_events_url
        self.PAST_EVENT_LINKS_PICKLE_PATH = PAST_EVENT_LINKS_PICKLE
        self.FIGHT_LINKS_PICKLE_PATH = FIGHT_LINKS_PICKLE
        self.new_event_links, self.all_event_links = self._get_updated_event_links()

    def _get_updated_event_links(self) -> Tuple[List[str], List[str]]:
        """Scrape the events index and diff it against the cached event list.

        Returns:
            ``(new_event_links, all_event_links)``. Both lists keep the order
            in which the links appear on the page; ``new_event_links`` contains
            each unseen link once.

        Side effects:
            Overwrites the past-event pickle with ``all_event_links``.
        """
        print("Getting all event URLs")
        soup = make_soup(self.all_events_url)

        # could pull title text too
        all_event_links = []
        for cell in soup.findAll("td", {"class": "b-statistics__table-col"}):
            for anchor in cell.findAll("a"):
                href = anchor.get("href")
                # Anchors without an href would otherwise be stored as None and
                # later handed to make_soup as a URL.
                if href:
                    all_event_links.append(href)

        if self.PAST_EVENT_LINKS_PICKLE_PATH.exists():
            # NOTE: pickle.load executes arbitrary code from the file; only
            # point this at cache files this notebook produced itself.
            with self.PAST_EVENT_LINKS_PICKLE_PATH.open("rb") as pickle_in:
                past_event_links = pickle.load(pickle_in)
        else:
            # if no past event links are present, set empty list
            past_event_links = []

        # New events are those absent from the cache. Walking the scraped list
        # (instead of taking a set difference) keeps the result order stable
        # between runs.
        already_seen = set(past_event_links)
        new_event_links = []
        for link in all_event_links:
            if link not in already_seen:
                already_seen.add(link)
                new_event_links.append(link)

        # NOTE(review): the cache is updated before any fight links for the new
        # events have been scraped. If get_fight_links() later fails, the next
        # run will no longer report those events as new.
        with self.PAST_EVENT_LINKS_PICKLE_PATH.open("wb") as f:
            pickle.dump(all_event_links, f)

        return new_event_links, all_event_links

    def _get_fight_links_from_events(
        self, event_links: List[str]
    ) -> Dict[str, List[str]]:
        """Fetch every event page and return ``{event_url: [fight_url, ...]}``."""
        fight_links: Dict[str, List[str]] = {}

        num_events = len(event_links)
        if num_events == 0:
            # Nothing to scrape; also avoids a zero-length progress bar.
            return fight_links

        print("Scraping fight links: ")
        print_progress(0, num_events, prefix="Progress:", suffix="Complete")

        row_class = (
            "b-fight-details__table-row b-fight-details__table-row__hover "
            "js-fight-details-click"
        )
        for index, link in enumerate(event_links):
            rows = make_soup(link).findAll("tr", {"class": row_class})
            # Rows lacking a data-link attribute are skipped rather than
            # recorded as None.
            fight_links[link] = [
                row.get("data-link") for row in rows if row.get("data-link")
            ]
            print_progress(
                index + 1, num_events, prefix="Progress:", suffix="Complete"
            )

        return fight_links

    def get_fight_links(
        self,
    ) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
        """Return fight URLs for new events and for all events.

        Returns:
            ``(new_fight_links, all_fight_links)``, each mapping an event URL
            to the list of its fight URLs. ``new_fight_links`` is empty when
            the fight-link pickle exists and there are no new events.

        Side effects:
            Creates or updates the fight-link pickle whenever pages are scraped.
        """
        new_fight_links: Dict[str, List[str]] = {}

        if self.FIGHT_LINKS_PICKLE_PATH.exists():
            print(
                f"Loading previous fight data URLs from {self.FIGHT_LINKS_PICKLE_PATH}"
            )
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load caches this notebook produced itself.
            with self.FIGHT_LINKS_PICKLE_PATH.open("rb") as pickle_in:
                prev_fight_links = pickle.load(pickle_in)

            if not self.new_event_links:
                print("No new event URLs.")
                # then prev events are all events
                all_fight_links = prev_fight_links
            else:
                print("Getting URLs to fights from new events.")
                new_fight_links = self._get_fight_links_from_events(
                    self.new_event_links
                )
                # New events come first; on a key clash the cached entry wins.
                all_fight_links = new_fight_links | prev_fight_links
                print(f"Updating {self.FIGHT_LINKS_PICKLE_PATH.as_posix()}")
                with self.FIGHT_LINKS_PICKLE_PATH.open("wb") as f:
                    pickle.dump(all_fight_links, f)
        else:
            # no event and fight link file exists
            print("No fight data URLs saved. Retrieving all URLs.")
            all_fight_links = self._get_fight_links_from_events(self.all_event_links)
            # all events are new events
            new_fight_links = all_fight_links
            print(
                f"Writing fight URLs to {self.FIGHT_LINKS_PICKLE_PATH.as_posix()}"
            )
            with self.FIGHT_LINKS_PICKLE_PATH.open("wb") as f:
                pickle.dump(all_fight_links, f)

        return new_fight_links, all_fight_links


In [59]:
# Instantiating UFCLinks scrapes the events index page right away and rewrites
# the past-event pickle (network and disk side effects).
ufc_links = UFCLinks()

Getting all event URLs


In [60]:
# Both results map an event URL to the list of fight URLs for that event.
new_fight_links, all_fight_links = (
    ufc_links.get_fight_links()
)

No URLs to fight data saved. Retrieving all URLs.
Scraping event and fight links: 
Progress: |██████████████████████████████████████████████████| 100.00% Complete
Writing fight and event URLs to D:/code/frankie_edgar_stan_zone/data/fight_links.pickle


In [67]:
# Number of fight URLs recorded for one event. The mapping returned by
# get_fight_links() is bound to `all_fight_links` in the cell above; the name
# previously used here is not defined anywhere in the notebook.
len(all_fight_links['http://ufcstats.com/event-details/39f68882def7a507'])

13

In [8]:
import os
import concurrent.futures
from typing import Dict, List

import pandas as pd
from bs4 import BeautifulSoup

# from src.createdata.scrape_fight_links import UFCLinks
# from src.createdata.utils import make_soup, print_progress

# from src.createdata.data_files_path import (  # isort:skip
#     NEW_EVENT_AND_FIGHTS,
#     TOTAL_EVENT_AND_FIGHTS,
# )


class FightDataScraper:
    """Scrape per-fight statistics and maintain a ';'-delimited CSV of them.

    Attributes:
        HEADER: header line of the CSV (semicolon separated, newline terminated).
        NEW_FIGHTS_DATA_PATH: temporary CSV holding only newly scraped fights.
        TOTAL_FIGHTS_DATA_PATH: CSV holding every fight scraped so far.
    """

    def __init__(self):
        self.HEADER: str = (
            "R_fighter;B_fighter;R_KD;B_KD;R_SIG_STR.;B_SIG_STR.\
;R_SIG_STR_pct;B_SIG_STR_pct;R_TOTAL_STR.;B_TOTAL_STR.;R_TD;B_TD;R_TD_pct\
;B_TD_pct;R_SUB_ATT;B_SUB_ATT;R_REV;B_REV;R_CTRL;B_CTRL;R_HEAD;B_HEAD;R_BODY\
;B_BODY;R_LEG;B_LEG;R_DISTANCE;B_DISTANCE;R_CLINCH;B_CLINCH;R_GROUND;B_GROUND\
;win_by;last_round;last_round_time;Format;Referee;date;location;Fight_type;Winner\n"
        )

        self.NEW_FIGHTS_DATA_PATH = NEW_FIGHTS_DATA_PATH
        self.TOTAL_FIGHTS_DATA_PATH = TOTAL_FIGHTS_DATA_PATH

    def create_fight_data_csv(self) -> None:
        """Bring the total fight-data CSV up to date.

        * No total CSV yet: scrape every known fight straight into it.
        * Total CSV exists and there are no new fight links: do nothing.
        * Total CSV exists and there are new fight links: scrape only those
          into the temporary CSV and merge it into the total CSV.
        """
        print("Scraping links!")

        ufc_links = UFCLinks()
        new_fight_links, all_fight_links = ufc_links.get_fight_links()
        print("Successfully scraped and saved fight links!\n")
        print("Now, scraping fight data!\n")

        if not self.TOTAL_FIGHTS_DATA_PATH.exists():
            # Without an existing CSV there is nothing to merge into, so build
            # it from all fights. This also covers the very first run, where
            # every fight link is reported as new.
            self._scrape_raw_fight_data(
                all_fight_links,
                filepath=self.TOTAL_FIGHTS_DATA_PATH,
            )
        elif not new_fight_links:
            # assume fight data up to date
            # this is not actually necessarily true
            # but good enough for now
            print(
                f"""No new fight data to scrape.
                        {self.TOTAL_FIGHTS_DATA_PATH} up to date."""
            )
            return None
        else:
            # scrape only fights from new events
            self._scrape_raw_fight_data(
                new_fight_links, filepath=self.NEW_FIGHTS_DATA_PATH
            )
            self._merge_new_fight_data()

        print("Successfully scraped and saved UFC fight data!")

    def _merge_new_fight_data(self) -> None:
        """Prepend the temporary new-fights CSV to the total CSV, then delete it."""
        # Both files are written by _scrape_raw_fight_data with ';' as the
        # delimiter, and fields such as the location contain commas. Reading
        # everything as plain strings (no NA conversion) keeps the round trip
        # through pandas lossless.
        read_options = {"sep": ";", "dtype": str, "keep_default_na": False}
        new_fights_data = pd.read_csv(self.NEW_FIGHTS_DATA_PATH, **read_options)
        old_fights_data = pd.read_csv(self.TOTAL_FIGHTS_DATA_PATH, **read_options)

        # verify same column count
        assert len(new_fights_data.columns) == len(old_fights_data.columns), (
            "new and existing fight data have different column counts"
        )

        # restricts new event cols to those with labels of old events/ensures same col order
        new_fights_data = new_fights_data[list(old_fights_data.columns)]

        # might be worth verifying integrity here
        # axis=0 stacks the rows; axis=1 would glue the two frames side by side.
        latest_total_fight_data = pd.concat(
            [new_fights_data, old_fights_data],
            axis=0,
            ignore_index=True,
        )

        latest_total_fight_data.to_csv(
            self.TOTAL_FIGHTS_DATA_PATH, sep=";", index=False
        )
        print(f"Updated {self.TOTAL_FIGHTS_DATA_PATH} with new fight data")
        os.remove(self.NEW_FIGHTS_DATA_PATH)
        print("Removed temporary files.")

    def _scrape_raw_fight_data(
        self, event_and_fight_links: Dict[str, List[str]], filepath
    ) -> None:
        """Scrape all fights in ``event_and_fight_links`` and write them to ``filepath``.

        Args:
            event_and_fight_links: mapping of event URL to its fight URLs.
            filepath: ``pathlib.Path`` of the CSV to (over)write.
        """
        if filepath.exists():
            print(f"File {filepath} already exists, overwriting.")

        total_stats = self._get_total_fight_stats(event_and_fight_links)
        with open(filepath.as_posix(), "wb") as file:
            # Characters outside ASCII are dropped when encoding.
            file.write(bytes(self.HEADER, encoding="ascii", errors="ignore"))
            file.write(bytes(total_stats, encoding="ascii", errors="ignore"))

    def _get_fight_stats_task(self, fight, event_info):
        """Scrape one fight page and return its CSV row, or "" on any error.

        Args:
            fight: URL of the fight-details page.
            event_info: pre-formatted ``date;location`` string of the event.
        """
        total_fight_stats = ""
        try:
            fight_soup = make_soup(fight)
            fight_stats = self._get_fight_stats(fight_soup)
            fight_details = self._get_fight_details(fight_soup)
            result_data = self._get_fight_result_data(fight_soup)
            total_fight_stats = (
                fight_stats + ";" + fight_details + ";" + event_info + ";" + result_data
            )
        except Exception as e:
            # Best effort: a fight that cannot be parsed is reported and skipped.
            print("Error getting fight stats, " + str(e))

        return total_fight_stats

    def _get_total_fight_stats(self, event_and_fight_links: Dict[str, List[str]]) -> str:
        """Scrape every fight and return the rows joined with newlines.

        Fights of one event are fetched concurrently, but rows are collected in
        the order the fights are listed so the output is deterministic.
        """
        fight_count = sum(len(fights) for fights in event_and_fight_links.values())
        print(f"Scraping data for {fight_count} fights: ")
        if fight_count == 0:
            return ""
        print_progress(0, fight_count, prefix="Progress:", suffix="Complete")

        rows = []
        completed = 0
        # One pool is shared across all events instead of being rebuilt per event.
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for event, fights in event_and_fight_links.items():
                if not fights:
                    continue
                event_soup = make_soup(event)
                event_info = self._get_event_info(event_soup)

                futures = [
                    executor.submit(
                        self._get_fight_stats_task,
                        fight=fight,
                        event_info=event_info,
                    )
                    for fight in fights
                ]
                # Waiting in submission order keeps the row order stable.
                for future in futures:
                    fight_stats = future.result()
                    if fight_stats != "":
                        rows.append(fight_stats)
                    completed += 1
                    print_progress(
                        completed, fight_count, prefix="Progress:", suffix="Complete"
                    )

        return "\n".join(rows)

    def _get_fight_stats(self, fight_soup: BeautifulSoup) -> str:
        """Return the ';'-joined stats from the first row of two ``<tbody>`` tables.

        Uses the first and third ``<tbody>`` of the page; raises ``IndexError``
        when the page has fewer than three.
        """
        tables = fight_soup.findAll("tbody")
        # hard coded to grab totals and significant strike stats.
        # skips per round stats
        # i think we want per round stats.
        total_fight_data = [tables[0], tables[2]]
        fight_stats = []
        for table in total_fight_data:
            row = table.find("tr")
            stats = ""
            for data in row.findAll("td"):
                if stats == "":
                    stats = data.text
                else:
                    stats = stats + "," + data.text
            fight_stats.append(
                stats.replace("  ", "")
                .replace("\n\n", "")
                .replace("\n", ",")
                .replace(", ", ",")
                .replace(" ,", ",")
            )

        # hardcoded here to ignore first 3 cols of significant strikes table
        # (six comma-separated fields: one value per fighter per column)
        fight_stats[1] = ";".join(fight_stats[1].split(",")[6:])
        fight_stats[0] = ";".join(fight_stats[0].split(","))
        fight_stats = ";".join(fight_stats)
        return fight_stats

    def _get_fight_details(self, fight_soup: BeautifulSoup) -> str:
        """Return the first five fight-detail fields joined with ';'.

        In the assembled row these fill the HEADER columns ``win_by``,
        ``last_round``, ``last_round_time``, ``Format`` and ``Referee``.
        """
        columns = ""
        for div in fight_soup.findAll("div", {"class": "b-fight-details__content"}):
            for col in div.findAll("p", {"class": "b-fight-details__text"}):
                if columns == "":
                    columns = col.text
                else:
                    columns = columns + "," + (col.text)

        # Collapse whitespace into field separators and strip the labels.
        columns = (
            columns.replace("  ", "")
            .replace("\n\n\n\n", ",")
            .replace("\n", "")
            .replace(", ", ",")
            .replace(" ,", ",")
            .replace("Method: ", "")
            .replace("Round:", "")
            .replace("Time:", "")
            .replace("Time format:", "")
            .replace("Referee:", "")
        )

        fight_details = ";".join(columns.split(",")[:5])

        return fight_details

    def _get_event_info(self, event_soup: BeautifulSoup) -> str:
        """Return the first two event list items (labels removed) joined with ';'.

        In the assembled row these fill the HEADER columns ``date`` and ``location``.
        """
        event_info = ""
        for info in event_soup.findAll("li", {"class": "b-list__box-list-item"}):
            if event_info == "":
                event_info = info.text
            else:
                event_info = event_info + ";" + info.text

        event_info = ";".join(
            event_info.replace("Date:", "")
            .replace("Location:", "")
            .replace("Attendance:", "")
            .replace("\n", "")
            .replace("  ", "")
            .split(";")[:2]
        )

        return event_info

    def _get_fight_result_data(self, fight_soup: BeautifulSoup) -> str:
        """Return ``"<fight type>;<winner>"``.

        The winner is the name in the person block carrying the green status
        marker; it stays empty when no block has one.
        """
        winner = ""
        for div in fight_soup.findAll("div", {"class": "b-fight-details__person"}):
            if (
                div.find(
                    "i",
                    {
                        "class": "b-fight-details__person-status b-fight-details__person-status_style_green"
                    },
                )
                is not None
            ):
                winner = (
                    div.find("h3", {"class": "b-fight-details__person-name"})
                    .text.replace(" \n", "")
                    .replace("\n", "")
                )

        fight_type = (
            fight_soup.find("i", {"class": "b-fight-details__fight-title"})
            .text.replace("  ", "")
            .replace("\n", "")
        )

        return fight_type + ";" + winner


In [9]:
# Fetch a single event page (network request) to try the scraper helpers on.
event_soup = make_soup('http://ufcstats.com/event-details/39f68882def7a507')


In [49]:
# The constructor only stores the CSV header and output paths.
scraper=FightDataScraper()

In [50]:
# The first two event list items joined with ";" (date;location in the row shown below).
event_info = scraper._get_event_info(event_soup)

In [51]:
# Scrape one fight end to end; the result is a single ";"-separated CSV row.
scraper._get_fight_stats_task(fight='http://ufcstats.com/fight-details/daef1691c7d6b1e4', event_info=event_info)

'Islam Makhachev;Renato Moicano;0;0;6 of 19;9 of 27;31%;33%;18 of 31;18 of 37;1 of 2;0 of 0;50%;---;1;0;0;0;1:27;0:00;5 of 17;2 of 18;1 of 2;3 of 5;0 of 0;4 of 4;5 of 18;9 of 27;0 of 0;0 of 0;1 of 1;0 of 0;Submission;1;4:05;5 Rnd (5-5-5-5-5);Herb Dean;January 18, 2025;Inglewood, California, USA;UFC Lightweight Title Bout;Islam Makhachev'

In [None]:
def _get_fight_stats(self, fight_soup: BeautifulSoup) -> str:
    """Build the ';'-separated stats string from two tables of a fight page.

    Module-level scratch copy of ``FightDataScraper._get_fight_stats``; ``self``
    is accepted but never used.

    Only the first row of the first and third ``<tbody>`` is read, so the
    per-round tables are skipped (per-round stats are probably wanted later).
    """
    tbodies = fight_soup.findAll("tbody")
    cleanup_steps = (
        ("  ", ""),
        ("\n\n", ""),
        ("\n", ","),
        (", ", ","),
        (" ,", ","),
    )

    cleaned_rows = []
    for tbody in (tbodies[0], tbodies[2]):
        joined = ""
        for cell in tbody.find("tr").findAll("td"):
            # While nothing has been collected yet the text is taken as-is;
            # afterwards cells are appended with a comma.
            joined = cell.text if joined == "" else joined + "," + cell.text
        for old, new in cleanup_steps:
            joined = joined.replace(old, new)
        cleaned_rows.append(joined)

    totals_part = ";".join(cleaned_rows[0].split(","))
    # Hardcoded: drop the first six fields (three columns) of the second table.
    strikes_part = ";".join(cleaned_rows[1].split(",")[6:])
    return ";".join([totals_part, strikes_part])

In [62]:
# Fetch one fight page (network request) for the table exploration below.
fight_soup = make_soup('http://ufcstats.com/fight-details/f46308108eb9261a')

In [72]:
# Alternative approach: let pandas parse the tables of the same fight page,
# using the first row of each as the header. This downloads the page again.
test=pd.read_html('http://ufcstats.com/fight-details/f46308108eb9261a', header=0)

In [73]:
# Fight totals: a single row whose cells hold both fighters' values.
test[0]

Unnamed: 0,Fighter,KD,Sig. str.,Sig. str. %,Total str.,Td,Td %,Sub. att,Rev.,Ctrl
0,Mackenzie Dern Amanda Ribas,0 0,27 of 46 17 of 56,58% 30%,82 of 107 99 of 151,1 of 3 2 of 2,33% 100%,1 0,2 0,4:34 5:47


In [74]:
# Per-round totals; the "Round N" label rows are parsed as data rows.
test[1]

Unnamed: 0,Fighter,KD,Sig. str.,Sig. str. %,Total str.,Td %,Td %.1,Sub. att,Rev.,Ctrl
0,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1
1,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2
2,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3
3,Mackenzie Dern Amanda Ribas,0 0,12 of 25 10 of 33,48% 30%,34 of 47 27 of 50,1 of 1 0 of 0,100% ---,0 0,0 0,2:36 0:00
4,Mackenzie Dern Amanda Ribas,0 0,3 of 3 3 of 6,100% 50%,24 of 25 37 of 49,0 of 1 1 of 1,0% 100%,0 0,1 0,0:19 3:31
5,Mackenzie Dern Amanda Ribas,0 0,12 of 18 4 of 17,66% 23%,24 of 35 35 of 52,0 of 1 1 of 1,0% 100%,1 0,1 0,1:39 2:16


In [75]:
# Significant-strike totals broken down by target and position.
test[2]

Unnamed: 0,Fighter,Sig. str,Sig. str. %,Head,Body,Leg,Distance,Clinch,Ground
0,Mackenzie Dern Amanda Ribas,27 of 46 17 of 56,58% 30%,13 of 27 10 of 44,3 of 8 1 of 4,11 of 11 6 of 8,18 of 34 14 of 51,1 of 3 0 of 0,8 of 9 3 of 5


In [76]:
# Per-round significant strikes; note the trailing empty "Unnamed: 9" column.
test[3]

Unnamed: 0,Fighter,Sig. str,Sig. str. %,Head,Body,Leg,Distance,Clinch,Ground,Unnamed: 9
0,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1,Round 1
1,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2,Round 2
2,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3,Round 3
3,Mackenzie Dern Amanda Ribas,12 of 25 10 of 33,48% 30%,4 of 15 6 of 26,2 of 4 1 of 3,6 of 6 3 of 4,12 of 25 9 of 32,0 of 0 0 of 0,0 of 0 1 of 1,
4,Mackenzie Dern Amanda Ribas,3 of 3 3 of 6,100% 50%,1 of 1 2 of 5,0 of 0 0 of 0,2 of 2 1 of 1,2 of 2 1 of 4,0 of 0 0 of 0,1 of 1 2 of 2,
5,Mackenzie Dern Amanda Ribas,12 of 18 4 of 17,66% 23%,8 of 11 2 of 13,1 of 4 0 of 1,3 of 3 2 of 3,4 of 7 4 of 15,1 of 3 0 of 0,7 of 8 0 of 2,


In [100]:
# Fourth <table> element of the page (calling a soup object searches it by tag
# name). Presumably the same table as test[3] above — TODO confirm.
t3=fight_soup('table')[3]

In [111]:
# Print the stripped text of every header cell inside the table's <thead>.
for col in t3.thead('th'):
    print(col.get_text(strip=True))

Fighter
Sig. str
Sig. str. %
Head
Body
Leg
Distance
Clinch
Ground


In [109]:
# Attribute access returns only the first matching tag, i.e. the first header cell.
t3.thead.th.get_text(strip=True)

'Fighter'

In [None]:
def _get_table_headers(table):
    # given an HTML table as a soup object, extract (first?) header row as list.
    return None