In [None]:
import concurrent.futures
from io import StringIO
import os
from pathlib import Path
import pickle
from typing import Dict, List, Tuple
import sys
import requests

from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Directory that holds every scraped/derived data file.
# Set the UFC_DATA_DIR environment variable to relocate it on another machine;
# when unset (or empty) the original hard-coded location is used.
BASE_PATH = Path(
    os.environ.get("UFC_DATA_DIR")
    or Path("D:/code/frankie_edgar_stan_zone") / "data"
)

# Pickled link caches.
FIGHT_LINKS_PICKLE = BASE_PATH / "fight_links.pickle"
PAST_EVENT_LINKS_PICKLE = BASE_PATH / "past_event_links.pickle"
PAST_FIGHTER_LINKS_PICKLE = BASE_PATH / "past_fighter_links.pickle"
SCRAPED_FIGHTER_DATA_DICT_PICKLE = BASE_PATH / "scraped_fighter_data_dict.pickle"

# CSV data files.
NEW_FIGHTS_DATA_PATH = BASE_PATH / "new_fight_data.csv"
TOTAL_FIGHTS_DATA_PATH = BASE_PATH / "raw_total_fight_data.csv"
PREPROCESSED_DATA_PATH = BASE_PATH / "preprocessed_data.csv"
FIGHTER_DETAILS_DATA_PATH = BASE_PATH / "raw_fighter_details.csv"
UFC_DATA_PATH = BASE_PATH / "data.csv"
EVENT_DATA_PATH = BASE_PATH / "event_data.csv"


In [10]:
# Parameters for data schema and column names go here.
# Not needed if inferring column names from headers

# some ufc table headers have wrong names in source code (TD is often labelled TD% internally)
# which busts pd.read_html because it sees two TD% columns.
# praying for consistent schema and using these labels instead trying
# to parse headers will be faster and simpler than writing parsing code
# ...if it works
# _________
# FIGHT DETAILS columns as listed on UFC stats website
# e.g. http://ufcstats.com/fight-details/eaa885cf7ae31e0b
# Positional labels for the "totals" table of a fight-details page.
# The submission-attempt column sits between TD% and REV (the scraper HEADER
# and gen_stat_cols both carry SUB_ATT); leaving it out would shift REV and
# CTRL one column to the left when these labels are applied by position.
web_fight_cols = [
    "FIGHTER",
    "KD",
    "SIG STR",
    "SIG STR%",
    "TOTAL STR",
    "TD",
    "TD%",
    "SUB ATT",
    "REV",
    "CTRL",
]

# Positional labels for the "significant strikes" table of a fight-details page.
web_strike_cols = [
    "FIGHTER",
    "SIG STR",
    "SIG STR%",
    "HEAD",
    "BODY",
    "LEG",
    "DISTANCE",
    "CLINCH",
    "GROUND",
]
# _________

# Columns of the event table saved locally (EVENT_DATA_PATH).
# UFCLinks._scrape_all_events fills them positionally, in this order.
event_cols = [
    "ID",  # last path segment of the event link; becomes the DataFrame index
    "TITLE",  # event name, upper-cased
    "DATE",  # scraped as text, then parsed with "%B %d, %Y"
    "LOCATION",  # text of the last <td> in the event row, upper-cased
    "LINK",  # URL of the event-details page
    "FIGHT_LINKS_SCRAPED",  # set True once the event appears in the fight-link dict
    "FIGHT_DATA_SCRAPED",  # initialised False; NOTE(review): nothing visible here sets it
]

# column labels for processed fight data
# each fighter (R: red, B: blue)
# gets total stats (_TOT suffix)
# and per-round stats (_R# suffix)
# everyone gets stats for 5 rounds to keep the schema consistent
# doing this programmatically to write less
# these will have dependencies (totals = sum of rounds, percents, etc.)
# that you might want to remove before shoving into an ML model
# but making columns for everything for granularity/readability

# Per-fight columns that are not split by corner/round.
# Corner labels follow the R (red) / B (blue) convention used everywhere else
# in this notebook, so the second fighter is B_*, not L_*.
shared_cols = [
    "FIGHT_ID",
    "FIGHT_LINK",
    "TITLE_FIGHT",  # true/false
    "R_FIGHTER",
    "R_FIGHTER_ID",
    "B_FIGHTER",
    "B_FIGHTER_ID",
    "WINNER",
    "METHOD",
    "WIN_RND",
    "WIN_TIME",
    "FORMAT",  # 5 round or 3 round
    "DETAILS",  # could be judge scores or more details on finish, needs processing
    "REFEREE",
    "EVENT_ID",
    "EVENT_TITLE",
    "EVENT_DATE",
    "EVENT_LOC",
    "EVENT_BOUT_NUM",  # 1 = headliner, 2 = co-headliner, etc.
]
# columns without percents
# aka stuff that doesn't need ATT, LND, and PCT suffixes
# these could have a better variable name
# Plain counters: one value per fighter/round, no landed/attempted/percent split.
gen_stat_cols = [
    "KD",
    "SUB_ATT",
    "REV",
    "CTRL_TIME",
]

# Stats that get LND/ATT/PCT variants.
# The target/position breakdowns (head, body, leg, distance, clinch, ground)
# exist for significant strikes only, so the SIG_STR prefix is left off them
# for readability.
pct_stat_cols = ["SIG_STR", "ALL_STR", "TD"] + [
    f"{breakdown}_STR"
    for breakdown in ("HEAD", "BODY", "LEG", "DISTANCE", "CLINCH", "GROUND")
]


In [5]:
def make_soup(url: str, timeout: float = 30) -> BeautifulSoup:
    """Fetch ``url`` and parse the response body with ``html.parser``.

    Args:
        url: page to download. Redirects are not followed.
        timeout: seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a single stalled connection would block
            the scrape indefinitely.

    Returns:
        The parsed document. Non-ASCII characters are replaced with ``?``
        before parsing.
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Force plain ASCII so downstream file writes never hit encoding errors.
    plain_text = response.text.encode("ascii", "replace")
    return BeautifulSoup(plain_text, "html.parser")

In [None]:
# helper functions for adding prefixes/suffixes to dictionary labels.
# lots of stats can be scraped via same routine iterated over some html object
# but need different prefixes/suffixes (e.g. R or B for red or blue fighter, or R1-R5 for round)
def add_prefix_label(old_dict: dict[str, str], prefix: str) -> dict[str, str]:
    """Return a copy of ``old_dict`` whose keys are rewritten as ``<prefix>_<key>``.

    Values and ordering are kept; the input dict is not modified.
    """
    return {f"{prefix}_{key}": value for key, value in old_dict.items()}

def add_suffix_label(old_dict: dict[str, str], suffix: str) -> dict[str, str]:
    """Return a copy of ``old_dict`` whose keys are rewritten as ``<key>_<suffix>``.

    Values and ordering are kept; the input dict is not modified.
    """
    return {f"{key}_{suffix}": value for key, value in old_dict.items()}

In [6]:
def print_progress(
    iteration: int,
    total: int,
    prefix: str = "",
    suffix: str = "",
    decimals: int = 1,
    bar_length: int = 50,
) -> None:
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        bar_length  - Optional  : character length of bar (Int)

    A ``total`` of 0 is drawn as a full (100%) bar instead of dividing by zero.
    """
    if total:
        fraction = iteration / float(total)
        filled_length = int(round(bar_length * iteration / float(total)))
    else:
        # Nothing to iterate over: treat the (empty) job as already complete.
        fraction = 1.0
        filled_length = bar_length

    # Honour the documented ``decimals`` argument instead of a fixed precision.
    percents = f"{100 * fraction:.{decimals}f}"
    bar = f'{"█" * filled_length}{"-" * (bar_length - filled_length)}'

    # "\r" rewinds to the start of the line so each call redraws the same bar.
    sys.stdout.write(f"\r{prefix} |{bar}| {percents}% {suffix}")

    if iteration == total:
        sys.stdout.write("\n")
    sys.stdout.flush()


In [7]:
class UFCLinks:
    """Keeps the local event table and the event -> fight-links mapping current.

    Constructing an instance scrapes the completed-events listing, reconciles
    it with the event CSV at ``EVENT_DATA_PATH`` (creating the file if needed)
    and loads any previously pickled fight links. ``get_fight_links`` then
    scrapes fight links for events not yet flagged ``FIGHT_LINKS_SCRAPED``.

    Attributes:
        all_events_url: URL of the completed-events listing.
        EVENT_DATA_PATH: path of the ``;``-separated event CSV.
        EVENT_DATA: event DataFrame indexed by event ID.
        FIGHT_LINKS_PICKLE_PATH: path of the pickled fight-link dict.
        FIGHT_LINKS: ``{event link: [fight links]}`` or ``None`` if no pickle
            has been loaded or built yet.
    """

    def __init__(
        self, all_events_url="http://ufcstats.com/statistics/events/completed?page=all"
    ):
        self.all_events_url = all_events_url
        self.EVENT_DATA_PATH = EVENT_DATA_PATH
        self.EVENT_DATA = None
        self.FIGHT_LINKS_PICKLE_PATH = FIGHT_LINKS_PICKLE
        self.FIGHT_LINKS = None
        self._initiate_class()

    def _scrape_all_events(self) -> pd.DataFrame:
        """Scrape the events listing into a DataFrame indexed by event ID.

        Rows are built directly as Python values rather than being joined into
        ``;``-separated text and re-parsed, so a ``;`` or quote character in a
        title or location cannot corrupt the table.
        """
        soup = make_soup(self.all_events_url)
        rows = []
        for row in soup.tbody.find_all("tr", {"class": "b-statistics__table-row"}):
            # A blank row exists at the top of the table: its text is just
            # whitespace/newlines.
            if row.text.strip() == "":
                continue

            link_elt = row.find("a")
            event_title = link_elt.text.strip().upper()
            event_link = link_elt.get("href")
            event_id = event_link.split("/")[-1]

            event_date = (
                row.find("span", {"class": "b-statistics__date"}).text.strip().upper()
            )

            # Taking for granted that the location is the last td in the row.
            event_location = row.find_all("td")[-1].text.strip().upper()

            # Positional order must match event_cols; both flags start False.
            rows.append(
                [
                    event_id,
                    event_title,
                    event_date,
                    event_location,
                    event_link,
                    False,
                    False,
                ]
            )

        event_df = pd.DataFrame(rows, columns=event_cols)
        # reformat datetimes
        event_df["DATE"] = pd.to_datetime(event_df["DATE"], format="%B %d, %Y")
        # change ID to index
        return event_df.set_index("ID")

    def _write_event_data(self, df):
        """Write ``df`` to ``EVENT_DATA_PATH`` (``;``-separated) and return it."""
        df.to_csv(self.EVENT_DATA_PATH, sep=";")
        return df

    def _initiate_class(self):
        """Load/refresh event data and load cached fight links.

        Returns:
            The event DataFrame that was stored in ``self.EVENT_DATA``.
        """
        # get latest event data from web
        print(f"Pulling event data from {self.all_events_url}")
        web_event_df = self._scrape_all_events()

        if not self.EVENT_DATA_PATH.exists():
            # No local file yet: the web data becomes the local data as-is.
            print(
                f"No existing event data, writing web data locally to {self.EVENT_DATA_PATH}"
            )
            self._write_event_data(web_event_df)
            event_df = web_event_df
        else:
            # Local file exists: only events whose ID is missing locally are new.
            print(f"Reading local event data from {self.EVENT_DATA_PATH}")
            local_event_df = pd.read_csv(
                self.EVENT_DATA_PATH, sep=";", parse_dates=["DATE"], index_col="ID"
            )

            # A boolean mask keeps the web page's row order for the new events
            # (Index.difference would re-sort them by ID hash).
            is_new = ~web_event_df.index.isin(local_event_df.index)
            new_events_df = web_event_df.loc[is_new]

            if not new_events_df.empty:
                # Put new events at the top and overwrite the whole file; it
                # is small enough that rewriting is simpler than appending.
                print(f"{len(new_events_df)} new event/s. Updating local event data.")
                event_df = pd.concat([new_events_df, local_event_df])
                self._write_event_data(event_df)
            else:
                print("No new events, local data up to date")
                event_df = local_event_df

        self.EVENT_DATA = event_df

        # load fight links if they already exist.
        if self.FIGHT_LINKS_PICKLE_PATH.exists():
            print(f"Loading local fight links from {self.FIGHT_LINKS_PICKLE_PATH}")
            # NOTE(security): pickle.load executes arbitrary code from the
            # file. Only load a pickle this notebook wrote itself.
            with open(self.FIGHT_LINKS_PICKLE_PATH, "rb") as pickle_file:
                self.FIGHT_LINKS = pickle.load(pickle_file)

        return event_df

    def _make_link_dict(self, event_links: list[str]) -> dict[str, list[str]]:
        """Scrape the fight links of every event page in ``event_links``.

        Args:
            event_links: iterable of event-details URLs (a list or a Series).

        Returns:
            ``{event link: [fight-details links found on that page]}``.
        """
        num_events = len(event_links)
        event_fight_dict = {}
        print(f"Scraping fight links from {num_events} events: ")
        if num_events == 0:
            # Nothing to do; also avoids a 0/0 in the progress bar.
            return event_fight_dict

        print_progress(0, num_events, prefix="Progress:", suffix="Complete")
        for index, link in enumerate(event_links):
            soup = make_soup(link)
            fight_rows = soup.find_all(
                "tr",
                {
                    "class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"
                },
            )
            # Skip rows without a data-link attribute rather than storing None.
            event_fight_dict[link] = [
                row.get("data-link") for row in fight_rows if row.get("data-link")
            ]

            print_progress(index + 1, num_events, prefix="Progress:", suffix="Complete")

        return event_fight_dict

    def _write_fight_links(self):
        """Pickle ``self.FIGHT_LINKS`` to ``FIGHT_LINKS_PICKLE_PATH``."""
        with open(self.FIGHT_LINKS_PICKLE_PATH, "wb") as f:
            pickle.dump(self.FIGHT_LINKS, f)

    def _initiate_fight_links(self):
        """Scrape fight links for every event in the event table."""
        return self._make_link_dict(self.EVENT_DATA["LINK"])

    def _get_unscraped_fight_links(self):
        """Scrape fight links only for events not flagged FIGHT_LINKS_SCRAPED."""
        event_df = self.EVENT_DATA
        links_to_scrape = event_df.loc[~event_df["FIGHT_LINKS_SCRAPED"], "LINK"]
        if links_to_scrape.empty:
            print("No new event links to scrape.")
            return {}
        return self._make_link_dict(links_to_scrape)

    def _update_event_fight_link_scraped_status(self):
        """Flag every event present in ``FIGHT_LINKS`` as scraped and save.

        An event counts as scraped if its link is a key of the fight-link
        dict. This is not airtight (an event whose page yielded no rows is
        flagged too), but good enough for now.

        Returns:
            The updated event DataFrame (also stored and written to disk).
        """
        event_df = self.EVENT_DATA
        scraped_ids = [link.split("/")[-1] for link in self.FIGHT_LINKS]
        # .loc assignment raises KeyError on labels missing from the index, so
        # ignore cached links whose event is absent from the event table.
        known_ids = [event_id for event_id in scraped_ids if event_id in event_df.index]
        if known_ids:
            event_df.loc[known_ids, "FIGHT_LINKS_SCRAPED"] = True
        self.EVENT_DATA = event_df
        self._write_event_data(event_df)
        return event_df

    def get_fight_links(self, force_refresh=False):
        """Return ``{event link: [fight links]}`` for all known events.

        Args:
            force_refresh: when True, re-scrape every event regardless of its
                FIGHT_LINKS_SCRAPED flag. A full scrape also happens when the
                fight-link pickle does not exist. Otherwise only unflagged
                events are scraped and merged into the cached links.

        Returns:
            The complete fight-link dict, which is also pickled to disk.
        """
        if force_refresh or not self.FIGHT_LINKS_PICKLE_PATH.exists():
            print(f"Scraping all fight links to {self.FIGHT_LINKS_PICKLE_PATH}")
            fight_link_dict = self._initiate_fight_links()
        else:
            print("Checking for new events to scrape")
            new_fight_links = self._get_unscraped_fight_links()
            # FIGHT_LINKS is None if the pickle appeared after construction.
            fight_link_dict = dict(self.FIGHT_LINKS or {})
            fight_link_dict.update(new_fight_links)

        self.FIGHT_LINKS = fight_link_dict
        self._update_event_fight_link_scraped_status()
        self._write_fight_links()

        return fight_link_dict

In [None]:
ufc_links = UFCLinks()
# Bind the mapping instead of echoing the whole event -> links dict as cell
# output; the event count is enough to confirm the refresh worked.
fight_links = ufc_links.get_fight_links()
len(fight_links)

Pulling event data from http://ufcstats.com/statistics/events/completed?page=all
Reading local event data from D:\code\frankie_edgar_stan_zone\data\event_data.csv
4 new event/s. Updating local event data.
Loading local fight links from D:\code\frankie_edgar_stan_zone\data\fight_links.pickle


In [None]:
# not sure if i'm actually using this

def append_col_prefix_suffix(
    fighter_prefix=("B", "R"),
    pct_suffix=("LND", "ATT", "PCT"),
    rnd_suffix=("TOT", "R1", "R2", "R3", "R4", "R5"),
    shared_cols=shared_cols,
    gen_cols=gen_stat_cols,
    pct_cols=pct_stat_cols,
):
    """Build the full list of fight-data column labels.

    The result is ``shared_cols`` followed by:
      * ``<corner>_<stat>_<round>`` for every stat in ``gen_cols``
      * ``<corner>_<stat>_<kind>_<round>`` for every stat in ``pct_cols``,
        where ``kind`` runs over ``pct_suffix`` fastest.
    """
    general_labels = []
    for corner in fighter_prefix:
        for stat in gen_cols:
            for rnd in rnd_suffix:
                general_labels.append(f"{corner}_{stat}_{rnd}")

    percent_labels = []
    for corner in fighter_prefix:
        for stat in pct_cols:
            for rnd in rnd_suffix:
                for kind in pct_suffix:
                    percent_labels.append(f"{corner}_{stat}_{kind}_{rnd}")

    return shared_cols + general_labels + percent_labels


fight_cols = append_col_prefix_suffix()


In [None]:
# original code for reference

# from src.createdata.scrape_fight_links import UFCLinks
# from src.createdata.utils import make_soup, print_progress

# from src.createdata.data_files_path import (  # isort:skip
#     NEW_EVENT_AND_FIGHTS,
#     TOTAL_EVENT_AND_FIGHTS,
# )


# assuming red corner is always listed first.
class FightDataScraper:
    """Scrapes per-fight totals from ufcstats.com into ``;``-separated CSV files.

    Assumes the red corner is always listed first on a fight page.

    Attributes:
        HEADER: ``;``-separated header line written at the top of each file.
        NEW_FIGHTS_DATA_PATH: temporary file for fights of newly found events.
        TOTAL_FIGHTS_DATA_PATH: cumulative fight data file.
    """

    def __init__(self):
        # Field order mirrors the row assembled in _get_fight_stats_task:
        # fight stats ; fight details ; event info ; result data.
        # event_id/event_title are listed because _get_event_info emits
        # id;title;date;location -- without them every row would be two
        # fields wider than the header.
        self.HEADER: str = (
            "R_fighter;B_fighter;R_KD;B_KD;R_SIG_STR.;B_SIG_STR."
            ";R_SIG_STR_pct;B_SIG_STR_pct;R_TOTAL_STR.;B_TOTAL_STR.;R_TD;B_TD;R_TD_pct"
            ";B_TD_pct;R_SUB_ATT;B_SUB_ATT;R_REV;B_REV;R_CTRL;B_CTRL;R_HEAD;B_HEAD;R_BODY"
            ";B_BODY;R_LEG;B_LEG;R_DISTANCE;B_DISTANCE;R_CLINCH;B_CLINCH;R_GROUND;B_GROUND"
            ";win_by;last_round;last_round_time;Format;Referee"
            ";event_id;event_title;date;location;Fight_type;Winner\n"
        )

        self.NEW_FIGHTS_DATA_PATH = NEW_FIGHTS_DATA_PATH
        self.TOTAL_FIGHTS_DATA_PATH = TOTAL_FIGHTS_DATA_PATH

    def create_fight_data_csv(self) -> None:
        """Create or update the cumulative fight data file.

        * No cumulative file yet: scrape every known fight into it.
        * File exists and no new events: nothing to do.
        * File exists and new events were found: scrape only those and put
          them on top of the existing rows.
        """
        print("Scraping links!")

        ufc_links = UFCLinks()
        # Events already cached before this refresh; anything else is new.
        known_events = set(ufc_links.FIGHT_LINKS or {})
        link_result = ufc_links.get_fight_links()
        if isinstance(link_result, tuple):
            # Accept a (new, all) pair in case the link scraper provides one.
            new_fight_links, all_fight_links = link_result
        else:
            all_fight_links = link_result
            new_fight_links = {
                event: fights
                for event, fights in all_fight_links.items()
                if event not in known_events
            }
        print("Successfully scraped and saved fight links!\n")
        print("Now, scraping fight data!\n")

        if not self.TOTAL_FIGHTS_DATA_PATH.exists():
            # No data file at all: build it from every known fight.
            self._scrape_raw_fight_data(
                all_fight_links,
                filepath=self.TOTAL_FIGHTS_DATA_PATH,
            )
        elif not new_fight_links:
            # Assume fight data is up to date. Not necessarily true (links may
            # have been cached without their data being scraped), but good
            # enough for now.
            print(
                f"""No new fight data to scrape.
                        {self.TOTAL_FIGHTS_DATA_PATH} up to date."""
            )
            return None
        else:
            # scrape only fights from new events
            self._scrape_raw_fight_data(
                new_fight_links, filepath=self.NEW_FIGHTS_DATA_PATH
            )
            self._merge_new_fight_data()

        print("Successfully scraped and saved UFC fight data!")

    def _merge_new_fight_data(self) -> None:
        """Stack the rows of the new-fights file on top of the cumulative file."""
        # Both files are ';'-separated (see HEADER), so say so explicitly.
        new_fights_data = pd.read_csv(self.NEW_FIGHTS_DATA_PATH, sep=";")
        old_fights_data = pd.read_csv(self.TOTAL_FIGHTS_DATA_PATH, sep=";")

        # verify same column count
        assert len(new_fights_data.columns) == len(
            old_fights_data.columns
        ), "new and existing fight data have different column counts"

        # Restrict/reorder new columns to the labels of the existing file.
        new_fights_data = new_fights_data[list(old_fights_data.columns)]

        # axis=0 appends rows; axis=1 would glue the frames side by side.
        latest_total_fight_data = pd.concat(
            [new_fights_data, old_fights_data],
            axis=0,
            ignore_index=True,
        )

        latest_total_fight_data.to_csv(
            self.TOTAL_FIGHTS_DATA_PATH, sep=";", index=False
        )
        print(f"Updated {self.TOTAL_FIGHTS_DATA_PATH} with new fight data")
        os.remove(self.NEW_FIGHTS_DATA_PATH)
        print("Removed temporary files.")

    def _scrape_raw_fight_data(
        self, event_and_fight_links: Dict[str, List[str]], filepath
    ):
        """Scrape all fights in ``event_and_fight_links`` and write them to ``filepath``.

        The file is overwritten; it always starts with ``HEADER``.
        """
        if filepath.exists():
            print(f"File {filepath} already exists, overwriting.")

        total_stats = self._get_total_fight_stats(event_and_fight_links)
        with open(filepath.as_posix(), "wb") as file:
            file.write(bytes(self.HEADER, encoding="ascii", errors="ignore"))
            file.write(bytes(total_stats, encoding="ascii", errors="ignore"))

    def _get_fight_stats_task(self, fight, event_info):
        """Build one ``;``-separated row for the fight at URL ``fight``.

        Best effort: any error is printed and an empty string is returned so a
        single bad page does not abort the whole scrape.
        """
        total_fight_stats = ""
        try:
            fight_soup = make_soup(fight)
            fight_stats = self._get_fight_stats(fight_soup)
            fight_details = self._get_fight_details(fight_soup)
            result_data = self._get_fight_result_data(fight_soup)
            total_fight_stats = (
                fight_stats + ";" + fight_details + ";" + event_info + ";" + result_data
            )
        except Exception as e:
            print("Error getting fight stats, " + str(e))

        return total_fight_stats

    def _get_total_fight_stats(self, fight_links: Dict[str, List[str]]) -> str:
        """Scrape every fight of every event and return the rows joined by newlines.

        Args:
            fight_links: ``{event link: [fight links]}``.

        Returns:
            Newline-separated rows (no trailing newline); ``""`` when there is
            nothing to scrape or every fight failed.
        """
        event_count = len(fight_links)
        fight_count = sum(len(fights) for fights in fight_links.values())
        print(f"Scraping data for {fight_count} fights: ")
        if event_count == 0:
            # Nothing to scrape; also avoids a 0/0 in the progress bar.
            return ""

        print_progress(0, event_count, prefix="Progress:", suffix="Complete")

        rows = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for index, (event, fights) in enumerate(fight_links.items()):
                # _get_event_info takes the event link and fetches the page itself.
                event_info = self._get_event_info(event)

                # Get data for each fight in the event in parallel.
                futures = [
                    executor.submit(
                        self._get_fight_stats_task,
                        fight=fight,
                        event_info=event_info,
                    )
                    for fight in fights
                ]
                for future in concurrent.futures.as_completed(futures):
                    fight_stats = future.result()
                    if fight_stats != "":
                        rows.append(fight_stats)

                # Progress is tracked per event, once all its fights are done.
                print_progress(
                    index + 1, event_count, prefix="Progress:", suffix="Complete"
                )

        return "\n".join(rows)

    def _get_fight_stats(self, fight_soup: "BeautifulSoup") -> str:
        """Extract the totals and significant-strike rows as one ``;``-joined string."""
        tables = fight_soup.find_all("tbody")
        # Hard coded to grab the totals (index 0) and significant strikes
        # (index 2) tables; per-round tables are skipped.
        total_fight_data = [tables[0], tables[2]]
        fight_stats = []
        for table in total_fight_data:
            row = table.find("tr")
            stats = ",".join(data.text for data in row.find_all("td"))
            # Collapse the whitespace/newline layout of the cells into commas.
            fight_stats.append(
                stats.replace("  ", "")
                .replace("\n\n", "")
                .replace("\n", ",")
                .replace(", ", ",")
                .replace(" ,", ",")
            )

        # Hardcoded to ignore the first 3 columns (x2 fighters) of the
        # significant strikes table; they repeat the totals table.
        fight_stats[1] = ";".join(fight_stats[1].split(",")[6:])
        fight_stats[0] = ";".join(fight_stats[0].split(","))
        return ";".join(fight_stats)

    def _get_fight_details(self, fight_soup: "BeautifulSoup") -> str:
        """Extract method, round, time, format and referee as ``;``-joined text."""
        texts = []
        for div in fight_soup.find_all("div", {"class": "b-fight-details__content"}):
            for col in div.find_all("p", {"class": "b-fight-details__text"}):
                texts.append(col.text)

        columns = (
            ",".join(texts)
            .replace("  ", "")
            .replace("\n\n\n\n", ",")
            .replace("\n", "")
            .replace(", ", ",")
            .replace(" ,", ",")
            .replace("Method: ", "")
            .replace("Round:", "")
            .replace("Time:", "")
            .replace("Time format:", "")
            .replace("Referee:", "")
        )

        return ";".join(columns.split(",")[:5])

    def _get_event_info(self, event_link: str) -> str:
        """Fetch an event page and return ``id;title;date;location``.

        Args:
            event_link: URL of the event-details page; its last path segment
                is used as the event id.
        """
        event_id = event_link.split("/")[-1]

        event_soup = make_soup(event_link)
        event_title = event_soup.find("h2", {"class": "b-content__title"}).text.strip()
        # Take whatever follows the colon of each list item, stripped and
        # upper-cased. Expected to be date and location, in that order.
        event_attr = [
            attr.text.split(":")[-1].strip().upper()
            for attr in event_soup.find_all("li", {"class": "b-list__box-list-item"})
        ]
        # Pad/truncate to exactly two fields so the row width matches HEADER.
        event_attr = (event_attr + ["", ""])[:2]

        return ";".join([event_id, event_title] + event_attr)

    def _get_fight_result_data(self, fight_soup: "BeautifulSoup") -> str:
        """Return ``fight_type;winner`` (winner is empty if nobody is marked green)."""
        winner = ""
        for div in fight_soup.find_all("div", {"class": "b-fight-details__person"}):
            won = div.find(
                "i",
                {
                    "class": "b-fight-details__person-status b-fight-details__person-status_style_green"
                },
            )
            if won is not None:
                winner = (
                    div.find("h3", {"class": "b-fight-details__person-name"})
                    .text.replace(" \n", "")
                    .replace("\n", "")
                )

        fight_type = (
            fight_soup.find("i", {"class": "b-fight-details__fight-title"})
            .text.replace("  ", "")
            .replace("\n", "")
        )

        return fight_type + ";" + winner


In [None]:
# parsing fighter names/results here

# given single "b-fight-details__person" element, get name, link and result.
def _get_fighter(fighter_raw: BeautifulSoup) -> dict:
    """Pull name, id, profile link and result out of one fighter element.

    ``fighter_raw`` is a single "b-fight-details__person" element. The first
    anchor supplies the name and profile link, the first ``<i>`` the result
    text; the id is the last path segment of the link.
    """
    anchor = fighter_raw.a
    profile_link = anchor.get('href')

    return {
        "FIGHTER": anchor.text.strip().upper(),
        "FIGHTER_ID": profile_link.split("/")[-1],
        "FIGHTER_LINK": profile_link,
        "FIGHTER_RESULT": fighter_raw.i.text.strip().upper(),
    }

def _get_fighters(fight_soup: BeautifulSoup) -> dict:
    """Return both fighters' details in one dict, keyed with R_ / B_ prefixes.

    Expects exactly two "b-fight-details__person" elements on the page, red
    corner first; any other count makes the unpacking below raise.
    """
    person_divs = fight_soup.find_all('div', {"class": "b-fight-details__person"})
    red_raw, blue_raw = person_divs

    red = add_prefix_label(_get_fighter(red_raw), "R")
    blue = add_prefix_label(_get_fighter(blue_raw), "B")

    return red | blue

In [None]:
# scraping fight attributes (everything in the non-tabular box) and all associated routines HERE

# fight name might say HEAVYWEIGHT BOUT, or UFC TITLE HEAVYWEIGHT BOUT
# helper function picks out the word with 'weight' in it
# expects string to already be stripped/capitalized/separated by spaces
def _parse_weightclass(fight_name):
    for word in fight_name.split(" "):
        if "WEIGHT" in word:
            return word
        else:
            continue
    # no word with "weight" in fight name
    return "WEIGHTCLASS PARSING ERROR"

# a couple of cases here because of changes in UFC methodology that i'm merging together.
# the current UFC awards FOTN and performance bonuses for best finishes.
# it used to award KOTN and SOTN specifically, and the icons indicating these are still
# in the data. i'm gonna record them all as performance bonuses.
# FOTN is still distinguished separately.
# all of these are just marked by embedded images, so this check is super hardcoded.
def _is_perf_bonus(attr_soup: BeautifulSoup) -> bool:
    """Tell whether the fight header carries one of the bonus icons.

    Looks at the ``src`` of every image inside the first ``<i>`` of
    ``attr_soup`` and reports True as soon as one contains a known bonus
    icon file name.
    """
    bonus_icons = ("ko.png", "perf.png", "fight.png", "sub.png")
    image_sources = (img.get('src') for img in attr_soup.i.find_all("img"))
    return any(
        any(icon in src for icon in bonus_icons) for src in image_sources
    )

    # this function might be useful other places, might generalize
def _parse_attr(p_soup:BeautifulSoup) -> dict:
    # each top level i tag in this p block is one attr
    attr_dict = {}
    for i_raw in p_soup.findAll('i', recursive=False):
        # smash (like khamzat) together, then split at :
        attr= " ".join(i_raw.stripped_strings).upper().split(': ')
        attr_lbl = attr[0]
        attr_txt = attr[1]
        # print(attr)
        # print(attr_lbl)
        # print(attr_txt)
        attr_dict[attr_lbl] = attr_txt


    return attr_dict

# given "b-fight-details__fight" html soup,
# parses content ("b-fight-details__content") (method, round, etc)
def _get_attr_content(attr_soup: BeautifulSoup) -> Dict:
    """Parse the "b-fight-details__content" box of a fight header.

    Assumes the box holds exactly two ``<p>`` tags: the first with the
    labelled attributes (method, round, ...), the second with the details
    line. Any other count makes the unpacking below raise ValueError.

    Returns:
        The attributes from the first paragraph plus a "DETAILS" entry.
    """
    content = attr_soup.find('div', {"class": "b-fight-details__content"})
    p_attr, p_details = content.find_all('p')

    attr_dict = _parse_attr(p_attr)

    # Details are a special case: keep everything after the first colon (the
    # "DETAILS:" label). An empty details line gives "", and text with no
    # colon at all is kept whole.
    details_text = " ".join(p_details.stripped_strings).upper()
    _, colon, details_value = details_text.partition(":")
    attr_dict["DETAILS"] = details_value.strip() if colon else details_text

    return attr_dict



def _get_fight_attr(fight_soup: BeautifulSoup) -> Dict:
    """Collect the non-tabular fight attributes of a fight-details page.

    Combines the parsed content box with three derived entries: the weight
    class, a title-fight flag (the word "TITLE" appears in the fight name)
    and a performance-bonus flag.
    """
    header = fight_soup.find('div', {"class": "b-fight-details__fight"})
    fight_name = header.i.text.strip().upper()

    derived = {
        "WEIGHT_CLASS": _parse_weightclass(fight_name),
        # could also be detected via the belt icon/css tag
        "TITLE_FIGHT": "TITLE" in fight_name,
        "PERF_BONUS": _is_perf_bonus(header),
    }

    # Start from the parsed content, then layer the derived flags on top.
    attr_dict = _get_attr_content(header)
    attr_dict.update(derived)

    return attr_dict

In [None]:
def get_fight_stats(fight_link: str) -> dict:
    """Scrape one fight-details page into a flat dict.

    Four things are grabbed:
      * fighter details (name, result, fighter link/id)
      * fight attributes (method, round, etc. -- everything before the tables)
      * general stats (kd, td, sub att, rev, ctrl) -- deceptively labelled
        TOTALS on the page; the sig-strike columns of that table are skipped
        because the strike table repeats them
      * significant strike data

    Args:
        fight_link: URL of the fight-details page; its last path segment is
            used as FIGHT_ID.
    """
    fight_soup = make_soup(fight_link)

    # initiate fight_stats dict with LINK and ID.
    fight_id = fight_link.split("/")[-1]
    fight_stats = {"FIGHT_ID": fight_id, "FIGHT_LINK": fight_link}

    fight_fighters = _get_fighters(fight_soup)
    fight_attr = _get_fight_attr(fight_soup)

    # TODO: the two helpers below are not written yet in this notebook.
    fight_gen_stats = _get_fight_gen_stats(fight_soup)
    fight_strike_stats = _get_fight_strike_stats(fight_soup)

    # Fighter details were scraped but previously never added to the result.
    fight_stats.update(fight_fighters)
    fight_stats.update(fight_attr)
    fight_stats.update(fight_gen_stats)
    fight_stats.update(fight_strike_stats)

    return fight_stats

In [180]:
# Sample pages kept around for interactive parser development:
# one event page and two fight pages, fetched in that order.
event_soup, volk_lopes_soup, krylov_reyes_soup = (
    make_soup(sample_url)
    for sample_url in (
        'http://ufcstats.com/event-details/39f68882def7a507',
        'http://ufcstats.com/fight-details/e733f148060bef2a',
        'http://ufcstats.com/fight-details/b2d731415bd367df',
    )
)