# UFC Prediction Model - Data gathering
# Goal: 
1) Scrape data from UFCStats.com
2) Transform/clean data

In [103]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from urllib.parse import urlparse

# Scrape Website

## Gather all fight links from events page

In [None]:
def get_event_links():
    """
    Scrape the page with all completed events and return a list of event URLs.
    """
    url = 'http://ufcstats.com/statistics/events/completed?page=all'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    event_links = []
    # The event links are found in anchor tags with class 'b-link b-link_style_black'
    for a in soup.select('a.b-link.b-link_style_black'):
        href = a.get('href')
        if href and 'event-details' in href:
            event_links.append(href)
    return list(set(event_links))  # Remove duplicates if any


'\ndef get_event_details(soup):\n    """\n    Given the BeautifulSoup object of an event page, extract event-level details.\n    """\n    details = {}\n    # Assume event date and location are in <li> tags with class \'b-list__box-list-item\'\n    items = soup.find_all(\'li\', class_=\'b-list__box-list-item\')\n    for item in items:\n        text = item.get_text(strip=True)\n        if \'Date:\' in text:\n            details[\'Event Date\'] = text.replace(\'Date:\', \'\').strip()\n        elif \'Location:\' in text:\n            details[\'Location\'] = text.replace(\'Location:\', \'\').strip()\n    return details\n'

## Get event details

In [None]:
def get_event_id_from_url(url):
    """
    Extract the unique event ID from the event page URL.
    """
    parsed = urlparse(url)
    return parsed.path.split("/")[-1]

def get_event_details(soup, url):
    """
    Given the BeautifulSoup object of an event page and its URL,
    extract event-level details including ID, name, date, and location.
    """
    details = {}

    # 1. Event ID from URL
    details['Event ID'] = get_event_id_from_url(url)

    # 2. Event Name from header
    name_tag = soup.select_one("h2.b-content__title > span.b-content__title-highlight")
    if name_tag:
        details['Event Name'] = name_tag.get_text(strip=True)

    # 3. Date and Location from <li> tags
    items = soup.find_all('li', class_='b-list__box-list-item')
    for item in items:
        text = item.get_text(strip=True)
        if 'Date:' in text:
            details['Event Date'] = text.replace('Date:', '').strip()
        elif 'Location:' in text:
            details['Location'] = text.replace('Location:', '').strip()

    return details

## Parse per round data

In [48]:
def parse_rounds_from_combined_table(table):
    """
    Parses UFCStats per-round fighter stats, associating each <thead> round label with the following <tbody> rows.
    """
    round_stats = []
    
    current_round = None

    elements = list(table.find_all(['thead', 'tbody']))

    for element in elements:
        if element.name == "thead":
            classes = element.get("class", [])
            if "b-fight-details__table-head_rnd" in classes:
                continue  # Skip column header thead

            #print(f"📘 Advancing to Round {round_number}")
        
        elif element.name == "tbody":
            rows = element.find_all("tr", class_="b-fight-details__table-row")
            round_number = 1

            for row in rows:
                cols = row.find_all("td")
                fighter_1_data = []
                fighter_2_data = []

                for col_idx, td in enumerate(cols):
                    if col_idx == 0:
                        names = td.find_all("a")
                        if len(names) == 2:
                            fighter_1_data.append(names[0].get_text(strip=True))
                            fighter_2_data.append(names[1].get_text(strip=True))
                        else:
                            fighter_1_data.append("")
                            fighter_2_data.append("")
                    else:
                        ps = td.find_all("p")
                        if len(ps) == 2:
                            fighter_1_data.append(ps[0].get_text(strip=True))
                            fighter_2_data.append(ps[1].get_text(strip=True))
                        else:
                            fighter_1_data.append("")
                            fighter_2_data.append("")

                round_stats.append({
                    "round": f"Round {round_number}",
                    "fighter_1": fighter_1_data,
                    "fighter_2": fighter_2_data
                })

                round_number += 1  # Move to next round after each tbody

    return round_stats


## Parse overall data

In [49]:
def parse_table_dual_rows(table):
    """
    Parses tables where each <td> has two <p> tags representing two fighters.
    Returns a dict of stat_name: (fighter1_val, fighter2_val)
    """
    headers = [th.get_text(strip=True) for th in table.find("thead").find_all("th")]
    rows = table.find("tbody").find_all("tr")[0].find_all("td")

    data = {"fighter_1": {}, "fighter_2": {}}
    for i, header in enumerate(headers[1:], 1):
        values = rows[i].find_all("p")
        if len(values) == 2:
            data["fighter_1"][header] = values[0].get_text(strip=True)
            data["fighter_2"][header] = values[1].get_text(strip=True)
    return data

## Get fight stats

In [50]:
def get_fight_stats(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract winner and loser based on W/L status
    fight_outcomes = {}
    person_blocks = soup.select(".b-fight-details__person")
    for person in person_blocks:
        status_tag = person.select_one(".b-fight-details__person-status")
        name_tag = person.select_one(".b-fight-details__person-name a")
        if status_tag and name_tag:
            status = status_tag.get_text(strip=True)
            name = name_tag.get_text(strip=True)
            if status == "W":
                fight_outcomes["winner"] = name
            elif status == "L":
                fight_outcomes["loser"] = name

    # Extract fighter names
    fighter_tags = soup.select(".b-fight-details__person-name a")
    if len(fighter_tags) < 2:
        return None
    fighter_1 = fighter_tags[0].get_text(strip=True)
    fighter_2 = fighter_tags[1].get_text(strip=True)

    fight_data = {
        "fighter_1_name": fighter_1,
        "fighter_2_name": fighter_2,
        "winner": fight_outcomes.get("winner"),
        "loser": fight_outcomes.get("loser"),
        "overall_stats": {},
        "per_round_stats": {},
        "sig_strike_breakdown": {},
        "per_round_sig_strikes": {}
    }

    # Extract metadata
    metadata_section = soup.select_one(".b-fight-details__content")
    if metadata_section:
        for item in metadata_section.select(".b-fight-details__text-item"):
            label = item.select_one(".b-fight-details__label")
            if label:
                key = label.get_text(strip=True).replace(":", "").lower()
                value = item.get_text(strip=True).replace(label.get_text(strip=True), "").strip()
                fight_data[key] = value

    # Extract tables
    tables = soup.select("div.b-fight-details table")

    current_section = None

    for table in tables:
        headers = [th.get_text(strip=True) for th in table.find_all("th")]
        #print(f"{headers}")

        if 'KD' in headers and 'Round 1' not in headers:
            fight_data["overall_stats"] = parse_table_dual_rows(table)
        elif 'KD' in headers and 'Round 1' in headers:
            fight_data["per_round_stats"] = parse_rounds_from_combined_table(table)
        elif 'Head' in headers and 'Round 1' not in headers:
            fight_data["sig_strike_breakdown"] = parse_table_dual_rows(table)
        elif 'Head' in headers and 'Round 1' in headers:
            fight_data["per_round_sig_strikes"] = parse_rounds_from_combined_table(table)
        else:
            print('No headers found that matched...')
            pass

    return fight_data

## Get fights from events

In [51]:
def get_fights_from_event(event_url):
    """
    Given an event URL, scrape the event page to extract:
      - Basic fight details (fighter names, winner, weight class, method, round, time)
      - Event details (date and location)
      - Detailed fighter statistics from the fight detail page
    """
    fights = []
    try:
        response = requests.get(event_url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get event-level details (date and location)
        event_details = get_event_details(soup, event_url)

        # Locate the fight table on the event page
        table = soup.find('table', class_='b-fight-details__table')
        if not table:
            return fights

        rows = table.find_all('tr')[1:]  # Skip header row

        for row in rows:
            cols = row.find_all('td')
            if len(cols) < 10:
                continue

            # Extract fighter names from Fighter A column (usually has both names)
            fighter_links = cols[1].find_all('a')
            if len(fighter_links) >= 2:
                fighter_1 = fighter_links[0].get_text(strip=True)
                fighter_2 = fighter_links[1].get_text(strip=True)
            else:
                # Fallback if structure changes
                fighter_1 = cols[1].get_text(strip=True)
                fighter_2 = cols[2].get_text(strip=True)

            # Determine winner based on binary string
            raw_win_code = cols[0].get_text(strip=True)
            win_code = raw_win_code[:2]  # Just in case it has extra characters
            winner = "Unknown"

            if win_code == "10":
                winner = fighter_1
            elif win_code == "01":
                winner = fighter_2
            elif win_code == "00" or "Draw" in raw_win_code or "No Contest" in raw_win_code:
                winner = "Draw"

            # Build fight record
            fight = {
                'Fighter A': fighter_1,
                'Fighter B': fighter_2,
                'Winner': winner,
                'Weight Class': cols[6].get_text(strip=True),
                'Method': cols[7].get_text(strip=True),
                'Round': cols[8].get_text(strip=True),
                'Time': cols[9].get_text(strip=True),
                'Event Date': event_details.get('Event Date', None),
                'Location': event_details.get('Location', None),
                'Event Name': event_details.get('Event Name', None),
                'Event ID': event_details.get('Event ID', None)
            }

            # Extract fight detail URL from the row
            fight_link_tag = row.find('a')
            fight_detail_url = fight_link_tag['href'] if fight_link_tag else None
            if fight_detail_url:
                fight_stats = get_fight_stats(fight_detail_url)
                fight.update(fight_stats)
                time.sleep(1)

            fights.append(fight)
    except Exception as e:
        print(f"Error processing event {event_url}: {e}")
    return fights

## Main function that loops through each event

In [53]:
def scrape_ufc_data(limit_events=None):
    """
    Scrape data for all events (or limit to a set number) and compile fight details.
    """
    all_fights = []
    event_urls = get_event_links()
    print(f"Found {len(event_urls)} events.")
    if limit_events:
        event_urls = event_urls[:limit_events]
    
    for i, event_url in enumerate(event_urls, start=1):
        print(f"Processing event {i}/{len(event_urls)}: {event_url}")
        fights = get_fights_from_event(event_url)
        all_fights.extend(fights)
        time.sleep(2)  # Delay between event requests
        
    return all_fights

## Run code here to scrape website

In [54]:
if __name__ == "__main__":
    # For testing purposes, limit to the first 10 events. Remove or adjust 'limit_events' for full scraping.
    #fights_data = scrape_ufc_data(limit_events=1)
    fights_data = scrape_ufc_data()
    df = pd.DataFrame(fights_data)
    #display(df.head())
    df.to_csv('ufc_fight_history_extended.csv', index=False)
    print("Saved extended UFC fight data to 'ufc_fight_history_extended.csv'")

Found 732 events.
Processing event 1/732: http://ufcstats.com/event-details/354808cf38d9d73c
Processing event 2/732: http://ufcstats.com/event-details/706404da0775dcbc
Processing event 3/732: http://ufcstats.com/event-details/80dbeb1dd5b53e64
Processing event 4/732: http://ufcstats.com/event-details/6750e338922a099d
Processing event 5/732: http://ufcstats.com/event-details/d6b68eaf4b68b160
Processing event 6/732: http://ufcstats.com/event-details/8ad022dd81224f61
Processing event 7/732: http://ufcstats.com/event-details/f62850b3c7480db9
Processing event 8/732: http://ufcstats.com/event-details/4604ab1de9058474
Processing event 9/732: http://ufcstats.com/event-details/669a3cb6e394f515
Processing event 10/732: http://ufcstats.com/event-details/c0c1bc0766df4c00
Processing event 11/732: http://ufcstats.com/event-details/df85d6ec3493d120
Processing event 12/732: http://ufcstats.com/event-details/18f5669a92e99d92
Processing event 13/732: http://ufcstats.com/event-details/371a1c91b24dec2b
Pro

In [55]:
df.head()

Unnamed: 0,Fighter A,Fighter B,Winner,Weight Class,Method,Round,Time,Event Date,Location,Event Name,...,winner,loser,overall_stats,per_round_stats,sig_strike_breakdown,per_round_sig_strikes,round,time,time format,referee
0,Conor McGregor,Jose Aldo,Unknown,Featherweight,KO/TKOPunch,1,0:13,"December 12, 2015","Las Vegas, Nevada, USA",UFC 194: Aldo vs McGregor,...,Conor McGregor,Jose Aldo,"{'fighter_1': {'KD': '0', 'Sig. str.': '1 of 3...","[{'round': 'Round 1', 'fighter_1': ['Jose Aldo...","{'fighter_1': {'Sig. str': '1 of 3', 'Sig. str...","[{'round': 'Round 1', 'fighter_1': ['Jose Aldo...",1,0:13,5 Rnd (5-5-5-5-5),John McCarthy
1,Luke Rockhold,Chris Weidman,Unknown,Middleweight,KO/TKOPunches,4,3:12,"December 12, 2015","Las Vegas, Nevada, USA",UFC 194: Aldo vs McGregor,...,Luke Rockhold,Chris Weidman,"{'fighter_1': {'KD': '0', 'Sig. str.': '48 of ...","[{'round': 'Round 1', 'fighter_1': ['Chris Wei...","{'fighter_1': {'Sig. str': '48 of 75', 'Sig. s...","[{'round': 'Round 1', 'fighter_1': ['Chris Wei...",4,3:12,5 Rnd (5-5-5-5-5),Herb Dean
2,Yoel Romero,Jacare Souza,Unknown,Middleweight,S-DEC,3,5:00,"December 12, 2015","Las Vegas, Nevada, USA",UFC 194: Aldo vs McGregor,...,Yoel Romero,Jacare Souza,"{'fighter_1': {'KD': '0', 'Sig. str.': '30 of ...","[{'round': 'Round 1', 'fighter_1': ['Jacare So...","{'fighter_1': {'Sig. str': '30 of 52', 'Sig. s...","[{'round': 'Round 1', 'fighter_1': ['Jacare So...",3,5:00,3 Rnd (5-5-5),Marc Goddard
3,Demian Maia,Gunnar Nelson,Unknown,Welterweight,U-DEC,3,5:00,"December 12, 2015","Las Vegas, Nevada, USA",UFC 194: Aldo vs McGregor,...,Demian Maia,Gunnar Nelson,"{'fighter_1': {'KD': '0', 'Sig. str.': '47 of ...","[{'round': 'Round 1', 'fighter_1': ['Demian Ma...","{'fighter_1': {'Sig. str': '47 of 58', 'Sig. s...","[{'round': 'Round 1', 'fighter_1': ['Demian Ma...",3,5:00,3 Rnd (5-5-5),John McCarthy
4,Max Holloway,Jeremy Stephens,Unknown,Featherweight,U-DEC,3,5:00,"December 12, 2015","Las Vegas, Nevada, USA",UFC 194: Aldo vs McGregor,...,Max Holloway,Jeremy Stephens,"{'fighter_1': {'KD': '0', 'Sig. str.': '57 of ...","[{'round': 'Round 1', 'fighter_1': ['Max Hollo...","{'fighter_1': {'Sig. str': '57 of 99', 'Sig. s...","[{'round': 'Round 1', 'fighter_1': ['Max Hollo...",3,5:00,3 Rnd (5-5-5),


# Parse output into table - Overall

In [101]:
import pandas as pd
import re

def parse_strike_stat(stat_str):
    """Convert 'X of Y' string into (X, Y) integers."""
    if isinstance(stat_str, str):
        match = re.match(r'(\d+)\s*of\s*(\d+)', stat_str)
        if match:
            return int(match.group(1)), int(match.group(2))
    return None, None

def clean_percentage_column(series):
    return (
        series.replace('---', None)
              .str.replace('%', '', regex=False)
              .astype(float)
    )

parsed_rows = []

for _, row in df.iterrows():
    overall_stats = row['overall_stats']
    sig_breakdown = row['sig_strike_breakdown']

    if isinstance(overall_stats, dict) and isinstance(sig_breakdown, dict):
        # Event and metadata
        common_fields = {
            'weight_class': row['Weight Class'],
            'method': row['Method'],
            'round': row['Round'],
            'time': row['Time'],
            'event_date': row['Event Date'],
            'location': row['Location'],
            'event_name': row['Event Name'],
            'event_id': row['Event ID'],
            'time format': row['time format'],
            'referee': row['referee'],
        }

        for fighter_key, opponent_key in [('fighter_1', 'fighter_2'), ('fighter_2', 'fighter_1')]:
            stats = overall_stats.get(fighter_key, {})
            breakdown = sig_breakdown.get(fighter_key, {})

            # Overall stats
            sig_landed, sig_attempted = parse_strike_stat(stats.get('Sig. str.'))
            total_landed, total_attempted = parse_strike_stat(stats.get('Total str.'))
            td_landed, td_attempted = parse_strike_stat(stats.get('Td'))

            # Sig strike breakdown
            head_landed, head_attempted = parse_strike_stat(breakdown.get('Head'))
            body_landed, body_attempted = parse_strike_stat(breakdown.get('Body'))
            leg_landed, leg_attempted = parse_strike_stat(breakdown.get('Leg'))
            dist_landed, dist_attempted = parse_strike_stat(breakdown.get('Distance'))
            clinch_landed, clinch_attempted = parse_strike_stat(breakdown.get('Clinch'))
            ground_landed, ground_attempted = parse_strike_stat(breakdown.get('Ground'))

            # Normalize and create unique fight ID
            fighter_1_name = row['fighter_1_name'].strip()
            fighter_1 = fighter_1_name.lower().replace(" ", "_")
            fighter_2_name = row['fighter_2_name'].strip()
            fighter_2 = fighter_2_name.lower().replace(" ", "_")
            event_id = row['Event ID']
            fight_id = f"{event_id}_{fighter_1}_vs_{fighter_2}"
            fight_name = f"{fighter_1_name} vs {fighter_2_name}"

            parsed_row = {
                **common_fields,
                'fight_id': fight_id,
                'fight_name': fight_name,
                'fighter_name': row[f'{fighter_key}_name'],
                'opponent_name': row[f'{opponent_key}_name'],
                'is_winner': row['winner'] == row[f'{fighter_key}_name'],
                'knockdowns': int(stats.get('KD', 0)),
                'submission_attempts': int(stats.get('Sub. att', 0)),
                'reversals': int(stats.get('Rev.', 0)),
                'control_time': stats.get('Ctrl', '0:00'),  # Still a string like "0:02"
                'sig_str_landed': sig_landed,
                'sig_str_attempted': sig_attempted,
                'total_str_landed': total_landed,
                'total_str_attempted': total_attempted,
                'td_landed': td_landed,
                'td_attempted': td_attempted,
                'sig_str_pct': stats.get('Sig. str. %'),
                'td_pct': stats.get('Td %'),
                'head_str_landed': head_landed,
                'head_str_attempted': head_attempted,
                'body_str_landed': body_landed,
                'body_str_attempted': body_attempted,
                'leg_str_landed': leg_landed,
                'leg_str_attempted': leg_attempted,
                'distance_str_landed': dist_landed,
                'distance_str_attempted': dist_attempted,
                'clinch_str_landed': clinch_landed,
                'clinch_str_attempted': clinch_attempted,
                'ground_str_landed': ground_landed,
                'ground_str_attempted': ground_attempted,
                'sig_str_pct_detail': breakdown.get('Sig. str. %')
            }         

            parsed_rows.append(parsed_row)

# Final fighter-level DataFrame
fighter_df = pd.DataFrame(parsed_rows)

# Float transformations
fighter_df['td_pct'] = clean_percentage_column(fighter_df['td_pct'])
fighter_df['sig_str_pct'] = clean_percentage_column(fighter_df['sig_str_pct'])
fighter_df['sig_str_pct_detail'] = clean_percentage_column(fighter_df['sig_str_pct_detail'])

In [108]:
fighter_df.to_csv(r'overall_fight_data.csv', index=False)

# Parse Per round data

In [None]:
def parse_strike_stat(stat_str):
    if isinstance(stat_str, str):
        match = re.match(r'(\d+)\s*of\s*(\d+)', stat_str)
        if match:
            return int(match.group(1)), int(match.group(2))
    return None, None

def clean_percentage_column(series):
    return (
        series.replace('---', None)
              .str.replace('%', '', regex=False)
              .astype(float)
    )
    
def parse_per_round_data(per_round_stats, per_round_sig_strikes, fight_metadata):
    rows = []

    for round_stats, round_sig in zip(per_round_stats, per_round_sig_strikes):
        round_num = round_stats.get('round', '').replace('Round ', '')

        for fighter_key in ['fighter_1', 'fighter_2']:
            stats = round_stats[fighter_key]
            sigs = round_sig[fighter_key]

            fighter_name = stats[0]

            row = {
                **fight_metadata,
                'fighter_name': fighter_name,
                'round_number': int(round_num)
            }

            # From per_round_stats
            row['knockdowns'] = int(stats[1])
            row['sig_str_landed'], row['sig_str_attempted'] = parse_strike_stat(stats[2])
            row['sig_str_pct'] = stats[3]
            row['total_str_landed'], row['total_str_attempted'] = parse_strike_stat(stats[4])
            row['td_landed'], row['td_attempted'] = parse_strike_stat(stats[5])
            row['td_pct'] = stats[6]
            row['submission_attempts'] = int(stats[7])
            row['reversals'] = int(stats[8])
            row['control_time'] = stats[9]

            # From per_round_sig_strikes
            row['sig_str_landed_detail'], row['sig_str_attempted_detail'] = parse_strike_stat(sigs[1])
            row['sig_str_pct_detail'] = sigs[2]

            row['head_str_landed'], row['head_str_attempted'] = parse_strike_stat(sigs[3])
            row['body_str_landed'], row['body_str_attempted'] = parse_strike_stat(sigs[4])
            row['leg_str_landed'], row['leg_str_attempted'] = parse_strike_stat(sigs[5])
            row['distance_str_landed'], row['distance_str_attempted'] = parse_strike_stat(sigs[6])
            row['clinch_str_landed'], row['clinch_str_attempted'] = parse_strike_stat(sigs[7])
            row['ground_str_landed'], row['ground_str_attempted'] = parse_strike_stat(sigs[8])

            rows.append(row)

    return rows

In [99]:
all_round_rows = []

for _, row in df.iterrows():
    # Safeguard: ensure the per-round fields are proper lists
    if not isinstance(row['per_round_stats'], list) or not isinstance(row['per_round_sig_strikes'], list):
        continue
    
    # Normalize and create unique fight ID
    fighter_1_name = row['fighter_1_name'].strip()
    fighter_1 = fighter_1_name.lower().replace(" ", "_")
    fighter_2_name = row['fighter_2_name'].strip()
    fighter_2 = fighter_2_name.lower().replace(" ", "_")
    event_id = row['Event ID']
    fight_id = f"{event_id}_{fighter_1}_vs_{fighter_2}"
    fight_name = f"{fighter_1_name} vs {fighter_2_name}"

    # Extract metadata from the current fight row
    fight_metadata = {
        'event_id': row.get('Event ID'),
        'event_name': row.get('Event Name'),
        'event_date': row.get('Event Date'),
        'location': row.get('Location'),
        'weight_class': row.get('Weight Class'),
        'method': row.get('Method'),
        'winner': row.get('winner'),
        'loser': row.get('loser'),
        'round': row.get('Round'),
        'time': row.get('Time'),
        'referee': row.get('referee'),
        'fighter_1_name': row.get('fighter_1_name'),
        'fighter_2_name': row.get('fighter_2_name'),
        'fight_id': fight_id,
        'fight_name': fight_name
    }

    # Parse the round-by-round stats into rows
    round_rows = parse_per_round_data(row['per_round_stats'], row['per_round_sig_strikes'], fight_metadata)

    # Collect all rows
    all_round_rows.extend(round_rows)

# Final DataFrame
df_rounds = pd.DataFrame(all_round_rows)

# Float transformations
df_rounds['td_pct'] = clean_percentage_column(df_rounds['td_pct'])
df_rounds['sig_str_pct'] = clean_percentage_column(df_rounds['sig_str_pct'])
df_rounds['sig_str_pct_detail'] = clean_percentage_column(df_rounds['sig_str_pct_detail'])


In [104]:
df_rounds.to_csv('round_data.csv', index=False)