# Scrape UFCStats.com

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
def get_event_links(timeout=30):
    """
    Scrape the completed-events index page and return the event detail URLs.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of unique hrefs containing 'event-details', in the order in
        which they were first collected from the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'http://ufcstats.com/statistics/events/completed?page=all'
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    hrefs = (a.get('href') for a in soup.select('a.b-link.b-link_style_black'))
    event_links = [href for href in hrefs if href and 'event-details' in href]

    # dict.fromkeys removes duplicates while keeping insertion order;
    # list(set(...)) returned an arbitrary order, which made any later
    # slicing of the result unpredictable.
    return list(dict.fromkeys(event_links))

# Smoke test: fetch the event index once and report how many event URLs were collected.
event_urls = get_event_links()
print(f"Collected {len(event_urls)} event URLs.")

Collected 724 event URLs.


In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_event_links(timeout=30):
    """
    Scrape the page with all completed events and return a list of event URLs.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of unique hrefs containing 'event-details', in the order in
        which they were first collected from the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'http://ufcstats.com/statistics/events/completed?page=all'
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The event links are found in anchor tags with class 'b-link b-link_style_black'
    hrefs = (a.get('href') for a in soup.select('a.b-link.b-link_style_black'))
    event_links = [href for href in hrefs if href and 'event-details' in href]

    # dict.fromkeys removes duplicates while keeping insertion order;
    # list(set(...)) returned an arbitrary order, so slicing the result
    # (e.g. to limit the number of events) picked unpredictable events.
    return list(dict.fromkeys(event_links))

def get_event_details(soup):
    """
    Pull the event-level fields out of a parsed event page.

    Every <li> with class 'b-list__box-list-item' is inspected. An item whose
    text contains 'Date:' populates 'Event Date'; otherwise an item whose text
    contains 'Location:' populates 'Location'. The label is removed from the
    text and the remainder is stripped. Keys are absent when no matching item
    is found; a later matching item overwrites an earlier one.
    """
    # Checked in order; the first label found in an item wins for that item.
    label_to_key = (
        ('Date:', 'Event Date'),
        ('Location:', 'Location'),
    )

    details = {}
    for list_item in soup.find_all('li', class_='b-list__box-list-item'):
        item_text = list_item.get_text(strip=True)
        for label, key in label_to_key:
            if label in item_text:
                details[key] = item_text.replace(label, '').strip()
                break
    return details

def parse_table_dual_rows(table):
    """
    Parses tables where each <td> has two <p> tags representing two fighters.

    Only the first <tr> of the first <tbody> is read. The first header/cell
    pair is skipped, and every remaining header is paired positionally with
    the cell in the same column.

    Args:
        table: A parsed <table> element exposing find()/find_all().

    Returns:
        {"fighter_1": {header: value, ...}, "fighter_2": {header: value, ...}}.
        Columns whose cell does not contain exactly two <p> tags are omitted.
        Both inner dicts are empty when the table has no <thead>, no <tbody>,
        or no body row.
    """
    data = {"fighter_1": {}, "fighter_2": {}}

    thead = table.find("thead")
    tbody = table.find("tbody")
    # find() returns None for a missing section; bail out with empty results
    # rather than raising AttributeError on malformed or unexpected tables.
    if thead is None or tbody is None:
        return data

    body_rows = tbody.find_all("tr")
    if not body_rows:
        return data

    headers = [th.get_text(strip=True) for th in thead.find_all("th")]
    cells = body_rows[0].find_all("td")

    # zip() stops at the shorter sequence, so a row with fewer cells than
    # headers no longer raises IndexError. Index 0 is skipped on both sides
    # to keep headers and cells aligned by column.
    for header, cell in zip(headers[1:], cells[1:]):
        values = cell.find_all("p")
        if len(values) == 2:
            data["fighter_1"][header] = values[0].get_text(strip=True)
            data["fighter_2"][header] = values[1].get_text(strip=True)
    return data


def get_fight_stats(url, timeout=30):
    """
    Fetch one fight-detail page and collect fighter names, metadata and stats.

    Args:
        url: URL of the fight-detail page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with 'fighter_1_name' and 'fighter_2_name', one lower-cased
        key per labelled metadata item found on the page, and, when the
        corresponding tables exist, 'overall_stats', 'per_round_stats' and
        'sig_strike_breakdown'. Returns None when the server answers with an
        error status or the page does not list at least two fighter names.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # An error page is not a fight page; report "no data" the same way
        # a page without fighter names does.
        return None
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract fighter names
    fighter_tags = soup.select(".b-fight-details__person-name a")
    if len(fighter_tags) < 2:
        return None

    fight_data = {
        "fighter_1_name": fighter_tags[0].get_text(strip=True),
        "fighter_2_name": fighter_tags[1].get_text(strip=True),
    }

    # Extract metadata: method, round, time, referee, etc.
    metadata_section = soup.select_one(".b-fight-details__content")
    if metadata_section:
        for item in metadata_section.select(".b-fight-details__text-item"):
            label = item.select_one(".b-fight-details__label")
            if label:
                label_text = label.get_text(strip=True)
                key = label_text.replace(":", "").lower()
                # Remove only the first occurrence so a value that happens to
                # repeat the label text is not mangled.
                value = item.get_text(strip=True).replace(label_text, "", 1).strip()
                fight_data[key] = value

    # Extract fight stats summary table
    tables = soup.find_all("table", class_="b-fight-details__table")
    if tables:
        fight_data["overall_stats"] = parse_table_dual_rows(tables[0])

    # Extract per-round summary stats
    if len(tables) > 1:
        # Entries are numbered by the position of each 'js-fight-table' on the
        # page. NOTE(review): confirm these tables map one-to-one to rounds.
        round_tables = soup.select("table.b-fight-details__table.js-fight-table")
        fight_data["per_round_stats"] = {
            f"round_{i}": parse_table_dual_rows(round_table)
            for i, round_table in enumerate(round_tables, start=1)
        }

    # Extract significant strikes breakdown (overall)
    if len(tables) > 2:
        fight_data["sig_strike_breakdown"] = parse_table_dual_rows(tables[2])

    return fight_data

def get_fights_from_event(event_url, timeout=30):
    """
    Given an event URL, scrape the event page to extract:
      - Basic fight details (fighter names, winner, weight class, method, round, time)
      - Event details (date and location)
      - Detailed fighter statistics from the fight detail page

    Args:
        event_url: URL of the event page.
        timeout: Seconds to wait for the event page response before giving up.

    Returns:
        A list with one dict per fight row that has at least 10 columns. When
        the fight-detail page cannot be fetched or parsed, the row is still
        returned with its basic fields only. Any error while processing the
        event page itself is printed and the fights collected so far are
        returned.
    """
    fights = []
    try:
        # Without a timeout, requests can block forever on a stalled connection.
        response = requests.get(event_url, timeout=timeout)
        # An error status is reported by the handler below instead of being
        # parsed as if it were an event page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get event-level details (date and location)
        event_details = get_event_details(soup)

        # Locate the fight table on the event page.
        table = soup.find('table', class_='b-fight-details__table')
        if table is None:
            return fights

        for row in table.find_all('tr')[1:]:  # Skip header row
            cols = row.find_all('td')
            # Verify there are enough columns before indexing
            if len(cols) < 10:
                continue

            # Basic fight details (you may need to adjust the indices based on the site's structure)
            fight = {
                'Fighter A': cols[1].get_text(strip=True),
                'Fighter B': cols[2].get_text(strip=True),
                'Winner': cols[0].get_text(strip=True),
                'Weight Class': cols[6].get_text(strip=True),
                'Method': cols[7].get_text(strip=True),
                'Round': cols[8].get_text(strip=True),
                'Time': cols[9].get_text(strip=True),
                'Event Date': event_details.get('Event Date', None),
                'Location': event_details.get('Location', None)
            }

            # Extract the fight detail URL from the first <a> tag in the row.
            # .get() avoids a KeyError on an anchor without an href.
            fight_link_tag = row.find('a')
            fight_detail_url = fight_link_tag.get('href') if fight_link_tag is not None else None
            if fight_detail_url:
                try:
                    fight_stats = get_fight_stats(fight_detail_url)
                except Exception as e:
                    # One broken detail page must not drop the remaining
                    # fights of the event; keep the basic row and move on.
                    print(f"Error processing fight {fight_detail_url}: {e}")
                    fight_stats = None
                # get_fight_stats returns None when the page has no usable
                # data; dict.update(None) would raise TypeError.
                if fight_stats:
                    fight.update(fight_stats)
                time.sleep(1)  # Delay between fight detail requests

            fights.append(fight)
    except Exception as e:
        print(f"Error processing event {event_url}: {e}")
    return fights

def scrape_ufc_data(limit_events=None):
    """
    Scrape data for all events (or limit to a set number) and compile fight details.

    Args:
        limit_events: Maximum number of event URLs to process, taken from the
            front of the list returned by get_event_links(). None (the
            default) processes every event; 0 processes none.

    Returns:
        A flat list containing the fight dicts of every processed event.
    """
    all_fights = []
    event_urls = get_event_links()
    print(f"Found {len(event_urls)} events.")
    # Compare against None explicitly: a plain truthiness test treated
    # limit_events=0 as "no limit" and scraped every event.
    if limit_events is not None:
        event_urls = event_urls[:limit_events]

    total = len(event_urls)
    for i, event_url in enumerate(event_urls, start=1):
        print(f"Processing event {i}/{total}: {event_url}")
        all_fights.extend(get_fights_from_event(event_url))
        if i < total:
            time.sleep(2)  # Delay between event requests; not needed after the last one
    return all_fights

if __name__ == "__main__":
    # For testing purposes, limit the run to a single event. Remove or adjust 'limit_events' for full scraping.
    fights_data = scrape_ufc_data(limit_events=1)
    # scrape_ufc_data returns a list of fight dicts; each dict becomes one DataFrame row.
    df = pd.DataFrame(fights_data)
    # index=False keeps the DataFrame's row index out of the CSV file.
    df.to_csv('ufc_fight_history_extended.csv', index=False)
    print("Saved extended UFC fight data to 'ufc_fight_history_extended.csv'")

Found 725 events.
Processing event 1/1: http://ufcstats.com/event-details/bd4389b71fdc0ce2
Saved extended UFC fight data to 'ufc_fight_history_extended.csv'
