In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime
import json

def get_fighter_links(): # gets the links to each fighters profile
    base_url = 'http://ufcstats.com/statistics/fighters?'
    all_links = set()
    
    for char in 'abcdefghijklmnopqrstuvwxyz':
        url = f"{base_url}char={char}&page=all"
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            soup = BeautifulSoup(response.content, 'lxml')
            
            # Target the specific links in table rows
            for row in soup.select('tr.b-statistics__table-row'):
                link_tag = row.find('a', class_='b-link b-link_style_black')
                if link_tag and link_tag.has_attr('href'):
                    all_links.add(link_tag['href'])
                    
        except Exception as e:
            print(f"Error processing {char.upper()}: {str(e)}")
    
    return list(all_links)

# Usage
fighter_links = get_fighter_links()
print(f"Found {len(fighter_links)} unique fighter profiles")


Found 4320 unique fighter profiles


In [5]:
def parse_fighter_page(url): # scrapes the data needed from each fighters profile
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(response.content, 'lxml')
        
        data = {
            'name': extract_name(soup),
            'record': extract_record(soup),
            **extract_physical_stats(soup),
            **extract_performance_stats(soup)
        }
        return data
        
    except Exception as e:
        print(f"Error parsing {url}: {str(e)}")
        return None

def extract_name(soup):
    name_tag = soup.find('span', class_='b-content__title-highlight')
    return name_tag.get_text(strip=True) if name_tag else 'Missing'

def extract_record(soup):
    record_tag = soup.find('span', class_='b-content__title-record')
    return record_tag.get_text(strip=True).replace('Record: ', '') if record_tag else 'Missing'

def extract_physical_stats(soup):
    stats = {
        'height': 'Missing',
        'weight': 'Missing',
        'reach': 'Missing',
        'stance': 'Missing',
        'dob': 'Missing'
    }
    
    items = soup.find_all('li', class_='b-list__box-list-item_type_block')
    for item in items:
        title = item.find('i').get_text(strip=True) if item.find('i') else ''
        value = item.get_text(strip=True).replace(title, '') if title else item.get_text(strip=True)
        
        if 'Height:' in title:
            stats['height'] = value
        elif 'Weight:' in title:
            stats['weight'] = value
        elif 'Reach:' in title:
            stats['reach'] = value
        elif 'STANCE:' in title:
            stats['stance'] = value
        elif 'DOB:' in title:
            stats['dob'] = value
            
    return stats

def extract_performance_stats(soup):
    metrics = {
        'SLpM': 'Missing',
        'Str. Acc.': 'Missing',
        'SApM': 'Missing',
        'Str. Def': 'Missing',
        'TD Avg.': 'Missing',
        'TD Acc.': 'Missing',
        'TD Def.': 'Missing',
        'Sub. Avg.': 'Missing'
    }
    
    items = soup.find_all('li', class_='b-list__box-list-item_type_block')
    for item in items:
        title = item.find('i', class_='b-list__box-item-title_font_lowercase')
        if not title:
            continue
            
        title_text = title.get_text(strip=True).replace(':', '')
        value = item.get_text(strip=True).replace(title_text, '').strip()
        
        if title_text in metrics:
            metrics[title_text] = value
            
    return metrics

# Usage example
fighter_url = "http://ufcstats.com/fighter-details/f923e012414c883e"
print(parse_fighter_page(fighter_url))


{'name': 'Lauren Mueller', 'record': '5-3-0', 'height': '5\' 5"', 'weight': '125 lbs.', 'reach': '67"', 'stance': 'Orthodox', 'dob': 'Nov 15, 1991', 'SLpM': ':4.08', 'Str. Acc.': ':41%', 'SApM': ':3.77', 'Str. Def': ':58%', 'TD Avg.': ':0.70', 'TD Acc.': ':50%', 'TD Def.': ':71%', 'Sub. Avg.': ':0.0'}


In [6]:
def scrape_all_fighters_fast(save_interval=50): #scrapes data for all the fighters
    
    # Create output directory if it doesn't exist
    output_dir = "ufc_fighter_data"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Get timestamp for filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Get all fighter links
    print("Fetching all fighter profile links...")
    fighter_links = get_fighter_links()
    total_fighters = len(fighter_links)
    print(f"Found {total_fighters} unique fighter profiles")
    
    # Process each fighter
    all_fighter_data = []
    processed = 0
    errors = 0
    
    for i, link in enumerate(fighter_links):
        # Process the fighter
        print(f"\rProcessing fighter {i+1}/{total_fighters}: {link}", end="")
        
        try:
            fighter_data = parse_fighter_page(link)
            if fighter_data:
                # Add URL to the data
                fighter_data['url'] = link
                all_fighter_data.append(fighter_data)
                processed += 1
                
                # Save progress periodically
                if processed % save_interval == 0:
                    interim_filename = f"{output_dir}/ufc_fighters_interim_{timestamp}_{processed}.json"
                    with open(interim_filename, 'w') as f:
                        json.dump(all_fighter_data, f, indent=2)
                    print(f"\nSaved interim data for {processed} fighters to {interim_filename}")
            else:
                errors += 1
                print(f"\nWarning: No data returned for {link}")
        
        except Exception as e:
            errors += 1
            print(f"\nError processing {link}: {str(e)}")
    
    # Save final results
    final_filename = f"{output_dir}/ufc_fighters_complete_{timestamp}.json"
    with open(final_filename, 'w') as f:
        json.dump(all_fighter_data, f, indent=2)
    
    print(f"\nCompleted! Processed {processed} fighters out of {total_fighters} links")
    print(f"Encountered {errors} errors")
    print(f"Data saved to {final_filename}")
    
    return all_fighter_data

# Run the fast scraper
print("UFC Fighter Data Scraper starting...")
all_data = scrape_all_fighters_fast()
print("Scraping completed!")
df = pd.DataFrame(all_data)
df.to_csv('ufc_fighters_data.csv', index=False)


UFC Fighter Data Scraper starting...
Fetching all fighter profile links...
Found 4320 unique fighter profiles
Processing fighter 50/4320: http://ufcstats.com/fighter-details/402ea3b5ee233852
Saved interim data for 50 fighters to ufc_fighter_data/ufc_fighters_interim_20250508_220431_50.json
Processing fighter 100/4320: http://ufcstats.com/fighter-details/4d5197f331c47290
Saved interim data for 100 fighters to ufc_fighter_data/ufc_fighters_interim_20250508_220431_100.json
Processing fighter 150/4320: http://ufcstats.com/fighter-details/f626118b6da0e020
Saved interim data for 150 fighters to ufc_fighter_data/ufc_fighters_interim_20250508_220431_150.json
Processing fighter 200/4320: http://ufcstats.com/fighter-details/f9a20a17d712ef7c
Saved interim data for 200 fighters to ufc_fighter_data/ufc_fighters_interim_20250508_220431_200.json
Processing fighter 250/4320: http://ufcstats.com/fighter-details/5eedbf1e9601be35
Saved interim data for 250 fighters to ufc_fighter_data/ufc_fighters_interi