# Web Scraping - UFC.com

## Notebook Setup

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

In [2]:
# URL pieces for the fighter listing pages: a full page address is built as
# url_base + <letter> + url_page.
url_base = "http://www.ufcstats.com/statistics/fighters?char="
url_page = "&page=all"
# Listing for the letter "a" (no page suffix), derived from the shared prefix
url = f"{url_base}a"

## Get Fighter Details

In [3]:
def get_table_body(url: str, timeout: float = 30):
    """Download a web page and locate the first table on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``(table_body, soup)`` tuple: the first ``<table>`` element found on
        the page (``None`` when the page contains no table) and the parsed
        page itself.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Send get request to URL provided
    page = requests.get(url, timeout=timeout)

    # Fail loudly on an error response instead of parsing an error page as data
    page.raise_for_status()

    # Parse the html text of the page
    soup = bs(page.text, 'lxml')

    # Find the table of data on the page
    table_body = soup.find('table')

    return table_body, soup

In [4]:
def get_fighter_column_headers(url):
    """Get headers for the table on webpage.

    Args:
        url: Address of the page whose table headers should be read.

    Returns:
        List of stripped text values, one per ``<th>``/``<td>`` cell of the
        first ``<tr>`` with class ``b-statistics__table-row`` on the page.

    Raises:
        ValueError: If the page contains no such row.
    """
    # Reuse the shared fetch/parse helper rather than repeating the request here
    _, soup = get_table_body(url)

    # find() returns the first matching row, which is treated as the header row
    column_headers_tag = soup.find('tr', class_="b-statistics__table-row")
    if column_headers_tag is None:
        raise ValueError(f"No table header row found on page: {url}")

    return [header.text.strip() for header in column_headers_tag.find_all(['th', 'td'])]

In [5]:
def get_fighter_details(table_body: bs, num_columns: int = 10):
    """Get fighter details from a single web page.

    Every table row is parsed on its own: a row contributes one record made of
    its first ``num_columns`` ``<td>`` cells (any further cells are ignored),
    and rows with fewer cells are skipped.

    Args:
        table_body: Parsed table element whose ``<tr>`` rows hold the data.
        num_columns: Number of leading cells to keep from each row.

    Returns:
        DataFrame with one row per record and columns labelled
        ``1..num_columns``. The frame is empty (but keeps those columns) when
        the table has no data rows.

    Raises:
        ValueError: If ``table_body`` is ``None`` (no table was found).
    """
    if table_body is None:
        raise ValueError("No table to parse: table_body is None")

    table_data = []
    for row in table_body.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]

        # Parsing row by row keeps a short row (e.g. a single-cell spacer row)
        # or a surplus trailing cell from leaking into the next record.
        if len(cells) >= num_columns:
            table_data.append(cells[:num_columns])

    # Labels start at 1 to stay compatible with the labels this function
    # returned before (it used to build a leading throwaway column 0 and drop it).
    return pd.DataFrame(table_data, columns=range(1, num_columns + 1))

In [6]:
def get_all_fighter_details(url_base, url_page):
    """Scrape the fighter listing page of every letter a-z and combine them.

    Args:
        url_base: URL prefix; the letter is appended directly after it.
        url_page: URL suffix appended after the letter.

    Returns:
        A single DataFrame holding the rows of all 26 pages, in a-z order,
        with a fresh 0-based index.
    """
    # One DataFrame per letter page, combined after the loop
    page_frames = []

    # Loop through all web pages based on the letter in the URL
    for code in range(ord('a'), ord('z') + 1):
        # Set url to be scraped
        page_url = url_base + chr(code) + url_page

        # Send get request to url to get html text and find the table on the webpage.
        table_body, _ = get_table_body(page_url)

        # Get fighter details from a single web page
        page_frames.append(get_fighter_details(table_body))

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previously collected rows on every iteration.
    return pd.concat(page_frames, ignore_index=True, axis=0)

In [7]:
# Scrape every letter page and collect the results in one dataframe
fighter_details = get_all_fighter_details(url_base=url_base, url_page=url_page)

In [8]:
# Label the fighter details dataframe with the page's header names,
# leaving out the last header
cols = get_fighter_column_headers(url)[:-1]
fighter_details.columns = cols

In [9]:
# Display the shape and last 5 rows of compiled fighter dataframe
print(fighter_details.shape)
fighter_details.tail()

(3953, 10)


Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D
3948,Dave,Zitanick,,--,170 lbs.,--,,5,7,0
3949,Alex,Zuniga,,--,145 lbs.,--,,6,3,0
3950,George,Zuniga,,"5' 9""",185 lbs.,--,,3,1,0
3951,Allan,Zuniga,Tigre,"5' 7""",155 lbs.,"70.0""",Orthodox,13,1,0
3952,Virgil,Zwicker,RezDog,"6' 2""",205 lbs.,"74.0""",,15,6,1


In [12]:
# Persist the compiled fighter details dataframe as a pickle file
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere
fighter_details_pickle_path = '/Users/nathananderson/Documents/Data_Science/WebScraping/git/Web_Scraping/Pickle Files/fighter_details.pkl'
fighter_details.to_pickle(fighter_details_pickle_path)

## Get Fighter URLs