# Web Scraping - UFC.com

## Notebook Setup

In [76]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

In [77]:
# Define URLs to be scraped
url_base = "http://www.ufcstats.com/statistics/fighters?char="
url_page = "&page=all"
url = "http://www.ufcstats.com/statistics/fighters?char=a"

## Create Fighter DataFrame

In [78]:
def get_table_body(url: str):
    """Send get request to url to get html text and find the table on the webpage."""
    
    # Send get request to URL provided
    page = requests.get(url)

    # Return the html text of the page
    soup = bs(page.text, 'lxml')

    # Find the table of data on the page 
    table_body = soup.find('table')
    
    return table_body, soup

In [79]:
def get_fighter_column_headers(url):
    """Get headers for the table on webpage."""
        # Send get request to URL provided
    page = requests.get(url)

    # Return the html text of the page
    soup = bs(page.text, 'lxml')
    
    column_headers_tag = soup.find('tr', class_= "b-statistics__table-row")
    column_headers = [header.text.strip() for header in column_headers_tag.find_all(['th', 'td'])]    
    return column_headers

In [80]:
def get_fighter_details(table_body: bs):
    """Get fighter details from a single web page."""
    
    # Empty lists to store data
    table_data = []
    current_row = []

    # Find HTML where table data is listed
    for row in table_body.find_all('tr'):
        cols = row.find_all('td')
        # Enumerate through each row/column
        for i, ele in enumerate(cols, start=1):
            col = ele.text.strip()
            current_row.append(col)

            # When i reaches 10, add current row to row_data and reset the current_row list
            if i == 10:
                table_data.append(current_row)
                current_row = []

    # Convert data to a DataFrame using row_data
    page_df = pd.DataFrame(table_data).drop(0, axis=1)
    
    return page_df

In [85]:
def get_all_fighter_details(url_base, url_page):
    # List of letters to be used to sort web page by last name
    letters = [chr(i) for i in range(ord('a'), ord('z') + 1)]

    # Master DataFrame to store all pages fighter data
    master_df = pd.DataFrame()

    # Loop to go through all web pages based on the letter of the last name
    for letter in letters:
        # Set url to be scraped
        page_url = url_base + letter + url_page
        
        # Send get request to url to get html text and find the table on the webpage.
        table_body, _ = get_table_body(page_url)
        
        # Get fighter details from a single web page
        page_df = get_fighter_details(table_body)
        
        # Add page_data to master_df
        master_df = pd.concat([master_df, page_df], ignore_index=True, axis=0)
        
    return master_df

In [83]:
letter = 'a'
url2 = url_base + letter + url_page
print(url2)

http://www.ufcstats.com/statistics/fighters?char=a&page=all


In [86]:
# Run all functions to capture fighter details
fighter_details = get_all_fighter_details(url_base, url_page)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# Display the shape and first 5 rows of compiled fighter dataframe
print(fighter_details.shape)
fighter_details.head()

(3953, 10)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0


In [None]:
cols = get_fighter_column_headers(url)
print(cols)

['First', 'Last', 'Nickname', 'Ht.', 'Wt.', 'Reach', 'Stance', 'W', 'L', 'D', 'Belt']


## Create Event DataFrame

In [None]:
url = "http://www.ufcstats.com/statistics/events/completed?page=all"

In [None]:
# Create list of event web page links


In [None]:
page = requests.get(url)

# Return the html text of the page
soup = bs(page.text, 'lxml')

column_headers_tag = soup.find('tr', class_= "b-statistics__table-row")
column_headers = [header.text.strip() for header in column_headers_tag.find_all(['th', 'td'])]

column_headers[:-1]

['First', 'Last', 'Nickname', 'Ht.', 'Wt.', 'Reach', 'Stance', 'W', 'L', 'D']

In [None]:
print(url)

http://www.ufcstats.com/statistics/fighters?char=a


In [None]:
print(url_base)

http://www.ufcstats.com/statistics/fighters?char=
