# Web Scraping - UFC.com

## Notebook Setup

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

In [2]:
# Define URLs to be scraped
url_base = "http://www.ufcstats.com/statistics/fighters?char="
url_page = "&page=all"
url = "http://www.ufcstats.com/statistics/fighters?char=a"

## Get Fighter Details

In [3]:
def get_table_body(url: str):
    """Send get request to url to get html text and find the table on the webpage."""
    
    # Send get request to URL provided
    page = requests.get(url)

    # Return the html text of the page
    soup = bs(page.text, 'lxml')

    # Find the table of data on the page 
    table_body = soup.find('table')
    
    return table_body, soup

In [4]:
def get_fighter_column_headers(url):
    """Get headers for the table on webpage."""
        # Send get request to URL provided
    page = requests.get(url)

    # Return the html text of the page
    soup = bs(page.text, 'lxml')
    
    column_headers_tag = soup.find('tr', class_= "b-statistics__table-row")
    column_headers = [header.text.strip() for header in column_headers_tag.find_all(['th', 'td'])]    
    return column_headers

In [5]:
def get_fighter_details(table_body: bs):
    """Get fighter details from a single web page."""
    
    # Empty lists to store data
    table_data = []
    current_row = []

    # Find HTML where table data is listed
    for row in table_body.find_all('tr'):
        cols = row.find_all('td')
        # Enumerate through each row/column
        for i, ele in enumerate(cols, start=1):
            col = ele.text.strip()
            current_row.append(col)

            # When i reaches 10, add current row to row_data and reset the current_row list
            if i == 10:
                table_data.append(current_row)
                current_row = []

    # Convert data to a DataFrame using row_data
    page_df = pd.DataFrame(table_data).drop(0, axis=1)
    
    return page_df

In [6]:
def get_all_fighter_details(url_base, url_page):
    # List of letters to be used to sort web page by last name
    letters = [chr(i) for i in range(ord('a'), ord('z') + 1)]

    # Master DataFrame to store all pages fighter data
    master_df = pd.DataFrame()

    # Loop to go through all web pages based on the letter of the last name
    for letter in letters:
        # Set url to be scraped
        page_url = url_base + letter + url_page
        
        # Send get request to url to get html text and find the table on the webpage.
        table_body, _ = get_table_body(page_url)
        
        # Get fighter details from a single web page
        page_df = get_fighter_details(table_body)
        
        # Add page_data to master_df
        master_df = pd.concat([master_df, page_df], ignore_index=True, axis=0)
        
    return master_df

In [7]:
# Run all functions to capture fighter details
fighter_details = get_all_fighter_details(url_base, url_page)

In [8]:
# Add column headers to fighter details dataframe
cols = get_fighter_column_headers(url)
cols = cols[:-1]
fighter_details.columns = cols

In [148]:
# Display the shape and first 5 rows of compiled fighter dataframe
print(fighter_details.shape)
fighter_details.head()

(3953, 10)


Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0


In [12]:
# Save fighter details dataframe to a pickle file            
fighter_details.to_pickle('/Users/nathananderson/Documents/Data_Science/WebScraping/git/Web_Scraping/Pickle Files/fighter_details.pkl')

## Get Fighter URLs

In [158]:
def get_fighter_links(url):
    """Create a list of web links for all UFC fighters."""
    
    # Send get request to url to get html text and find the table on the webpage.
    table_body, _ = get_table_body(url)
    
    # Store captured data
    fighter_names = []
    fighter_links = []
    name_and_link = {}

    # Loop through all 'tr' in the table body
    for row in table_body.find_all('tr'):
        # Find all 'td' for the table
        cols = row.find_all('td')
        
        # Temporary storage for names and links
        temp_fighter_name = []
        fighter_link = None
        
        # Find all 'a' in the table
        for col in cols:
            a_tag = col.find('a')
            
            # Determine what to do with the 'a' tag info
            if a_tag:
                # Get fighter link
                fighter_link = a_tag['href']
                # Get fighter name
                fighter_name = a_tag.get_text()
                temp_fighter_name.append(fighter_name)

        # Only keep first two names - drop nickname
        joined_name = ' '.join(temp_fighter_name[0:2])

        # Add joined_name and its link to name_and_link
        if joined_name not in name_and_link and fighter_link is not None:
            name_and_link[joined_name] = fighter_link
            fighter_names.append(joined_name)
            fighter_links.append(fighter_link)
                
    fighter_names_filtered = [item for item in fighter_names if item != '']
        
    return fighter_links, fighter_names_filtered, name_and_link

In [159]:
fighter_links, fighter_names_filtered, name_and_link = get_fighter_links(url)

In [162]:
name_and_link

{'Tom Aaron': 'http://www.ufcstats.com/fighter-details/93fe7332d16c6ad9',
 'Danny Abbadi': 'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
 'Nariman Abbasov': 'http://www.ufcstats.com/fighter-details/59a9d6dac61c2540',
 'David Abbott': 'http://www.ufcstats.com/fighter-details/b361180739bed4b0',
 'Hamdy Abdelwahab': 'http://www.ufcstats.com/fighter-details/3329d692aea4dc28',
 'Shamil Abdurakhimov': 'http://www.ufcstats.com/fighter-details/2f5cbecbbe18bac4',
 'Hiroyuki Abe': 'http://www.ufcstats.com/fighter-details/c0ed7b208197e8de',
 'Daichi Abe': 'http://www.ufcstats.com/fighter-details/5140122c3eecd307',
 'Papy Abedi': 'http://www.ufcstats.com/fighter-details/c9f6385af6df66d7',
 'Ricardo Abreu': 'http://www.ufcstats.com/fighter-details/aa6e591c2a2cdecd',
 'Klidson Abreu': 'http://www.ufcstats.com/fighter-details/7279654c7674cd24',
 'Daniel Acacio': 'http://www.ufcstats.com/fighter-details/1c5879330d42255f',
 'John Adajar': 'http://www.ufcstats.com/fighter-details/989b85f65

In [167]:
def get_all_fighter_links(url_base, url_page):
    # List of letters to be used to sort web page by last name
    letters = [chr(i) for i in range(ord('a'), ord('z') + 1)]

    # Master DataFrame to store all pages fighter data
    master_links = []

    # Loop to go through all web pages based on the letter of the last name
    for letter in letters:
        # Set url to be scraped
        page_url = url_base + letter + url_page
        
        # Get fighter details from a single web page
        page_links, _ , _ = get_fighter_links(page_url)
        
        # Add page_data to master_df
        master_links = master_links + page_links
        
    return master_links

In [168]:
all_fighter_links = get_all_fighter_links(url_base, url_page)

In [173]:
len(all_fighter_links)

3948

## Get Advanced Fighter Details

In [177]:
print(all_fighter_links[1])

http://www.ufcstats.com/fighter-details/15df64c02b6b0fde


In [None]:
def get_advanced_fighter_details(url):

    # Send get request to url to get html text and find the table/or entire html text on the webpage.
    table_body, _ = get_table_body(url)
    
    return table_body