# Web Scraping - UFCStats.com

## Notebook Setup

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

## Get Event URLs

In [2]:
# URL of the UFC Stats "completed events" listing that is scraped for event links
# NOTE(review): presumably `page=all` returns every event on a single page - confirm
events_completed_url = "http://www.ufcstats.com/statistics/events/completed?page=all"

In [3]:
def get_table_body(url: str, timeout: float = 30):
    """Download a web page and locate the first table on it.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(table_body, soup)`` tuple. ``table_body`` is the first
        ``<table>`` element on the page (``None`` when the page has no
        table) and ``soup`` is the parsed page as a whole.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """

    # Without an explicit timeout requests waits indefinitely on a stalled server
    page = requests.get(url, timeout=timeout)

    # Stop on error responses instead of parsing an error page as if it were data
    page.raise_for_status()

    # Parse the html text of the page
    soup = bs(page.text, 'lxml')

    # Find the first table of data on the page
    table_body = soup.find('table')

    return table_body, soup

In [30]:
# Get the table HTML from the completed-events page; the parsed page
# (second return value) is not needed here, so it is discarded
table_body, _ = get_table_body(events_completed_url)

In [35]:
def get_event_links(table_body):
    """Collect the link target of the first anchor in every table cell.

    Walks each 'tr' of ``table_body`` and each 'td' inside it, and returns
    the 'href' values in document order. Cells without an anchor are skipped.
    """

    # Lazily yield the first 'a' tag (or None) of every cell, row by row
    anchors = (
        cell.find('a')
        for tr in table_body.find_all('tr')
        for cell in tr.find_all('td')
    )

    return [anchor['href'] for anchor in anchors if anchor]

In [37]:
# Create the list of event links, then display how many were collected
event_links = get_event_links(table_body)
len(event_links)

641

## Get Event Details

In [42]:
# Event URL to be scraped; a single event for now, to be replaced by a loop
# over event_links
# NOTE(review): index 1 rather than 0 is used - presumably to skip the first
# listed event; confirm
event_link = event_links[1]

In [47]:
# Get the table HTML and the parsed page for this event, then print the
# table markup to inspect its structure
table_body, soup = get_table_body(event_link)
print(table_body)

<table class="b-fight-details__table b-fight-details__table_style_margin-top b-fight-details__table_type_event-details js-fight-table" style="display: table;">
<thead class="b-fight-details__table-head">
<tr class="b-fight-details__table-row">
<th class="b-fight-details__table-col">
    W/L
  </th>
<th class="b-fight-details__table-col l-page_align_left">
    Fighter
  </th>
<th class="b-fight-details__table-col">
    Kd
  </th>
<th class="b-fight-details__table-col">
    Str
  </th>
<th class="b-fight-details__table-col">
    Td
  </th>
<th class="b-fight-details__table-col">
    Sub
  </th>
<th class="b-fight-details__table-col l-page_align_left">
    Weight class
  </th>
<th class="b-fight-details__table-col l-page_align_left">
    Method
  </th>
<th class="b-fight-details__table-col">
    Round
  </th>
<th class="b-fight-details__table-col">
    Time
  </th>
</tr>
</thead>
<tbody class="b-fight-details__table-body">
<tr class="b-fight-details__table-row b-fight-details__table-row__

In [45]:
# Print the stripped text of every 'td' cell, one list per table row.
# Rows that contain no 'td' cells (such as a header row of 'th') print [].
for row in table_body.find_all('tr'):
    cells = [cell.text.strip() for cell in row.find_all('td')]
    print(cells)

[]
['win', 'Cory Sandhagen\n            \n\n\n\n              Marlon Vera', '0\n          \n\n            \n            0', '128\n\n          \n\n\n            \n            58', '3\n          \n\n            \n            0', '0\n          \n\n            \n            1', 'Bantamweight', 'S-DEC', '5', '5:00']
['win', 'Holly Holm\n            \n\n\n\n              Yana Santos', '0\n          \n\n            \n            0', '32\n\n          \n\n\n            \n            21', '4\n          \n\n            \n            0', '0\n          \n\n            \n            0', "Women's Bantamweight", 'U-DEC', '3', '5:00']
['win', 'Nate Landwehr\n            \n\n\n\n              Austin Lingo', '0\n          \n\n            \n            0', '64\n\n          \n\n\n            \n            48', '1\n          \n\n            \n            0', '1\n          \n\n            \n            0', 'Featherweight', 'SUB\n\n      \n\n        Rear Naked Choke', '2', '4:11']
['win', 'Maycee Barber\n    

In [50]:
# Find the 'td' element containing the names. select_one returns only the
# first match on the page, or None when nothing matches.
names_td = soup.select_one('.l-page_align_left[style="width:100px"]')

# Extract the names from the 'a' elements (no matching cell means no names)
names = (
    [a.get_text(strip=True) for a in names_td.find_all('a')]
    if names_td is not None
    else []
)

# Find the elements containing the stats. The selector is page-wide, so it
# matches header cells and the cells of every row, not just one fight.
stats_tds = soup.select('.b-fight-details__table-col:not(.l-page_align_left)')

# Extract the stats from the 'p' elements in each matched element
stats = [[p.get_text(strip=True) for p in td.find_all('p')] for td in stats_tds]

# Combine the names and stats into a dictionary; an element with fewer 'p'
# entries than names contributes an empty string
data = {
    name: [stat[i] if len(stat) > i else '' for stat in stats]
    for i, name in enumerate(names)
}

pprint(data)

{'Cory Sandhagen': ['',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    'win',
                    '0',
                    '128',
                    '3',
                    '0',
                    '5',
                    '5:00',
                    'win',
                    '0',
                    '32',
                    '4',
                    '0',
                    '3',
                    '5:00',
                    'win',
                    '0',
                    '64',
                    '1',
                    '1',
                    '2',
                    '4:11',
                    'win',
                    '0',
                    '48',
                    '2',
                    '0',
                    '3',
                    '5:00',
                    'win',
                    '0',
                    '45',
                    '2',
    

In [54]:
# Find all rows
rows = soup.find_all('tr')

# Build one dictionary per fight row
results = []
for row in rows:
    tds = row.find_all('td')

    # Skip rows without the ten fight columns
    # (W/L, Fighter, Kd, Str, Td, Sub, Weight class, Method, Round, Time)
    if len(tds) < 10:
        continue

    # Fighter names are the link texts in the second column
    names = [a.get_text(strip=True) for a in tds[1].find_all('a')]

    # Kd, Str, Td and Sub columns: one 'p' entry per fighter in each cell
    stats = [[p.get_text(strip=True) for p in td.find_all('p')] for td in tds[2:6]]

    row_data = {
        'names': names,
        # A cell with fewer entries than fighters yields '' instead of raising
        'stats': {
            name: [stat[i] if i < len(stat) else '' for stat in stats]
            for i, name in enumerate(names)
        },
        'weight_class': tds[6].get_text(strip=True),
        # The method cell can hold several text pieces (e.g. 'SUB' and the
        # submission name); join them with a space so they do not run together
        'method': tds[7].get_text(' ', strip=True),
        # Index explicitly so rows with extra trailing cells do not break
        'round': tds[8].get_text(strip=True),
        'time': tds[9].get_text(strip=True),
    }

    results.append(row_data)

for result in results:
    print(result)

{'names': ['Cory Sandhagen', 'Marlon Vera'], 'stats': {'Cory Sandhagen': ['0', '128', '3', '0'], 'Marlon Vera': ['0', '58', '0', '1']}, 'weight_class': 'Bantamweight', 'method': 'S-DEC', 'round': '5', 'time': '5:00'}
{'names': ['Holly Holm', 'Yana Santos'], 'stats': {'Holly Holm': ['0', '32', '4', '0'], 'Yana Santos': ['0', '21', '0', '0']}, 'weight_class': "Women's Bantamweight", 'method': 'U-DEC', 'round': '3', 'time': '5:00'}
{'names': ['Nate Landwehr', 'Austin Lingo'], 'stats': {'Nate Landwehr': ['0', '64', '1', '1'], 'Austin Lingo': ['0', '48', '0', '0']}, 'weight_class': 'Featherweight', 'method': 'SUBRear Naked Choke', 'round': '2', 'time': '4:11'}
{'names': ['Maycee Barber', 'Andrea Lee'], 'stats': {'Maycee Barber': ['0', '48', '2', '0'], 'Andrea Lee': ['0', '39', '5', '0']}, 'weight_class': "Women's Flyweight", 'method': 'S-DEC', 'round': '3', 'time': '5:00'}
{'names': ['Albert Duraev', 'Chidi Njokuani'], 'stats': {'Albert Duraev': ['0', '45', '2', '0'], 'Chidi Njokuani': ['0'