# Web Scraping - UFC.com

## Notebook Setup

In [170]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

## Get Event URLs

In [171]:
# UFC Completed events URL to be scraped
events_completed_url = "http://www.ufcstats.com/statistics/events/completed?page=all"

In [172]:
def get_table_body(url: str, timeout: float = 30):
    """Download a page and locate the first table on it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall a
            scrape that walks hundreds of pages.

    Returns:
        A ``(table_body, soup)`` tuple: the first ``<table>`` element found
        on the page (``None`` when the page has no table) and the parsed
        document it came from.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Send get request to URL provided
    page = requests.get(url, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of parsing an error page as event data
    page.raise_for_status()

    # Parse the html text of the page
    soup = bs(page.text, 'lxml')

    # Find the table of data on the page
    table_body = soup.find('table')

    return table_body, soup

In [173]:
# Get the table HTML from the webpage
table_body, _ = get_table_body(events_completed_url)

In [174]:
def get_event_links(table_body):
    """Collect the href of the first link in every table cell.

    Walks each ``<tr>`` of ``table_body``, looks at each ``<td>`` inside it,
    and keeps the ``href`` of the first ``<a>`` found in that cell. Cells
    without a link are ignored. Links are returned in document order.
    """
    # Lazily yield the first anchor (or None) of every cell, row by row
    first_anchors = (
        cell.find('a')
        for table_row in table_body.find_all('tr')
        for cell in table_row.find_all('td')
    )

    # Keep only the cells that actually contained a link
    return [anchor['href'] for anchor in first_anchors if anchor]

In [175]:
# Create list of web links
event_links = get_event_links(table_body)
len(event_links)

641

## Get Event Details

In [220]:
def _parse_event_header(soup):
    """Return ``(event_name, date, location)`` for a parsed event page; a field that is absent is ``None``."""
    # Extract Event Name
    event_name_tag = soup.find('h2')
    event_name = event_name_tag.get_text(strip=True) if event_name_tag is not None else None

    # Start from None for every page so a missing field can never inherit
    # the value scraped from a previous event
    date = None
    location = None

    # Extract Event Date & Location
    for item in soup.find_all('li', {'class': 'b-list__box-list-item'}):
        title = item.find('i', {'class': 'b-list__box-item-title'})
        if title is None:
            continue
        title_text = title.get_text(strip=True).lower()
        if 'date' in title_text:
            date = item.get_text(strip=True).replace('Date:', '').strip()
        elif 'location' in title_text:
            location = item.get_text(strip=True).replace('Location:', '').strip()

    return event_name, date, location


def _parse_fight_cells(tds):
    """Turn the cells of one fight row (at least ten ``<td>`` elements) into a dict of fight fields."""
    # Extract names
    names = [a.get_text(strip=True) for a in tds[1].find_all('a')]

    # Extract Winner and Loser: the first listed fighter gets the text of the
    # first cell, the second is recorded as 'lose'.
    # NOTE(review): presumably draws / no-contests need different handling
    # here -- confirm against a page containing one.
    outcomes = [tds[0].text.strip(), 'lose']
    result = {name: outcomes[i] for i, name in enumerate(names)}

    # Extract stats: each of cells 2-5 holds one <p> per fighter, so pick the
    # i-th paragraph of every stat cell for the i-th fighter
    stats = [[p.get_text(strip=True) for p in td.find_all('p')] for td in tds[2:6]]

    return {
        'names': names,
        'result': result,
        'stats': {name: [stat[i] for stat in stats] for i, name in enumerate(names)},
        'weight_class': tds[6].get_text(strip=True),
        'method': tds[7].get_text(strip=True),
        # Index explicitly so rows with extra trailing cells do not break unpacking
        'round': tds[8].get_text(strip=True),
        'time': tds[9].get_text(strip=True),
    }


def get_all_events_details(event_links):
    """Scrape the fight-by-fight details of every completed event.

    Each event page is fetched with ``get_table_body`` and every table row
    with at least ten cells is treated as one fight.

    Args:
        event_links: Event page URLs. The first entry is skipped because it
            is the next upcoming event rather than a completed one.

    Returns:
        A dict mapping each event URL to a list of fight dicts with the keys
        ``event_name``, ``event_details`` (``date`` and ``location``),
        ``names``, ``result``, ``stats``, ``weight_class``, ``method``,
        ``round`` and ``time``. Events without any fight row are omitted.
    """
    results = {}

    # Completed UFC events to be scraped
    # Skip the first event because it is actually the next upcoming event
    for event_link in event_links[1:]:

        # Only the parsed document is needed; rows are searched page-wide
        _, soup = get_table_body(event_link)

        # Event-level fields are identical for every fight on the page, so
        # parse them once per event instead of once per row
        event_name, date, location = _parse_event_header(soup)

        # Iterate over each row
        for row in soup.find_all('tr'):
            tds = row.find_all('td')

            # Check if the row has enough columns (skips header rows)
            if len(tds) < 10:
                continue

            row_data = {
                'event_name': event_name,
                # Fresh dict per fight so rows never share mutable state
                'event_details': {'date': date, 'location': location},
            }
            row_data.update(_parse_fight_cells(tds))

            # Create the event's list on first use, then append the fight
            results.setdefault(event_link, []).append(row_data)

    return results

In [246]:
# Scrape the fight details of every event link (one page request per event)
all_event_details = get_all_events_details(event_links)

In [243]:
# Convert the dictionary of all event details to a dataframe
def flatten_nested_dict(data):
    """Flatten ``{event_link: [fight, ...]}`` into a list of one-level dicts.

    Every fight becomes one dict that carries its event link and has the
    nested ``event_details`` lifted into top-level ``date`` and ``location``
    keys. Fights keep the order in which they appear in ``data``.
    """
    return [
        {
            'event_link': link,
            'event_name': fight['event_name'],
            'date': fight['event_details']['date'],
            'location': fight['event_details']['location'],
            'names': fight['names'],
            'result': fight['result'],
            'stats': fight['stats'],
            'weight_class': fight['weight_class'],
            'method': fight['method'],
            'round': fight['round'],
            'time': fight['time'],
        }
        for link, fights in data.items()
        for fight in fights
    ]

# Flatten the per-event fight lists into one record per fight, then load
# the records into a DataFrame (one row per fight)
flattened_data = flatten_nested_dict(all_event_details)
df = pd.DataFrame(flattened_data)

In [244]:
# Show the DataFrame dimensions and preview its first rows
print(df.shape)
df.head()

Unnamed: 0,event_link,event_name,date,location,names,result,stats,weight_class,method,round,time
0,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA","[Cory Sandhagen, Marlon Vera]","{'Cory Sandhagen': 'win', 'Marlon Vera': 'lose'}","{'Cory Sandhagen': ['0', '128', '3', '0'], 'Ma...",Bantamweight,S-DEC,5,5:00
1,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA","[Holly Holm, Yana Santos]","{'Holly Holm': 'win', 'Yana Santos': 'lose'}","{'Holly Holm': ['0', '32', '4', '0'], 'Yana Sa...",Women's Bantamweight,U-DEC,3,5:00
2,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA","[Nate Landwehr, Austin Lingo]","{'Nate Landwehr': 'win', 'Austin Lingo': 'lose'}","{'Nate Landwehr': ['0', '64', '1', '1'], 'Aust...",Featherweight,SUBRear Naked Choke,2,4:11
3,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA","[Maycee Barber, Andrea Lee]","{'Maycee Barber': 'win', 'Andrea Lee': 'lose'}","{'Maycee Barber': ['0', '48', '2', '0'], 'Andr...",Women's Flyweight,S-DEC,3,5:00
4,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA","[Albert Duraev, Chidi Njokuani]","{'Albert Duraev': 'win', 'Chidi Njokuani': 'lo...","{'Albert Duraev': ['0', '45', '2', '0'], 'Chid...",Middleweight,S-DEC,3,5:00


In [252]:
def split_rows(df):
    """Expand each fight row into one row per fighter.

    Args:
        df: DataFrame with one row per fight. Each row needs a ``names``
            list plus ``result`` and ``stats`` dicts keyed by those names;
            every ``stats`` value is a list of four entries.

    Returns:
        A new DataFrame with one row per fighter. ``names`` is replaced by a
        scalar ``name`` column, ``result`` holds that fighter's own result,
        and ``stats`` is split into the ``KD``, ``STR``, ``TD`` and ``SUB``
        columns. An input without any fighter yields an empty DataFrame with
        those same columns.
    """
    stat_columns = ['KD', 'STR', 'TD', 'SUB']

    # Collect every per-fighter record first and build the DataFrame once;
    # concatenating inside the loop re-copies the accumulated frame each time
    records = []
    for _, row in df.iterrows():
        shared = row.to_dict()
        names = shared.pop('names')
        results_by_name = shared['result']
        stats_by_name = shared['stats']

        for name in names:
            record = dict(shared)
            record['name'] = name
            # Replace the per-fight dicts with this fighter's own values
            record['result'] = results_by_name[name]
            record['stats'] = stats_by_name[name]
            records.append(record)

    if not records:
        # Nothing to expand: return the expected schema instead of failing
        # on the missing 'stats' column
        kept = [col for col in df.columns if col not in ('names', 'stats')]
        return pd.DataFrame(columns=kept + ['name'] + stat_columns)

    two_row_df = pd.DataFrame(records)

    # Split the 'stats' column into separate columns and update the DataFrame
    two_row_df[stat_columns] = pd.DataFrame(two_row_df['stats'].tolist(), index=two_row_df.index)

    # Drop the 'stats' column
    return two_row_df.drop(columns='stats')

In [253]:
# Expand every fight into one row per fighter and preview the first rows
two_row_df = split_rows(df)
two_row_df.head()

Unnamed: 0,event_link,event_name,date,location,result,weight_class,method,round,time,name,KD,STR,TD,SUB
0,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA",win,Bantamweight,S-DEC,5,5:00,Cory Sandhagen,0,128,3,0
1,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA",lose,Bantamweight,S-DEC,5,5:00,Marlon Vera,0,58,0,1
2,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA",win,Women's Bantamweight,U-DEC,3,5:00,Holly Holm,0,32,4,0
3,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA",lose,Women's Bantamweight,U-DEC,3,5:00,Yana Santos,0,21,0,0
4,http://www.ufcstats.com/event-details/aec273fc...,UFC Fight Night: Vera vs. Sandhagen,"March 25, 2023","San Antonio, Texas, USA",win,Featherweight,SUBRear Naked Choke,2,4:11,Nate Landwehr,0,64,1,1


## Main Script