# In [None]:
# Updated as of 9/6/2024
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import csv


# Headers sent with every HTTP request issued by get_page_content.
# The User-Agent string identifies the client as a desktop Chrome browser
# (presumably so the site serves its regular HTML pages; TODO confirm this is required).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}


def get_page_content(url, retries=3, timeout=10):
    """
    Retrieve and parse HTML content from a URL, retrying transient failures.

    Connection errors, timeouts and 5xx responses are retried. A malformed URL
    or a 4xx response will not succeed on a second try, so those give up at once.

    Parameters:
    - url (str): The URL from which to fetch content.
    - retries (int): Maximum number of attempts (at least one attempt is always made).
    - timeout (float): Seconds to wait for the server on each attempt, so a stalled
      connection cannot hang the scrape forever.

    Returns:
    - BeautifulSoup: Parsed HTML of the page, or None if the request fails after retries.
    """
    import time  # local import: only needed for the pause between attempts

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except (requests.exceptions.MissingSchema,
                requests.exceptions.InvalidSchema,
                requests.exceptions.InvalidURL) as e:
            # e.g. the 'N/A' placeholder link: retrying cannot help.
            print(f"Request failed: {e}")
            break
        except requests.RequestException as e:
            print(f"Request failed: {e}")
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            if response.status_code < 500:
                # Client-side errors (404, 403, ...) are not transient.
                break

        if attempt < attempts:
            time.sleep(1)  # brief pause so retries do not hammer the server

    print("All attempts failed. Returning None.")
    return None

def parse_date(date_str):
    """
    Reformat a long-form date such as 'September 06, 2024' as 'MM-DD-YYYY'.

    Parameters:
    - date_str (str): A date in '%B %d, %Y' form, or the placeholder 'N/A'.

    Returns:
    - str: The date rendered as '%m-%d-%Y'; the placeholder 'N/A' is returned unchanged.
    """
    # The placeholder marks a missing date and is never parsed.
    if date_str == 'N/A':
        return 'N/A'

    parsed = datetime.strptime(date_str, '%B %d, %Y')
    return parsed.strftime('%m-%d-%Y')

def extract_fighter_info(fighter_row):
    """
    Build the starting records for the two fighters listed in a fight row.

    Parameters:
    - fighter_row (bs4.element.Tag): Table row whose fighter anchors carry the
      'b-link b-link_style_black' class.

    Returns:
    - tuple: (fighter_a, fighter_b) dictionaries holding each fighter's name and
      link, plus an empty 'rounds_a' / 'rounds_b' list respectively.
    """
    anchors = fighter_row.find_all('a', class_='b-link b-link_style_black')
    names = [anchor.text.strip() for anchor in anchors]
    urls = [anchor['href'] for anchor in anchors]

    first = {'name': names[0], 'link': urls[0], 'rounds_a': []}
    second = {'name': names[1], 'link': urls[1], 'rounds_b': []}
    return first, second

def convert_to_inches(height_info):
    """
    Turn a feet-and-inches height string into a total number of inches.

    Parameters:
    - height_info (str): Height written as feet, an apostrophe, then inches,
      optionally followed by a double quote (for example 6'0 with a trailing quote).

    Returns:
    - dict: 'feet_inches' (normalised display string) and 'inches' (int total);
      both are 'N/A' when the string cannot be parsed.
    """
    without_quote = height_info.replace('"', '')
    pieces = without_quote.split("'")

    try:
        feet, inches = [int(piece) for piece in pieces]
    except ValueError:
        # Raised for non-numeric pieces and for anything but exactly two pieces.
        return {'feet_inches': 'N/A', 'inches': 'N/A'}

    return {'feet_inches': f"{feet}' {inches}\"", 'inches': feet * 12 + inches}

def convert_to_numerical_date(date_str):
    """
    Render an abbreviated-month date (e.g. 'Jul 19, 1987') as 'MM-DD-YYYY'.

    Parameters:
    - date_str (str): The date string to convert, in the format '%b %d, %Y'.

    Returns:
    - dict: {'numerical_dob': <formatted date>}, or {'numerical_dob': 'N/A'}
      when the string does not match the expected format.
    """
    try:
        formatted = datetime.strptime(date_str, '%b %d, %Y').strftime('%m-%d-%Y')
    except ValueError:
        formatted = 'N/A'
    return {'numerical_dob': formatted}

def extract_strikes_data(row, index):
    """
    Extracts detailed strikes data for a fighter from a specified row of the table.

    Each statistic is read from the <p> elements of one table column; the
    fighter's value is the element at position `index`. A statistic whose column
    has no element at that position is reported as 'N/A' instead of raising.

    Parameters:
    - row (bs4.element.Tag): The BeautifulSoup tag of the row from which to extract data.
    - index (int): The index indicating which fighter's data to extract (0 for fighter_a, 1 for fighter_b).

    Returns:
    - dict: Dictionary with the keys head_strikes, body_strikes, leg_strikes,
      distance_strikes, clinch_strikes and ground_strikes.
    """
    # Statistic name -> 1-based column position used in the nth-of-type selector.
    columns = {
        'head_strikes': 4,
        'body_strikes': 5,
        'leg_strikes': 6,
        'distance_strikes': 7,
        'clinch_strikes': 8,
        'ground_strikes': 9,
    }

    strikes = {}
    for key, column in columns.items():
        cells = row.select(f'.b-fight-details__table-col:nth-of-type({column}) p')
        try:
            strikes[key] = cells[index].text.strip()
        except IndexError:
            # Column missing entirely, or it holds fewer entries than `index` needs.
            strikes[key] = 'N/A'
    return strikes

def extract_fighter_data(row, index):
    """
    Extracts and returns detailed statistics of a fighter from a specific row in the table.

    The name comes from the anchors in the left-aligned column; every other
    statistic comes from the <p> elements of a numbered column. In each case the
    fighter's value is the element at position `index`, and a value that is not
    present at that position is reported as 'N/A' instead of raising.

    Parameters:
    - row (bs4.element.Tag): The row from which to extract data.
    - index (int): Index to specify the fighter (0 for the first, 1 for the second).

    Returns:
    - dict: Dictionary with the keys name, knockdowns, sig_strikes, total_strikes,
      takedowns, sub_attempts, reversals and control_time.
    """
    selectors = {'name': '.b-fight-details__table-col.l-page_align_left a'}

    # Statistic name -> 1-based column position. Columns 4 and 7 are skipped
    # because this function does not read them.
    stat_columns = {
        'knockdowns': 2,
        'sig_strikes': 3,
        'total_strikes': 5,
        'takedowns': 6,
        'sub_attempts': 8,
        'reversals': 9,
        'control_time': 10,
    }
    for key, column in stat_columns.items():
        selectors[key] = f'.b-fight-details__table-col:nth-of-type({column}) p'

    stats = {}
    for key, selector in selectors.items():
        matches = row.select(selector)
        try:
            stats[key] = matches[index].text.strip()
        except IndexError:
            # Nothing matched, or fewer entries than `index` needs.
            stats[key] = 'N/A'
    return stats

def get_fight_link_and_winner(fight_row):
    """
    Work out the fight link, the winner and a provisional method of victory for a row.

    A green flag anchor means the row has a winner: the first fighter anchor in
    the row is taken to be that winner. Otherwise the bordered flag anchor, if
    any, supplies the link and no winner is recorded.

    Parameters:
    - fight_row (bs4.element.Tag): BeautifulSoup tag containing the row with fight details.

    Returns:
    - tuple: (fight_link, winner, method_of_victory), where method_of_victory is
      'NC' when no link is available and the placeholder 'TBD' otherwise.
    """
    green_flag = fight_row.find('a', class_='b-flag b-flag_style_green')
    if green_flag:
        link = green_flag['href']
        # The first fighter anchor in the row is assumed to be the winner.
        first_fighter = fight_row.find('a', class_='b-link b-link_style_black')
        return link, first_fighter.text.strip(), 'TBD'

    bordered_flag = fight_row.find('a', class_='b-flag b-flag_style_bordered')
    link = 'N/A'
    if bordered_flag:
        link = bordered_flag['href']

    if link == 'N/A':
        method = 'NC'
    else:
        method = 'TBD'
    return link, 'N/A', method

def extract_round_data(row_fighter_data, row_strikes_data, fighter_a_name, fighter_b_name, fight_info):
    """
    Collect one round of statistics for both fighters and append it to fight_info.

    The general statistics and the strike breakdown of each fighter are merged
    into a single dictionary. Whichever fighter is listed first in the row is
    filed under fighter A only if the name matches fighter_a_name; otherwise the
    two entries are swapped.

    Parameters:
    - row_fighter_data (bs4.element.Tag): The row containing fighter-specific data.
    - row_strikes_data (bs4.element.Tag): The row containing strike-specific data.
    - fighter_a_name (str): Name of Fighter A.
    - fighter_b_name (str): Name of Fighter B (accepted but not consulted).
    - fight_info (dict): The dictionary containing fight information to update.
    """
    totals = [extract_fighter_data(row_fighter_data, slot) for slot in (0, 1)]
    breakdowns = [extract_strikes_data(row_strikes_data, slot) for slot in (0, 1)]
    first_listed, second_listed = (
        {**general, **strikes} for general, strikes in zip(totals, breakdowns)
    )

    # The row order is not guaranteed to match the A/B order of the fight record.
    if first_listed['name'] != fighter_a_name:
        first_listed, second_listed = second_listed, first_listed

    fight_info['fighter_a']['rounds_a'].append(first_listed)
    fight_info['fighter_b']['rounds_b'].append(second_listed)

def extract_victory_and_round_data(fight_soup, fight_info):
    """
    Copy the method of victory, round, time, time format and referee into fight_info.

    The round, time and time-format values are read from consecutive
    'b-fight-details__text-item' elements, and the referee from the span that
    follows them. The walk stops at the first element that is missing or has no
    contents, leaving the remaining keys untouched.

    Parameters:
    - fight_soup (BeautifulSoup): Parsed HTML of the fight page.
    - fight_info (dict): Dictionary containing fight information to update.
    """
    item_class = 'b-fight-details__text-item'

    method_tag = fight_soup.find('i', style='font-style: normal')
    if method_tag:
        fight_info['method_of_victory'] = method_tag.text.strip()

    round_tag = fight_soup.find('i', class_=item_class)
    if not (round_tag and round_tag.contents):
        return
    # The value is the last child of the element.
    fight_info['round_of_victory'] = round_tag.contents[-1].strip()

    time_tag = round_tag.find_next('i', class_=item_class)
    if not (time_tag and time_tag.contents):
        return
    fight_info['time_in_ROV'] = time_tag.contents[-1].strip()

    format_tag = time_tag.find_next('i', class_=item_class)
    if not (format_tag and format_tag.contents):
        return
    fight_info['time_format'] = format_tag.contents[-1].strip()

    referee_tag = format_tag.find_next('span', class_='')
    if referee_tag:
        fight_info['referee'] = referee_tag.text.strip()

def _per_round_rows(sections, position):
    """Return the table rows of the section at `position`, or [] if that section or its table is absent."""
    if position >= len(sections):
        return []
    table = sections[position].find('table', class_='b-fight-details__table')
    if table is None:
        return []
    return table.select('tbody > tr.b-fight-details__table-row')


def scrape_fight_info(fight_info):
    """
    Retrieves detailed fight information including rounds and results from the fight page,
    then updates the fight_info dictionary.

    Every detail key (weight_class, title_fight, gender, method_of_victory,
    round_of_victory, time_in_ROV, time_format, referee) is given an 'N/A'
    default before the page is requested, so the record is complete even when
    the page cannot be retrieved. Pages that lack the per-round tables simply
    contribute no round data.

    Parameters:
    - fight_info (dict): Dictionary containing basic information about the fight and links.
    """
    # Fill in defaults first: an early return must not leave keys missing.
    for key in ('weight_class', 'title_fight', 'gender', 'method_of_victory',
                'round_of_victory', 'time_in_ROV', 'time_format', 'referee'):
        fight_info.setdefault(key, 'N/A')

    fight_url = fight_info['fight_link']
    fight_soup = get_page_content(fight_url)

    if not fight_soup:
        print(f"Failed to retrieve the fight page for {fight_info['fighter_a']['name']} vs {fight_info['fighter_b']['name']}. Skipping...")
        return

    weight_class_element = fight_soup.find('i', class_='b-fight-details__fight-title')
    weight_class = weight_class_element.text.strip() if weight_class_element else 'N/A'

    # Updating fight_info with the obtained data
    fight_info.update({
        'weight_class': weight_class,
        'title_fight': 'title' in weight_class.lower(),
        'gender': 'women' if 'women' in weight_class.lower() else 'men',
        'round_of_victory': 'N/A',
        'time_in_ROV': 'N/A',
        'time_format': 'N/A',
        'referee': 'N/A'
    })

    if fight_info['method_of_victory'] != 'NC':  # Only update if not a 'No Contest'
        extract_victory_and_round_data(fight_soup, fight_info)

    # Parsing round information and strikes data: the third matching section is
    # read for general statistics and the fifth for the strike breakdown.
    sections = fight_soup.select('.b-fight-details__section.js-fight-section')
    rows_fighter_data = _per_round_rows(sections, 2)
    rows_strikes_data = _per_round_rows(sections, 4)

    for row_fighter_data, row_strikes_data in zip(rows_fighter_data, rows_strikes_data):
        extract_round_data(row_fighter_data, row_strikes_data, fight_info['fighter_a']['name'], fight_info['fighter_b']['name'], fight_info)

    print(f"\nFight Summary for {fight_info['fighter_a']['name']} vs {fight_info['fighter_b']['name']}:")
    print(f"Winner: {fight_info['winner']}")
    print(f"Method of Victory: {fight_info['method_of_victory']}")
    print(f"Round of Victory: {fight_info['round_of_victory']}")
    print(f"Time of Victory: {fight_info['time_in_ROV']}")
    print(f"Time Format: {fight_info['time_format']}")
    print(f"Weight Class: {fight_info['weight_class']}")
    print(f"Referee: {fight_info['referee']}")
    print(f"Title Fight: {fight_info['title_fight']}")
    print(f"Gender: {fight_info['gender']}")
    print(f"Fight Link: {fight_info['fight_link']}")

    # Print rounds information for both fighters
    print("\nRounds Information:")
    rounds_a = fight_info['fighter_a']['rounds_a']
    rounds_b = fight_info['fighter_b']['rounds_b']
    for i, (round_a, round_b) in enumerate(zip(rounds_a, rounds_b), start=1):
        print(f"\nRound {i}:")
        print(f"Fighter A: {round_a}")
        print(f"\nFighter B: {round_b}")

def scrape_fighter_info(fighter_info):
    """
    Scrapes and stores individual fighter details such as height, reach, and date of birth from the fighter's page.

    Adds the keys 'height' (dict with 'feet_inches' and 'inches'), 'reach' and
    'dob' to fighter_info; any detail not found on the page is stored as 'N/A'.
    If the page itself cannot be retrieved, fighter_info is left unchanged.

    Parameters:
    - fighter_info (dict): Dictionary containing the fighter's name and link.
    """
    fighter_url = fighter_info['link']
    fighter_soup = get_page_content(fighter_url)

    if not fighter_soup:
        print(f"Failed to retrieve the fighter page for {fighter_info['name']}. Skipping...")
        return

    # A page without the details list is treated as having no details at all
    # rather than raising on the missing element.
    details_list = fighter_soup.find('ul', class_='b-list__box-list')
    details_items = details_list.find_all('li', class_='b-list__box-list-item') if details_list else []

    raw = {}
    for item in details_items:
        title = item.find('i', class_='b-list__box-item-title')
        if not title:
            continue
        for label in ('Height', 'Reach', 'DOB'):
            if label in title.text:
                # The value is the last child of the list item, after the title.
                raw[label] = item.contents[-1].strip()
                break

    height_info = raw.get('Height')
    reach_info = raw.get('Reach')
    dob_info = raw.get('DOB')

    if height_info:
        fighter_info['height'] = {'feet_inches': height_info, 'inches': convert_to_inches(height_info)['inches']}
    else:
        fighter_info['height'] = {'feet_inches': 'N/A', 'inches': 'N/A'}

    fighter_info['reach'] = reach_info if reach_info else 'N/A'
    fighter_info['dob'] = convert_to_numerical_date(dob_info)['numerical_dob'] if dob_info else 'N/A'

    print(f"\nFighter Info for {fighter_info['name']}:")
    print(f"Height: {fighter_info['height']['feet_inches']}")
    print(f"Reach: {fighter_info['reach']}")
    print(f"Date of Birth: {fighter_info['dob']}")
    print(f"Fighter Link: {fighter_info['link']}")

def write_to_csv(events_list, filename='event_masterlist.csv'):
    """
    Writes collected event data to a CSV file, including detailed round statistics for each fight.

    One row is written per fight, with columns for up to five rounds per fighter.
    Fights whose detail page was never scraped (and therefore lack keys such as
    'weight_class') are written with 'N/A' in those columns instead of raising.

    Parameters:
    - events_list (list): List of dictionaries, each representing an event and its associated fight data.
      If None, nothing is written and any existing file is left untouched.
    - filename (str): Path of the CSV file to create or overwrite.
    """
    if events_list is None:
        print("No event data to write.")
        return

    rounds = 5

    # (column-name suffix, key in the per-round statistics dict).
    # NOTE: the 'sub_attemps' column name is misspelled in the existing output
    # format; it is kept as-is so consumers of the CSV keep working.
    stat_columns = [
        ('knockdowns', 'knockdowns'),
        ('sig_strikes', 'sig_strikes'),
        ('total_strikes', 'total_strikes'),
        ('takedowns', 'takedowns'),
        ('sub_attemps', 'sub_attempts'),
        ('reversals', 'reversals'),
        ('control_time', 'control_time'),
        ('head_strikes', 'head_strikes'),
        ('body_strikes', 'body_strikes'),
        ('leg_strikes', 'leg_strikes'),
        ('distance_strikes', 'distance_strikes'),
        ('clinch_strikes', 'clinch_strikes'),
        ('ground_strikes', 'ground_strikes'),
    ]

    # Named `columns` so the module-level HTTP `headers` dict is not shadowed.
    columns = [
        "event_name", "event_date", "event_location", "winner",
        "fighter_a_name", "fighter_b_name", "weightclass", "method_of_victory",
        "round_of_victory", "time_of_victory", "time_format", "referee",
        "title_fight", "gender", "fight_link", "fighter_a_height", "fighter_a_reach", "fighter_a_dob",
        "fighter_b_height", "fighter_b_reach", "fighter_b_dob"
    ]
    for round_num in range(1, rounds + 1):
        for side in ('a', 'b'):
            columns.extend(f"rnd_{round_num}_{side}_{suffix}" for suffix, _ in stat_columns)

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)

        for event in events_list:
            for fight in event['fights']:
                fighter_a = fight['fighter_a']
                fighter_b = fight['fighter_b']
                row = [
                    event['name'], event['date'], event['location'], fight['winner'],
                    fighter_a['name'], fighter_b['name'],
                    fight.get('weight_class', 'N/A'), fight.get('method_of_victory', 'N/A'),
                    fight.get('round_of_victory', 'N/A'), fight.get('time_in_ROV', 'N/A'),
                    fight.get('time_format', 'N/A'), fight.get('referee', 'N/A'),
                    fight.get('title_fight', 'N/A'), fight.get('gender', 'N/A'),
                    fight['fight_link'],
                    fighter_a.get('height', {}).get('inches', 'NaN'),
                    fighter_a.get('reach', 'NaN'), fighter_a.get('dob', 'NaN'),
                    fighter_b.get('height', {}).get('inches', 'NaN'),
                    fighter_b.get('reach', 'NaN'), fighter_b.get('dob', 'NaN'),
                ]

                # Adding round details for each fighter, padding missing rounds with NaN
                for round_index in range(rounds):
                    for side in ('a', 'b'):
                        side_rounds = fight[f'fighter_{side}'].get(f'rounds_{side}', [])
                        if round_index < len(side_rounds):
                            round_stats = side_rounds[round_index]
                            row.extend(round_stats.get(key, 'NaN') for _, key in stat_columns)
                        else:
                            row.extend(['NaN'] * len(stat_columns))

                writer.writerow(row)

def extract_event_info(event_element):
    """
    Gather an event's name, link, date and location, and fetch the event's page.

    Parameters:
    - event_element (bs4.element.Tag): The BeautifulSoup tag for the event.

    Returns:
    - tuple: (event_info, event_soup). event_info is a dictionary with the keys
      name, link, date (formatted by parse_date), location and an empty 'fights'
      list; event_soup is whatever get_page_content returns for the event link.
    """
    name = event_element.text.strip()
    link = event_element['href']

    # Date and location are looked up in the elements that follow the event anchor.
    date_text = event_element.find_next('span', class_='b-statistics__date').text.strip()
    location = event_element.find_next(
        'td',
        class_='b-statistics__table-col b-statistics__table-col_style_big-top-padding',
    ).text.strip()

    event_info = dict(
        name=name,
        link=link,
        date=parse_date(date_text),
        location=location,
        fights=[],  # filled in later, one entry per fight
    )

    return event_info, get_page_content(event_info['link'])

def extract_fight_info(fight_row):
    """
    Assemble the starting record for one fight from its table row.

    Parameters:
    - fight_row (bs4.element.Tag): The BeautifulSoup tag containing the row with fight details.

    Returns:
    - dict: The fight link, winner and method of victory as reported by
      get_fight_link_and_winner, plus the two fighter records produced by
      extract_fighter_info under 'fighter_a' and 'fighter_b'.
    """
    corner_a, corner_b = extract_fighter_info(fight_row)
    link, victor, method = get_fight_link_and_winner(fight_row)

    fight = dict(fight_link=link, winner=victor, method_of_victory=method)
    fight['fighter_a'] = corner_a
    fight['fighter_b'] = corner_b
    return fight

def main():
    """
    Main function to orchestrate the web scraping process, including retrieving event data,
    scraping fight details and individual fighter stats, and finally writing all data to a CSV file.

    The work happens in two passes: first every event page is fetched and its
    fights are listed, then each fight and both of its fighters are scraped in
    detail. Whatever has been collected is written to CSV when the function
    finishes, whether it completed normally or stopped on an error.

    Returns:
    - list: List of dictionaries with detailed event and fight information (used for writing to CSV).
      Always a list; it is empty when the events listing could not be retrieved.
    """
    # Bound before the try block so the except/finally paths and the final
    # return can always refer to it.
    events_list = []

    try:
        # URL of the main page listing all completed events
        url = 'http://www.ufcstats.com/statistics/events/completed?page=all'
        soup = get_page_content(url)

        # Nothing to do if the main page content could not be retrieved
        if not soup:
            return events_list

        # First pass: one request per event link found on the main page
        for event_element in soup.find_all('a', class_='b-link b-link_style_black'):
            event_info, event_soup = extract_event_info(event_element)

            # Events whose page could not be retrieved are left out
            if not event_soup:
                continue

            # The first matching row is skipped (the original author treats it as a header row)
            fight_rows = event_soup.find_all('tr', class_='b-fight-details__table-row')[1:]
            for fight_row in fight_rows:
                event_info['fights'].append(extract_fight_info(fight_row))

            events_list.append(event_info)

        # Second pass: scrape detailed information for each fight and fighter
        for event_info in events_list:
            print(f"\nEvent: {event_info['name']}")
            print(f"Date: {event_info['date']}")
            print(f"Location: {event_info['location']}")

            for fight_info in event_info['fights']:
                scrape_fight_info(fight_info)
                scrape_fighter_info(fight_info['fighter_a'])
                scrape_fighter_info(fight_info['fighter_b'])

    except Exception as e:
        # Top-level boundary: report the problem and keep what was collected so far.
        print(f"An error occurred: {e}")
    finally:
        # Always attempt to save data to CSV, even if an error occurs
        if events_list:
            print("Saving data to CSV due to an error or normal completion.")
            write_to_csv(events_list)

    # Return the (possibly partial) event list, useful for unit testing or further processing
    return events_list

if __name__ == "__main__":
    # Script entry point, calls the main function to start the process.
    # main() saves the collected data to CSV itself before returning, so no
    # second write is done here: repeating it would only rewrite the same file,
    # and a second write after a failed run could replace the data main() saved.
    events_list = main()

    
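# Small clerical note:
# There is a delay before any output appears. You must run the cell and then wait, sometimes
# upwards of 5 minutes, for the scraping output to begin. The most likely cause is that main()
# first downloads the page of every completed event (one HTTP request per event) and only then
# starts printing and scraping fight details. I couldn't find a quick fix for this, but it does
# work. I promise. You just have to be patient. I apologize for any inconvenience.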