In [123]:
# Functional -- 4/11/2024
import requests
from bs4 import BeautifulSoup
import re

# The base URL for UFC results
base_url = "https://www.ufc.com"
results_url = base_url + "/results?language_content_entity=en"

def extract_url_text_to_compare_previous_link(url):
    """
    Extracts the unique identifier from a URL, which is the part between 'ufc' and 'vs'.
    If the URL does not follow the expected format, return None.
    """
    match = re.search(r'ufc-(.*?)-vs', url)
    if match:
        return match.group(1)
    
    print("No unique identifier found")
    return None


def extract_unique_identifier(url):
    """
    Extracts a unique identifier from a UFC event URL, using the part after '/news/'.
    This helps in capturing a broader range of URLs.
    """
    match = re.search(r'/news/(.*)', url)
    if match:
        return match.group(1)
    
    print("No unique identifier found")
    return None


def find_results_links(url):
    results_links = []  # Initialize a new list for results links
    last_fight_info = None
    
    while url:
        print("Processing:", url)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        for link in soup.find_all("a", class_="c-card--grid-card-trending"):
            link_text = link.text.lower()
            link_href = link.get('href')
            if "results" in link_text and "weigh" not in link_text and "bonus" not in link_text and "episode" not in link_text and "season" not in link_text:
                current_fight_info = extract_url_text_to_compare_previous_link(link_href)
                if current_fight_info != last_fight_info:
                    full_link = base_url + link_href
                    results_links.append(full_link)
                    last_fight_info = current_fight_info
        
        load_more_button = soup.find("a", title="Load more items")
        if load_more_button:
            next_page = load_more_button.get('href')
            if next_page.startswith("?"):
                url = base_url + "/results" + next_page
            else:
                url = next_page
        else:
            url = None
            
    return results_links


def find_event_scorecard_links(results_links):
    event_scorecard_links = []
    last_identifier = None  # Track the last unique identifier
    
    for url in results_links:
        print("Processing:", url)
        # Use the new approach to extract a unique identifier
        current_identifier = extract_unique_identifier(url)
        if current_identifier != last_identifier:
            if "official-judges-scorecard" in url or "official-scorecard" in url:
                print("result_link same as scorecard_link:", url)
                event_scorecard_links.append(url)
                last_identifier = current_identifier
            else:
                print("Visiting result_link:", url)
                response = requests.get(url)
                soup = BeautifulSoup(response.content, 'html.parser')

                scorecard_link_element = soup.find("a", href=lambda href: href and "official" in href and "scorecard" in href)
                if scorecard_link_element:
                    scorecard_link = scorecard_link_element.get('href')
                    formatted_scorecard_link = scorecard_link if scorecard_link.startswith("http") else base_url + scorecard_link
                    
                    # Re-extract the identifier for a possibly different URL structure
                    new_identifier = extract_unique_identifier(formatted_scorecard_link)
                    if new_identifier != last_identifier:
                        event_scorecard_links.append(formatted_scorecard_link)
                        last_identifier = new_identifier
    
    return event_scorecard_links


def remove_redundant_links(event_scorecard_links):
    # Initialize a list to keep track of links that will be removed
    removed_event_scorecard_links = []
    
    # Initialize a new list to store the filtered links
    filtered_links = []
    
    # Use a set for efficient lookup of removed links
    links_to_remove = set()
    
    # First pass to remove links with 'search?query=official+scorecards'
    for link in event_scorecard_links:
        if 'search?query=official+scorecards' in link:
            links_to_remove.add(link)
    
    # Compare each link against all others to find redundancies
    for i, link_i in enumerate(event_scorecard_links):
        for j, link_j in enumerate(event_scorecard_links):
            # Skip comparing the link with itself
            if i == j:
                continue
            
            # If link_i is a substring of link_j, mark link_j for removal
            if link_i in link_j and link_j not in links_to_remove:
                links_to_remove.add(link_j)
    
    # Separate the links into filtered and removed based on the set
    for link in event_scorecard_links:
        if link in links_to_remove:
            removed_event_scorecard_links.append(link)
        else:
            filtered_links.append(link)
    
    return filtered_links, removed_event_scorecard_links


def add_manual_event_scorecard_links(existing_links, manual_links):
    existing_set = set(existing_links)
    combined_links = list(existing_links)
    for link in manual_links:
        if link not in existing_set:
            combined_links.append(link)
            existing_set.add(link)
    return combined_links

# Manual links identified that were missed by the scraping process
manual_event_scorecard_links = [
    "https://www.ufc.com/news/official-judges-scorecards-ufc-fight-night-ankalaev-vs-walker-2",
    "https://www.ufc.com/news/official-scorecards-ufc-fight-night-ortega-vs-rodriguez",
    "https://www.ufc.com/news/official-judges-scorecards-ufc-269-oliveira-vs-poirier-nunes-pena?language_content_entity=en",
    "https://www.ufc.com/news/official-scorecards-ufc-fight-night-sandhagen-vs-dillashaw",
    "https://www.ufc.com/news/ufc-fight-night-brunson-vs-holland-scorecards-gillespie-riddell-winners-ufc-vegas-22?language_content_entity=en",
    "https://www.ufc.com/news/ufc-257-official-scorecards-poirier-mcgregor-2-fight-island",
    "https://www.ufc.com/news/ufc-fight-island-6-official-scorecards-ortega-korean-zombie-results",
    "https://www.ufc.com/news/ufc-vegas-9-official-scorecards-sakai-overeem",
    "https://www.ufc.com/news/ufc-vegas-8-official-scorecards-fight-night-smith-rakic?language_content_entity=en",
    "https://www.ufc.com/news/ufc-vegas-7-official-scorecards-munhoz-edgar",
]



# Start the process
results_links = find_results_links(results_url)
event_scorecard_links = find_event_scorecard_links(results_links)

# Remove redundant links and capture the removed links
event_scorecard_links, removed_event_scorecard_links = remove_redundant_links(event_scorecard_links)

# Add manually identified event scorecard links
event_scorecard_links = add_manual_event_scorecard_links(event_scorecard_links, manual_event_scorecard_links)

# Print out all found event scorecard links
print("\nFound event scorecard links:")
for link in event_scorecard_links:
    print(link)
    
    
# Print out the list of removed event scorecard links
print("\nRemoved event scorecard links:")
for removed_link in removed_event_scorecard_links:
    print(removed_link)
    

# Print out the list of removed event scorecard links
print("\nManually added event scorecard links:")
for manual_link in manual_event_scorecard_links:
    print(manual_link)

Processing: https://www.ufc.com/results?language_content_entity=en
No unique identifier found
No unique identifier found
Processing: https://www.ufc.com/results?language_content_entity=en&page=1
Processing: https://www.ufc.com/results?language_content_entity=en&page=2
No unique identifier found
No unique identifier found
Processing: https://www.ufc.com/results?language_content_entity=en&page=3
Processing: https://www.ufc.com/results?language_content_entity=en&page=4
No unique identifier found
Processing: https://www.ufc.com/results?language_content_entity=en&page=5
Processing: https://www.ufc.com/results?language_content_entity=en&page=6
Processing: https://www.ufc.com/results?language_content_entity=en&page=7
Processing: https://www.ufc.com/results?language_content_entity=en&page=8
Processing: https://www.ufc.com/results?language_content_entity=en&page=9
Processing: https://www.ufc.com/results?language_content_entity=en&page=10
Processing: https://www.ufc.com/results?language_content_

In [None]:
# Functions properly 4.11.2024
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.request import urlretrieve
import img2pdf
import re

def parse_folder_name_from_event_scorecard_link(event_scorecard_link):
    """
    Parses the folder name from an event_scorecard_link.
    """
    match = re.search(r'news/(.*?)(\?|results|$)', event_scorecard_link)
    if match:
        # Replace '-' with '_' for file system compatibility
        folder_name = match.group(1).replace('-', '_').strip()
        # Remove or replace characters that are invalid in file names
        folder_name = re.sub(r'[\\/*?:"<>|]', '', folder_name)
        return folder_name
    else:
        return "No_Folder_Name_Parsed"

def download_images_from_event_scorecard_link_and_create_pdf(event_scorecard_link, event_index):
    """
    Downloads images from a given event scorecard link and saves them into a PDF.
    """
    parsed_folder_name = parse_folder_name_from_event_scorecard_link(event_scorecard_link)
    save_dir = f"ufc_images/event_{event_index}_{parsed_folder_name}"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    response = requests.get(event_scorecard_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img', class_="image-style-inline")
    images = [img for img in img_tags if "dmxg5wxfqgb4u.cloudfront.net" in img.get('src', '')]

    image_paths = []

    for img in images:
        img_url = urljoin(event_scorecard_link, img['src'])
        clean_filename = os.path.basename(urlparse(img_url).path)
        filename = os.path.join(save_dir, clean_filename)

        try:
            urlretrieve(img_url, filename)
            print(f"Downloaded {filename}")
            image_paths.append(filename)
        except Exception as e:
            print(f"Could not download {img_url}. Reason: {e}")

    # Corrected snippet from download_images_from_event_scorecard_link_and_create_pdf function
    if image_paths:
        # Construct the PDF filename correctly without repeating the directory structure in the name
        pdf_filename = os.path.join(save_dir, f"{event_index}_{parsed_folder_name}.pdf")
        try:
            with open(pdf_filename, "wb") as f:
                f.write(img2pdf.convert(image_paths))
            print(f"Created PDF: {pdf_filename}")
        except Exception as e:
            print(f"Failed to create PDF: {pdf_filename}. Error: {e}")


        
starting_event_number = 123
ending_event_number = 124

# Adjust the slicing to include both starting and ending conditions
# Note: The ending index in slicing is exclusive, hence no need to subtract 1 from ending_event_number
for index, event_scorecard_link in enumerate(event_scorecard_links[starting_event_number - 1:ending_event_number], start=starting_event_number):
    print(f"Processing images from: {event_scorecard_link}")
    download_images_from_event_scorecard_link_and_create_pdf(event_scorecard_link, index)

In [132]:
# Simple function to determine if any pdf's were not properly compiled and put into the appropriate folders
import os

def find_folders_without_pdfs(parent_directory):
    """
    Finds and returns a list of folder names within the specified parent directory
    that do not contain any PDF files.

    Args:
    - parent_directory (str): The path to the parent directory to search within.

    Returns:
    - list: A list of folder names without any PDF files.
    """
    folders_without_pdfs = []

    # Walk through all directories and subdirectories of the parent directory
    for root, dirs, files in os.walk(parent_directory):
        # Check if there are any PDF files in the current directory
        if not any(file.endswith('.pdf') for file in files):
            # If there are no PDF files, add the folder name to the list
            # The folder name is the last part of the path in 'root'
            folder_name = os.path.basename(root)
            folders_without_pdfs.append(folder_name)

    return folders_without_pdfs

# Specify the path to the parent directory
parent_directory = r"C:\Users\EditZ\ufc_images"

# Find folders without PDFs
folders_without_pdfs = find_folders_without_pdfs(parent_directory)

# Print the list of folders without PDFs
print("Folders without PDF files:")
for folder in folders_without_pdfs:
    print(folder)


Folders without PDF files:
ufc_images
event_123_ufc_fight_night_whittaker_vs_gastelum_official_scorecards_stephens_klose_arlovski_sherman_espn_plus_ufc_vegas_24
event_124_ufc_fight_night_vettori_vs_holland_official_scorecards_till_allen_yusuff_nunes_dern_abc_espn_plus_ufc_vegas_23


In [136]:
# There was an issue with the automated pdf creation within several folders
# Here's a simple program to do it manually
def create_pdf_from_images(folder_path):
    """
    Creates a PDF file from all JPEG images in the specified folder.
    """
    if not os.path.isdir(folder_path):
        print(f"The folder {folder_path} does not exist.")
        return

    image_files = [f for f in os.listdir(folder_path) if f.endswith('.jpg') or f.endswith('.jpeg')]
    image_files.sort()

    # Using a simpler name for the PDF file to avoid long paths
    pdf_filename = os.path.join(folder_path, "compiled_images.pdf")

    if image_files:
        image_paths = [os.path.join(folder_path, f) for f in image_files]
        try:
            with open(pdf_filename, "wb") as f:
                f.write(img2pdf.convert(image_paths))
            print(f"PDF created successfully: {pdf_filename}")
        except Exception as e:
            print(f"Failed to create PDF from images in {folder_path}. Error: {e}")
    else:
        print(f"No JPEG images found in {folder_path}.")

folders_to_process = [
    r"C:\Users\EditZ\ufc_images\event_123_ufc_fight_night_whittaker_vs_gastelum_official_scorecards_stephens_klose_arlovski_sherman_espn_plus_ufc_vegas_24",
    r"C:\Users\EditZ\ufc_images\event_124_ufc_fight_night_vettori_vs_holland_official_scorecards_till_allen_yusuff_nunes_dern_abc_espn_plus_ufc_vegas_23"
]

for folder in folders_to_process:
    create_pdf_from_images(folder)


PDF created successfully: C:\Users\EditZ\ufc_images\event_123_ufc_fight_night_whittaker_vs_gastelum_official_scorecards_stephens_klose_arlovski_sherman_espn_plus_ufc_vegas_24\compiled_images.pdf
PDF created successfully: C:\Users\EditZ\ufc_images\event_124_ufc_fight_night_vettori_vs_holland_official_scorecards_till_allen_yusuff_nunes_dern_abc_espn_plus_ufc_vegas_23\compiled_images.pdf
