In [3]:
import bs4 as bs
import requests
import pandas as pd

In [4]:
# Events list page on results.ittf.link, with the reset/clear query flags all set to 0.
# The literal is split in two purely for readability; the resulting string is unchanged.
url = (
    "https://results.ittf.link/index.php/events/list/27"
    "?resetfilters=0&clearordering=0&clearfilters=0"
)

In [5]:
# Fetch the events page once. The timeout keeps the cell from hanging on an
# unresponsive server, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the events list.
events_page_response = requests.get(url, timeout=20)
events_page_response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so the resulting tree can differ between machines.
events_page_soup = bs.BeautifulSoup(events_page_response.text, "html.parser")

In [8]:
# Look the element up by its HTML id attribute. Passing "id" positionally makes
# BeautifulSoup search for a tag *named* <id> with that CSS class, which is not
# what is wanted here.
# NOTE(review): presumably this is the year filter control -- confirm against the page source.
a = events_page_soup.find(id="vw_tournaments___yrvalue")
a

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Column order of the DataFrame returned by scrape_ittf_events_by_year.
_EVENT_COLUMNS = ['Year', 'EventName', 'Country', 'StartDate', 'EndDate', 'EventLink']


def _parse_event_rows(event_rows, year: int) -> list:
    """Convert the <tr> rows of one results page into a list of event record dicts."""
    records = []
    for row in event_rows:
        columns = row.find_all('td')
        # Cells 1-4 are read below, so a row needs at least five cells.
        if len(columns) < 5:
            continue
        try:
            # Look the anchor up once; a cell without a link yields None.
            link_tag = columns[1].find('a')
            records.append({
                'Year': year,
                # Example indices; adjust after inspecting the real table layout.
                'EventName': columns[1].text.strip(),
                'Country': columns[2].text.strip(),
                'StartDate': columns[3].text.strip(),
                'EndDate': columns[4].text.strip(),
                # .get() keeps the row even when the anchor has no href attribute.
                'EventLink': link_tag.get('href') if link_tag is not None else None,
            })
        except Exception as e:
            # Best effort: report the bad row and carry on with the rest.
            print(f"   ‚ùå Error parsing row in {year}: {e}")
    return records


def scrape_ittf_events_by_year(start_year: int, end_year: int, pause_seconds: float = 1.0) -> pd.DataFrame:
    """
    Scrape the ITTF events list once per year by POSTing the year as a list filter.

    Args:
        start_year: First year to request (inclusive).
        end_year: Last year to request (inclusive).
        pause_seconds: Delay between consecutive year requests. The pause is
            applied after every request, including failed or empty ones.

    Returns:
        A DataFrame with the columns Year, EventName, Country, StartDate,
        EndDate and EventLink. It has zero rows (but the same columns) when
        nothing was collected or when start_year > end_year.

    Raises:
        requests.RequestException: only from the initial session-establishing
            GET; failures of the per-year POSTs are printed and that year is
            skipped.
    """

    BASE_URL = 'https://results.ittf.link/index.php/events/list/27?resetfilters=0&clearordering=0&clearfilters=0'
    years_to_scrape = range(start_year, end_year + 1)
    all_events_data = []

    # Nothing to request: return the empty frame without touching the network.
    if not years_to_scrape:
        return pd.DataFrame(all_events_data, columns=_EVENT_COLUMNS)

    # A Session keeps cookies and headers across the requests below.
    with requests.Session() as session:
        # Initial GET so the server can set whatever cookies it wants before filtering.
        print("Initializing session to get baseline data...")
        session.get(BASE_URL, timeout=20)

        for position, year in enumerate(years_to_scrape):
            # Pause before every request except the first, so the delay also
            # applies after a year that failed or returned no rows.
            if position:
                time.sleep(pause_seconds)

            print(f"--- üü¢ Scraping events for year: {year} üü¢ ---")

            # Form fields for the list filter (names follow the "Fabrik" CMS form).
            # NOTE(review): a recorded run found no 'tr.fabrik_row' rows for any
            # year, so this payload and/or the row selector below still need to
            # be confirmed against the live page.
            post_data = {
                'fabrik___filter[list_27_com_fabrik_27][value][0]': str(year),
                'list_27_com_fabrik_27_submit': '1',
                'task': 'list.filter',
            }

            try:
                response = session.post(BASE_URL, data=post_data, timeout=30)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"‚ùå Error fetching {year}: {e}. Skipping.")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            event_rows = soup.find_all('tr', class_='fabrik_row')

            if not event_rows:
                print(f"   ‚ö†Ô∏è No event data or table rows found for {year}.")
                continue

            all_events_data.extend(_parse_event_rows(event_rows, year))

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(all_events_data, columns=_EVENT_COLUMNS)

# --- EXECUTION ---
# Year range to scrape. The CSV file name is built from the same two values so
# the scraped range and the file name cannot drift apart.
START_YEAR = 1988
END_YEAR = 2025
OUTPUT_CSV = f'master_ittf_events_{START_YEAR}_{END_YEAR}.csv'

df_all_ittf_events = scrape_ittf_events_by_year(START_YEAR, END_YEAR)

if not df_all_ittf_events.empty:
    # NOTE: rows are not de-duplicated here, so this is the raw number of rows collected.
    print(f"\n‚úÖ Total unique events collected: {len(df_all_ittf_events)}.")
    # Display the first few results
    display(df_all_ittf_events.head())
    # Save the master list
    df_all_ittf_events.to_csv(OUTPUT_CSV, index=False)
else:
    # Make the empty outcome visible instead of ending the cell silently.
    print("No events were collected; no CSV file was written.")

Initializing session to get baseline data...
--- üü¢ Scraping events for year: 1988 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1988.
--- üü¢ Scraping events for year: 1989 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1989.
--- üü¢ Scraping events for year: 1990 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1990.
--- üü¢ Scraping events for year: 1991 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1991.
--- üü¢ Scraping events for year: 1992 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1992.
--- üü¢ Scraping events for year: 1993 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1993.
--- üü¢ Scraping events for year: 1994 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1994.
--- üü¢ Scraping events for year: 1995 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1995.
--- üü¢ Scraping events for year: 1996 üü¢ ---
   ‚ö†Ô∏è No event data or table rows found for 1996.
--- üü¢ Scraping events for

In [11]:
import requests
import time

def check_ittf_html_content(start_year: int, end_year: int):
    """
    Diagnostic helper: POST the year filter for each year in the inclusive range
    and print the HTTP status plus the first 500 characters of the response body.

    Nothing is parsed or returned; the output is meant to be read by eye.
    """

    BASE_URL = 'https://results.ittf.link/index.php/events/list/27?resetfilters=0&clearordering=0&clearfilters=0'
    years = range(start_year, end_year + 1)

    # One Session for the whole run so cookies carry over between requests.
    with requests.Session() as http:
        print("Initializing session...")

        # Warm-up GET; its response is discarded.
        http.get(BASE_URL, timeout=20)

        for year in years:
            # Form fields sent with the filter request for this year.
            form_fields = {
                'fabrik___filter[list_27_com_fabrik_27][value][0]': str(year),
                'list_27_com_fabrik_27_submit': '1',
                'task': 'list.filter',
            }

            try:
                response = http.post(BASE_URL, data=form_fields, timeout=30)

                if response.status_code != 200:
                    print(f"‚ùå [{year}] Status: {response.status_code}. Request failed.")
                else:
                    print(f"‚úÖ [{year}] Status: 200 OK. Content Snippet:")
                    # Flatten newlines so the 500-character preview stays on one line.
                    snippet = response.text[:500].replace('\n', ' ')
                    print(snippet + '...')

            except requests.RequestException as e:
                print(f"‚ùå [{year}] Connection Error: {e}")

            # Throttle: wait after every year before sending the next request.
            time.sleep(1.5)
            
# --- EXECUTION ---
# Run the diagnostic request check for every year from 1988 through 2025.
check_ittf_html_content(start_year=1988, end_year=2025)

Initializing session...
‚úÖ [1988] Status: 200 OK. Content Snippet:
<!DOCTYPE html> <html lang="en-gb" dir="ltr">  <head>     <meta charset="utf-8"> 	<meta name="viewport" content="width=device-width, initial-scale=1"> 	<meta name="description" content="ITTF results, WTT results, statistics, head to head, players matches, players profiles, world ranking, historical data, and table tennis analytics."> 	<title>Events</title> 	<link href="/manifest.json" rel="manifest">      <link href="/media/system/css/joomla-fontawesome.min.css?24b84c" rel="lazy-stylesheet" nonc...
‚úÖ [1989] Status: 200 OK. Content Snippet:
<!DOCTYPE html> <html lang="en-gb" dir="ltr">  <head>     <meta charset="utf-8"> 	<meta name="viewport" content="width=device-width, initial-scale=1"> 	<meta name="description" content="ITTF results, WTT results, statistics, head to head, players matches, players profiles, world ranking, historical data, and table tennis analytics."> 	<title>Events</title> 	<link href="/manifest.js