# Gather Historic Data
This notebook is, for now, mostly experimental. I want to gather data from multiple sources and, ideally, automate the process. To start, I am collecting the historical data, since that is more of a one-time job.

## Start with all results from the first BL

I found several websites that offer this data and chose the one that is the simplest to scrape. Since the site has no `robots.txt`, I don't think this is a problem, especially because I only need to collect the data once.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape the match results from the bulibox.de "spieltage" pages into `df`.
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is set
REQUEST_DELAY = 5     # seconds to pause between two page requests
RESULT_COLUMNS = ["Season", "Spieltag", "Spielpaarung", "Ergebnis", "Datum"]


def _scrape_results_page(url):
    """Return one record (dict) per match listed on the page at ``url``.

    Network failures and non-200 responses are reported with ``print`` and
    yield an empty list, so a single bad page does not abort the whole run.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Error retrieving {url}: {exc}")
        return []

    # Check if the page was retrieved successfully
    if response.status_code != 200:
        print(f"Error retrieving {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Get season info from the <h4> tag (if available)
    h4_tag = soup.find("h4")
    season = h4_tag.text.strip() if h4_tag else "Unknown Season"

    records = []
    # Every matchday is announced by a <b class="bulired"> marker
    for matchday_tag in soup.find_all("b", class_="bulired"):
        matchday = matchday_tag.text.strip()
        # Get the table that immediately follows this matchday header
        table = matchday_tag.find_next("table")
        if not table:
            continue
        for row in table.find_all("tr")[1:]:  # skip header row
            cells = row.find_all("td")
            # Rows with fewer than three cells cannot hold pairing/result/date
            if len(cells) < 3:
                continue
            records.append({
                "Season": season,
                "Spieltag": matchday,
                "Spielpaarung": cells[0].get_text(strip=True),
                "Ergebnis": cells[1].get_text(strip=True),
                "Datum": cells[2].get_text(strip=True),
            })
    return records


# List to store scraped data
data = []

# Iterate over the pages (from 101 to 161 inclusive)
for i in range(101, 162):
    url = f"http://www.bulibox.de/spieltage/B100{i}.html"
    print(f"Scraping: {url}")
    data.extend(_scrape_results_page(url))

    # Pause after every request -- including failed ones -- so the server is
    # never hit in rapid succession.
    time.sleep(REQUEST_DELAY)

# Create a DataFrame from the collected data; naming the columns explicitly
# keeps the schema intact even when nothing could be scraped.
df = pd.DataFrame(data, columns=RESULT_COLUMNS)


Scraping: http://www.bulibox.de/spieltage/B100101.html
Scraping: http://www.bulibox.de/spieltage/B100102.html
Scraping: http://www.bulibox.de/spieltage/B100103.html
Scraping: http://www.bulibox.de/spieltage/B100104.html
Scraping: http://www.bulibox.de/spieltage/B100105.html
Scraping: http://www.bulibox.de/spieltage/B100106.html
Scraping: http://www.bulibox.de/spieltage/B100107.html
Scraping: http://www.bulibox.de/spieltage/B100108.html
Scraping: http://www.bulibox.de/spieltage/B100109.html
Scraping: http://www.bulibox.de/spieltage/B100110.html
Scraping: http://www.bulibox.de/spieltage/B100111.html
Scraping: http://www.bulibox.de/spieltage/B100112.html
Scraping: http://www.bulibox.de/spieltage/B100113.html
Scraping: http://www.bulibox.de/spieltage/B100114.html
Scraping: http://www.bulibox.de/spieltage/B100115.html
Scraping: http://www.bulibox.de/spieltage/B100116.html
Scraping: http://www.bulibox.de/spieltage/B100117.html
Scraping: http://www.bulibox.de/spieltage/B100118.html
Scraping: 

ValueError: Columns must be same length as key

In [7]:
# Persist the scraped match results. The frame's default integer index carries
# no information, so it is not written to the file.
df.to_csv("first_bl_results.csv", index=False)

## Gather all tables


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Download the table from every "abschlusstabellen" page and store each one
# as its own CSV file inside `output_dir`.

# Directory to store CSV files
output_dir = "data/abschlusstabellen"
os.makedirs(output_dir, exist_ok=True)

REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is set
REQUEST_DELAY = 5     # seconds to pause between two page requests


def _scrape_final_table(url):
    """Return the ``abschluss`` table found at ``url`` as a DataFrame.

    Returns ``None`` (after printing the reason) when the page cannot be
    retrieved or contains no usable table, so the caller can simply skip it.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Error retrieving {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Error retrieving {url}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="abschluss")
    # A table without any <tr> has neither a header nor data to offer
    header_row = table.find("tr") if table else None
    if header_row is None:
        print(f"No table found at {url}")
        return None

    # Parse table header
    headers = [th.get_text(strip=True) for th in header_row.find_all("th")]

    # Parse table rows
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = tr.find_all("td")
        if not cells:
            # Rows without <td> cells would only add an empty line to the CSV
            continue
        row = []
        for cell in cells:
            # If the cell contains a link, get its text; otherwise, use cell text
            a_tag = cell.find("a")
            source = a_tag if a_tag else cell
            row.append(source.get_text(strip=True))
        rows.append(row)

    # Create DataFrame from the table data
    return pd.DataFrame(rows, columns=headers)


# Iterate over pages from 101 to 161 inclusive
for i in range(101, 162):
    url = f"http://www.bulibox.de/abschlusstabellen/B100{i}.html"
    print(f"Scraping: {url}")

    df_table = _scrape_final_table(url)
    if df_table is not None:
        # Extract season from the "Statistik" column of the first row, e.g., "Saison 2023/2024"
        season = ""
        if "Statistik" in df_table.columns and not df_table.empty:
            season_text = df_table.loc[0, "Statistik"]
            # Only a real string can be cleaned up; anything else uses the fallback name
            if isinstance(season_text, str):
                # Replace slashes with dashes to avoid directory issues (e.g., "2023/2024" -> "2023-2024")
                season = season_text.replace("Saison", "").strip().replace("/", "-")
        if not season:
            # Fall back to the page number so the file never ends up named ".csv"
            season = f"season_{i}"

        # Build output path and save DataFrame as CSV
        output_path = os.path.join(output_dir, f"{season}.csv")
        df_table.to_csv(output_path, index=False)
        print(f"Saved table for season {season} to {output_path}")

    # Pause after every request -- including failed ones -- so the server is
    # never hit in rapid succession.
    time.sleep(REQUEST_DELAY)


Scraping: http://www.bulibox.de/abschlusstabellen/B100101.html
Saved table for season 1963-1964 to data/abschlusstabellen\1963-1964.csv
Scraping: http://www.bulibox.de/abschlusstabellen/B100102.html
Saved table for season 1964-1965 to data/abschlusstabellen\1964-1965.csv
Scraping: http://www.bulibox.de/abschlusstabellen/B100103.html
Saved table for season 1965-1966 to data/abschlusstabellen\1965-1966.csv
Scraping: http://www.bulibox.de/abschlusstabellen/B100104.html
Saved table for season 1966-1967 to data/abschlusstabellen\1966-1967.csv
Scraping: http://www.bulibox.de/abschlusstabellen/B100105.html
Saved table for season 1967-1968 to data/abschlusstabellen\1967-1968.csv
Scraping: http://www.bulibox.de/abschlusstabellen/B100106.html
Saved table for season 1968-1969 to data/abschlusstabellen\1968-1969.csv
Scraping: http://www.bulibox.de/abschlusstabellen/B100107.html
Saved table for season 1969-1970 to data/abschlusstabellen\1969-1970.csv
Scraping: http://www.bulibox.de/abschlusstabelle