# Scrape Tranmere Rovers results (all competitions) from Soccerbase for the National League seasons (2015/16-2017/18)

In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

# Browser-like User-Agent sent with every request to Soccerbase.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36"
    }
# Values substituted into the `season_id` query parameter, one results page each.
# NOTE(review): presumably these map to 2015/16, 2016/17 and 2017/18 — confirm.
season_ids = ["145", "149", "150"]

# One dict per match, accumulated across every season page.
results = []
for season_id in season_ids:
    url = f"https://www.soccerbase.com/teams/results.sd?season_id={season_id}&team_id=2598&teamTabs=results&_=1660927413898"

    # Actually send the User-Agent defined above, and bound the wait so a
    # stalled connection cannot hang the notebook indefinitely.
    r = requests.get(url, headers=headers, timeout=30)
    # Stop on an HTTP error rather than trying to parse an error page.
    r.raise_for_status()
    # NOTE(review): no parser is named, so bs4 uses the best one installed;
    # pin one once it is confirmed which parser produced the existing data.
    doc = BeautifulSoup(r.content)

    # Season label as displayed in the page's season selector heading.
    season_years = doc.select_one(".seasonSelector h3").text

    matches = doc.select(".soccerGrid tbody tr.match")

    for match in matches:
        # The competition name is the node immediately before the <span>
        # inside the .tournament cell.
        competition = match.select_one(".tournament span").previous_sibling.text.strip()

        # Take the value following the first "=" in the date link's href.
        # NOTE(review): presumably a date query parameter — confirm format.
        game_date = match.select_one(".dateTime a")['href'].split("=")[1]

        home_team = match.select_one(".homeTeam a").text
        away_team = match.select_one(".awayTeam a").text

        # The two <em> elements inside .score are read as home goals, away goals.
        goals = match.select(".score em")
        home_goals = int(goals[0].text)
        away_goals = int(goals[1].text)

        # Soccerbase labels the club "Tranmere"; store the full name and
        # express the result from Tranmere's point of view.
        if home_team == "Tranmere":
            home_team = "Tranmere Rovers"
            opposition = away_team
            venue = "H"
            goals_for = home_goals
            goals_against = away_goals
            stadium = "Prenton Park"
        else:
            # Every row on this team page is treated as involving Tranmere,
            # so a non-home row is recorded as an away game.
            away_team = "Tranmere Rovers"
            opposition = home_team
            venue = "A"
            goals_for = away_goals
            goals_against = home_goals
            stadium = ""

        score = f"{goals_for}-{goals_against}"

        if goals_for > goals_against:
            outcome = "W"
        elif goals_for == goals_against:
            outcome = "D"
        else:
            outcome = "L"

        # If the row carries a .neutralVenues element, its text replaces the
        # H/A flag and its title attribute supplies the stadium name.
        neutral_venue = match.select_one(".neutralVenues")
        if neutral_venue:
            venue = neutral_venue.text.strip()
            stadium = neutral_venue['title'].replace("Wembley", 'Wembley Stadium')

        game_record = {
            "season": season_years,
            "game_date": game_date,
            "opposition": opposition,
            "venue": venue,
            "score": score,
            "home_team": home_team,
            "away_team": away_team,
            "outcome": outcome,
            "home_goals": home_goals,
            "away_goals": away_goals,
            "secondary_score": "",
            "competition": competition,
            "goals_for": goals_for,
            "goals_against": goals_against,
            "source_url": url,
            "stadium": stadium
        }

        results.append(game_record)

df = pd.DataFrame(results)
# Convert the scraped date strings to datetimes.
df["game_date"] = pd.to_datetime(df["game_date"])

### Read in `national_league_attendances.csv` and join `attendance` to `df`

In [12]:
# Load the attendance figures, parsing game_date as datetimes so it can be
# matched against df's datetime game_date column.
attendance_path = "../data/national_league_attendances.csv"
attendances = pd.read_csv(attendance_path, parse_dates=["game_date"])
# Default (inner) merge: only dates present in both tables are kept.
df = pd.merge(df, attendances, on="game_date")

### Output `df` to CSV

In [13]:
# Write the merged results to CSV without the DataFrame index column.
output_path = "../data/soccerbase_nat_league_results.csv"
df.to_csv(output_path, index=False)