## Scrape historical results from 11v11

In [1]:
from bs4 import BeautifulSoup
import datetime as dt
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as BraveService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from urllib.parse import urljoin

# Landing page for the club's match listings; the season links are read from it.
url = 'https://www.11v11.com/teams/tranmere-rovers/tab/matches/'

# Run Brave through the Chrome driver: point ChromeOptions at the Brave binary
# and let webdriver_manager resolve a chromedriver for the BRAVE chrome type.
options = webdriver.ChromeOptions()
options.binary_location = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
brave_driver_path = ChromeDriverManager(chrome_type=ChromeType.BRAVE).install()
brave_service = BraveService(brave_driver_path)
driver = webdriver.Chrome(options=options, service=brave_service)

# Load the landing page and parse the rendered HTML.
driver.get(url)
doc = BeautifulSoup(driver.page_source, 'lxml')

# Every anchor inside the '#season' list; each one is visited by the scrape loop.
seasons = doc.select('#season li a')

# Accumulates one dict per scraped match.
results = []

# Visit every season page, then every match page linked from it, appending one
# record per match to `results`. The browser is shut down in `finally` so that
# a failure part-way through the scrape does not leave Brave running.
try:
    for season in seasons:
        # Season label with '-' swapped for '/'; constant for the whole season,
        # so it is computed once here rather than once per match.
        season_years = season.text.replace('-', '/')

        # urljoin returns an absolute href unchanged and resolves a relative
        # one against the landing page, mirroring how game links are handled.
        season_url = urljoin(url, season['href'])

        driver.get(season_url)
        season_doc = BeautifulSoup(driver.page_source, 'lxml')

        # The match table is the first <table> sibling after the season title.
        season_title = season_doc.select_one('.seasonTitle')
        matches = season_title.find_next_sibling('table').select("tbody tr")

        for match in matches:
            # Cells are read by position: 0 date, 1 teams (+ link to the match
            # page), 2 outcome, 3 score, 4 competition.
            game_info = match.select('td')

            # Day, abbreviated month name, four-digit year.
            date = dt.datetime.strptime(
                game_info[0].get_text().strip(), '%d %b %Y'
            )

            team_names = game_info[1].text.strip().split(' v ')
            home_team = team_names[0]
            away_team = team_names[1]

            game_url = urljoin(season_url, game_info[1].select_one('a')['href'])

            # Left unstripped so the emitted value matches earlier output.
            outcome = game_info[2].text

            # Normalise the score text so that a single space separates the
            # primary score from any bracketed/aggregate part, then tokenise.
            # NOTE(review): if the text already has a space before 'Agg: ',
            # the replacement leaves an empty token at index 1 and the
            # secondary score comes out empty - confirm on a two-legged tie.
            score_parts = (
                game_info[3].get_text().strip()
                .replace(' (', '(')
                .replace('Agg: ', ' ')
                .replace('(', ' (')
                .split(' ')
            )

            primary_score = score_parts[0].split('-')
            home_goals = primary_score[0]
            away_goals = primary_score[1]

            # A second token is only present when the cell carried more than
            # the primary score; check for it explicitly instead of catching
            # every exception.
            if len(score_parts) > 1:
                secondary_score = (
                    score_parts[1].strip().replace("(", "").replace(")", "")
                )
            else:
                secondary_score = ''

            # Re-express the result from Tranmere's point of view.
            if home_team == 'Tranmere Rovers':
                goals_for = home_goals
                goals_against = away_goals
                venue = 'H'
                opposition = away_team
            else:
                goals_for = away_goals
                goals_against = home_goals
                venue = 'A'
                opposition = home_team

            score = f'{goals_for}-{goals_against}'

            competition = game_info[4].text.strip()

            # The individual match page supplies stadium and attendance.
            driver.get(game_url)
            game_doc = BeautifulSoup(driver.page_source, 'lxml')

            # Defaults used when the match page has no such row (or no panel).
            stadium = 'Unknown'
            attendance = ''

            game_panel = game_doc.select_one('.basicData')
            rows = game_panel.select('tr') if game_panel is not None else []
            for row in rows:
                columns = row.select('td')
                # Need a label cell and a value cell; skip anything shorter.
                if len(columns) < 2:
                    continue

                column_title = columns[0].text.strip()
                if column_title == 'Venue':
                    stadium = columns[1].text.strip()
                elif column_title == 'Attendance':
                    # Drop thousands separators, e.g. '5,432' -> '5432'.
                    attendance = columns[1].text.strip().replace(',', '')

            game_record = {
                'season': season_years,
                'game_date': date,
                'opposition': opposition,
                'venue': venue,
                'score': score,
                'home_team': home_team,
                'away_team': away_team,
                'outcome': outcome,
                'home_goals': home_goals,
                'away_goals': away_goals,
                'secondary_score': secondary_score,
                'competition': competition,
                'goals_for': goals_for,
                'goals_against': goals_against,
                'source_url': game_url,
                'attendance': attendance,
                'stadium': stadium,
            }

            results.append(game_record)
finally:
    driver.quit()

# Build the results table and coerce the match date column to pandas datetimes
# in a single expression.
df = pd.DataFrame(results).assign(
    game_date=lambda frame: pd.to_datetime(frame.game_date)
)

### Remove National League seasons to avoid duplicates with Soccerbase

In [None]:
# Seasons to drop from this dataset (see the heading above for the rationale).
nl_seasons = ["2015/16", "2016/17", "2017/18"]

# Keep only the rows whose season is not in the exclusion list.
is_nl_season = df.season.isin(nl_seasons)
df = df[~is_nl_season]

### Output dataframe to CSV

In [29]:
# Write the results next to the other data files, without the index column.
output_path = "../data/11v11_results.csv"
df.to_csv(output_path, index=False)