In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def ball_possession(leagues_name, countries, years, timeout=30):
    """Scrape per-club ball possession figures from Transfermarkt.

    One page ("ballbesitz") is requested for every (league, season) pair and
    one record is produced for every club row found on that page.

    Parameters
    ----------
    leagues_name : sequence of str
        League slugs used in the URL path, e.g. ``"premier-league"``.
    countries : sequence of str
        Competition codes used in the URL path, e.g. ``"GB1"``. Paired
        positionally with ``leagues_name`` through ``zip``, so surplus
        entries in the longer sequence are ignored.
    years : sequence
        Season ids placed in the URL; each one must be convertible by ``int``.
    timeout : float, optional
        Seconds ``requests.get`` may wait on the server before raising
        instead of blocking forever. Defaults to 30.

    Returns
    -------
    list of dict
        One dict per club row with the keys ``"year"`` (int),
        ``"team_name"`` (str) and ``"ball_possession"`` (float, the number
        in front of the ``%`` sign). Rows keep the order in which they
        appear on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the expected table container, or a
        possession cell cannot be parsed as a number.
    """
    # Browser-like User-Agent; presumably required because the site refuses
    # the default python-requests client string -- TODO confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    records = []
    for league, country in zip(leagues_name, countries):
        for year in years:
            url = (
                "https://www.transfermarkt.com/"
                + league
                + "/ballbesitz/wettbewerb/"
                + country
                + "/saison_id/"
                + str(year)
            )
            page = requests.get(url, headers=headers, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of trying to parse an error page.
            page.raise_for_status()

            soup = BeautifulSoup(page.content, "html.parser")

            # Container of the table that lists all clubs for that season.
            container = soup.find("div", class_="large-8 columns")
            if container is None:
                raise ValueError("No ball possession table found at " + url)

            # Matching both row classes in a single pass keeps the rows in
            # page order; scanning "odd" and "even" separately would return
            # every second club first and the remaining ones afterwards.
            for team in container.find_all("tr", class_=["odd", "even"]):
                name_cell = team.find("td", class_="hauptlink no-border-links")
                team_name = name_cell.find("a").text

                # Ball possession over the season is the last centred cell,
                # rendered as "<number> %".
                possession_text = team.find_all("td", class_="zentriert")[-1].text
                possession = float(possession_text.split("%")[0].strip())

                records.append(
                    {
                        "year": int(year),
                        "team_name": team_name,
                        "ball_possession": possession,
                    }
                )
    return records

In [3]:
def total_shot_rate(leagues_name, countries, years, timeout=30):
    """Scrape per-club shooting statistics from Transfermarkt.

    One page ("chancenverwertung", detailed view ``/plus/1``) is requested
    for every (league, season) pair and one record is produced for every
    club row found on that page.

    Parameters
    ----------
    leagues_name : sequence of str
        League slugs used in the URL path, e.g. ``"premier-league"``.
    countries : sequence of str
        Competition codes used in the URL path, e.g. ``"GB1"``. Paired
        positionally with ``leagues_name`` through ``zip``, so surplus
        entries in the longer sequence are ignored.
    years : sequence
        Season ids placed in the URL; each one must be convertible by ``int``.
    timeout : float, optional
        Seconds ``requests.get`` may wait on the server before raising
        instead of blocking forever. Defaults to 30.

    Returns
    -------
    list of dict
        One dict per club row with the keys ``"year"`` (int),
        ``"team_name"`` (str), ``"total_shot"`` (int),
        ``"total_shot_at_target"`` (int), ``"goals"`` (int) and
        ``"goals_shot_ratio"`` (float, the number in front of the ``%``
        sign). Rows keep the order in which they appear on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the expected table container, or a
        cell cannot be parsed as a number.
    """
    # Browser-like User-Agent; presumably required because the site refuses
    # the default python-requests client string -- TODO confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Named `records` so the accumulator does not shadow this function.
    records = []
    for league, country in zip(leagues_name, countries):
        for year in years:
            url = (
                "https://www.transfermarkt.com/"
                + league
                + "/chancenverwertung/wettbewerb/"
                + country
                + "/saison_id/"
                + str(year)
                + "/plus/1"
            )
            page = requests.get(url, headers=headers, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of trying to parse an error page.
            page.raise_for_status()

            soup = BeautifulSoup(page.content, "html.parser")

            container = soup.find("div", class_="responsive-table")
            if container is None:
                raise ValueError("No shot statistics table found at " + url)

            # Matching both row classes in a single pass keeps the rows in
            # page order; scanning "odd" and "even" separately would return
            # every second club first and the remaining ones afterwards.
            for team in container.find_all("tr", class_=["odd", "even"]):
                name_cell = team.find("td", class_="hauptlink no-border-links")
                team_name = name_cell.find("a").text

                # Cells whose class attribute is exactly "zentriert hauptlink":
                # the first is read as total shots, the second as goals.
                highlighted = team.find_all("td", class_="zentriert hauptlink")
                total_shot = int(highlighted[0].text)

                # Shots on target: second-to-last cell carrying "zentriert".
                total_shot_at_target = int(
                    team.find_all("td", class_="zentriert")[-2].text
                )

                goals = int(highlighted[1].text)

                # Conversion rate is rendered as "<number> %".
                ratio_text = team.find("td", class_="rechts hauptlink").text
                goals_shot_ratio = float(ratio_text.split("%")[0].strip())

                records.append(
                    {
                        # int() keeps the column type identical to the one
                        # produced by ball_possession().
                        "year": int(year),
                        "team_name": team_name,
                        "total_shot": total_shot,
                        "total_shot_at_target": total_shot_at_target,
                        "goals": goals,
                        "goals_shot_ratio": goals_shot_ratio,
                    }
                )

    return records

In [None]:
# League URL slugs and their competition codes; entries at the same position
# belong together because the scrapers pair them with zip().
leagues_names = ["premier-league", "laliga", "bundesliga", "serie-a", "ligue-1"]
countries = ["GB1", "ES1", "L1", "IT1", "FR1"]
# Season ids inserted into the "saison_id" part of the URL.
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021]

Ball_possession = ball_possession(leagues_names, countries, years)
# Bound to a separate name on purpose: assigning the result back to
# `total_shot_rate` would replace the function with a list, so running this
# cell a second time would fail with "'list' object is not callable".
shot_records = total_shot_rate(leagues_names, countries, years)

ball_possession_df = pd.DataFrame(Ball_possession)
ball_possession_df.to_csv("ball_possession.csv")

shot_df = pd.DataFrame(shot_records)
shot_df.to_csv("shot_statistics.csv")