In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np


# Module-level configuration shared by the helpers and the scraping loop below.
# Captures the digits that follow "/verein/" when they are in turn followed by "/saison_id/<digits>".
team_id_pattern = r"\/verein\/(\d+)\/saison_id\/\d+"
# Captures the digits that follow "/profil/spieler/".
player_id_pattern = r"\/profil\/spieler\/(\d+)"
# Headers sent with every HTTP request; supplies a browser-like User-Agent string.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def cleaner(string:str)->str:
    """Normalise the whitespace of a scraped text fragment.

    Every run of whitespace (spaces, newlines, tabs, non-breaking spaces
    ``\\xa0``) is collapsed to a single space and leading/trailing whitespace
    is removed.

    The previous implementation deleted every *pair* of spaces outright, so
    two words separated by an even number of whitespace characters were fused
    together (``"loan\\n transfer"`` became ``"loantransfer"``).

    Args:
        string: Raw text, typically the ``.text`` of an HTML element.

    Returns:
        The text with words separated by exactly one space; ``""`` when the
        input is empty or whitespace-only.
    """
    # str.split() with no separator splits on any Unicode whitespace
    # (including "\xa0") and drops empty pieces, so joining with a single
    # space both collapses and strips in one step.
    return " ".join(string.split())

def team_id_extractor(url_inp:str)->str:
    """Return the club id found in ``url_inp``.

    The id is the first capture group of the module-level ``team_id_pattern``.
    When the pattern does not occur in ``url_inp`` the placeholder ``'0'`` is
    returned instead.
    """
    found = re.search(team_id_pattern, url_inp)
    if found is None:
        return '0'
    return found.group(1)

def player_id_extractor(url_inp:str)->str:
    """Return the player id found in ``url_inp``.

    The id is the first capture group of the module-level
    ``player_id_pattern``. When the pattern does not occur in ``url_inp`` the
    placeholder ``'0'`` is returned instead.
    """
    found = re.search(player_id_pattern, url_inp)
    if found is None:
        return '0'
    return found.group(1)

# Load the player profile URLs, one per line. Blank (or whitespace-only)
# lines are skipped: an empty string is not a valid URL and would make
# requests.get() raise part-way through the scrape.
with open('links.txt', 'r') as file:
    urls = [line.strip() for line in file if line.strip()]

# Column layout of the result: the player id, the six text cells scraped from
# each transfer row, then the ids of the two clubs linked in that row.
titles = ['Player_id', 'Season','Date','Left','Joined','MV','Fee','origin_id','destination_id']
# Empty frame with the final column layout; the scraping loop fills it in.
players_transfers = pd.DataFrame(columns=titles)

# CSS class strings of the six text cells read from every transfer row, in the
# order they are stored (matches titles[1:7]). All six share one prefix and
# differ only in the trailing column name.
class_list = [
    f"grid__cell grid__cell--center tm-player-transfer-history-grid__{column}"
    for column in ("season", "date", "old-club", "new-club", "market-value", "fee")
]

# Scrape the transfer history of every player in `urls`.
#
# Rows from all players are collected in one plain list and turned into a
# DataFrame once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside a loop copies all previous rows on every iteration.
all_rows = []
for count, url in enumerate(urls):
    print(f"{count+1}/{len(urls)}", end="\r")
    player_id = player_id_extractor(url)
    # The transfer history lives at the same URL with "profil" swapped for "transfers".
    url = url.replace("profil", "transfers")
    # A timeout keeps one stalled connection from hanging the whole run.
    html_content = requests.get(url, headers=HEADERS, timeout=30).text
    soup = BeautifulSoup(html_content, "html.parser")

    # First box on the page; it holds the transfer-history grid.
    table = soup.find('div', class_="box viewport-tracking")
    if table is None:
        # Page without the expected box (e.g. blocked request or changed
        # layout): report it and continue instead of aborting the scrape.
        print(f"No transfer box found for {url}; skipping")
        continue

    for row in table.find_all('div', class_="grid tm-player-transfer-history-grid"):
        cells = [player_id]
        # Season, date, old club, new club, market value and fee, in that order.
        for cell_class in class_list:
            cells.append(cleaner(row.find('div', class_=cell_class).text))

        # The first club link is the origin club, the second the destination.
        club_links = row.find_all('a', class_='tm-player-transfer-history-grid__club-link')
        for position in range(2):
            try:
                cells.append(team_id_extractor(club_links[position].get('href')))
            except (IndexError, TypeError):
                # IndexError: the row has fewer than two club links.
                # TypeError: the link has no href, so re.search received None.
                cells.append('0')
        all_rows.append(cells)

# Build the result once; every value is a string, so all columns stay object dtype.
players_transfers = pd.DataFrame(all_rows, columns=titles)

2197/2197

In [7]:
# Show the scraped transfers; the notebook display elides the middle rows of a long frame.
players_transfers

Unnamed: 0,Player_id,Season,Date,Left,Joined,MV,Fee,origin_id,destination_id
0,434207,22/23,"May 31, 2023",Hull City,Brighton,€5.00m,End of loan,3008,1237
1,434207,22/23,"Jan 6, 2023",Brighton,Hull City,€5.00m,loan transfer,1237,3008
2,434207,22/23,"Jan 5, 2023",Venezia,Brighton,€5.00m,End of loan,607,1237
3,434207,22/23,"Jul 14, 2022",Brighton,Venezia,€6.00m,loan transfer,1237,607
4,434207,21/22,"May 31, 2022",Middlesbrough,Brighton,€6.00m,End of loan,641,1237
...,...,...,...,...,...,...,...,...,...
22051,3455,01/02,"Jul 1, 2001",Malmö FF,Ajax,-,€7.80m,496,610
22052,3455,98/99,"Jan 1, 1999",Malmö FF U19,Malmö FF,-,-,7917,496
22053,3455,97/98,"Jan 1, 1998",Malmö FF U17,Malmö FF U19,-,-,40593,7917
22054,3455,95/96,"Jul 1, 1995",FBK Balkan,Malmö FF U17,-,?,1166,40593


In [6]:
# Write the scraped transfers to disk as a JSON array with one object per row
# (orient='records'), pretty-printed with a 4-space indent.
players_transfers.to_json('players_transfers.json', orient='records', indent=4)