In [1]:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
from random import randint

In [2]:
# Define a dictionary headers to store the User-Agent string for the request
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}

In [3]:
# Read the player links from the csv file
df_links = pd.read_csv('output/player_links.csv')

# Convert the 'URL' column of the dataframe to a list
links = df_links['URL'].tolist()

In [None]:
# List to store data of each player in the form of a dictionary
list_of_row_dicts = []

# Loop through each link to scrape player data
for link in links:

    # Keep trying until the request is successful
    while True:
        try:
            # Make a GET request to the link
            request = rq.get(link,headers=headers)
            
            # Extraire les morceaux utiles
            parts = link.split('/')
            player_slug = parts[3]
            player_id = parts[-1]
            
            # Construire l'URL de stats détaillées Ligue 1 saison 2024
            competition = "FR1"
            season = "2024"        
            detailed_url = f"https://www.transfermarkt.com/{player_slug}/leistungsdatendetails/spieler/{player_id}/wettbewerb/{competition}/saison/{season}"
            request_detailed = rq.get(detailed_url, headers=headers)
        
            # Use BeautifulSoup to parse the HTML content of the page
            soup = bs(request.text, 'html.parser')
            soup_detailed = bs(request_detailed.text, 'html.parser')

            # Find all <span> tags in the HTML
            title_spans = soup.find_all('span')

            table = soup_detailed.find('table', class_='items')
            rows = table.find_all('tr', {'class': ['odd', 'even']})
                       
            # Break out of the loop if the request is successful
            break
        except AttributeError:
            # Print error message and wait for 10 seconds before retrying
            print('Index Error : Sleeping for 10 seconds before retrying')
            sleep(10)
            
    try:
        name = ' '.join([word for word in soup.find('h1').text.split() if not any(i.isdigit() for i in word)]).strip()
    except IndexError:
        name = None
        print ('Name not found')

    try:
        date_of_birth = [span.find_next('span').text for span in title_spans if 'Date of birth/Age:' in span.text][0].strip()
    except IndexError:
        date_of_birth = None
        print ('DoB not found')

    try:
        citizenship = [span.find_next('span').text.strip().split('\xa0\xa0') for span in title_spans if 'Citizenship:' in span.text][0]
    except AttributeError:
        citizenship = None
        print ('Citizenship not found')

    try:
        current_club = [span.find_next('span').text for span in title_spans if 'Current club:' in span.text][0].strip()
    except IndexError:
        current_club = None
        print ('Current club not found')

    try:
        outfitter = [span.find_next('span').text for span in title_spans if 'Outfitter:' in span.text][0].strip()
    except IndexError:
        outfitter = None
        print ('Outfitter not found')

    try:
        mv = [span.next_sibling.strip() for span in title_spans if '€' in span.text][0].strip()
        korm = [span.find_next('span').text for span in title_spans if '€' in span.text][0].strip()
        mv = mv + korm
    except IndexError:
        mv = None
        print ('MV not found')

    try:
        # Récupérer toutes les <td>
        for row in rows:
            cells = row.find_all('td')   
            # Le nombre de buts est à l'index 4 (commence à 0)
            matchs = cells[3].text.strip()
            goals = cells[4].text.strip()
            assists = cells[5].text.strip()
    except IndexError:
        goals = None
        print('Stats not found')
            
    row_dic = {
    'PLAYER_URL' : link,
    'NAME' : name,
    'DATE_OF_BIRTH' : date_of_birth,
    'CITIZENSHIP': citizenship,
    'CURRENT_CLUB' : current_club,
    'OUTFITTER' : outfitter,
    'MV' : mv,
    'MATCHS' : matchs,
    'GOALS' : goals,
    'ASSISTS' : assists
    }
    list_of_row_dicts.append(row_dic)
            
    print(name,',',date_of_birth,',',citizenship,',',current_club,',',outfitter,',',mv,',',matchs,',',goals,',',assists)
    sleep(randint(1,3))

df = pd.DataFrame(list_of_row_dicts)

Lucas Perri , Dec 10, 1997 (27) , ['Brazil', 'Italy'] , Olympique Lyon , adidas , 10.00m , 28 , - , 1 / - / -
Outfitter not found
Rémy Descamps , Jun 25, 1996 (28) , ['France'] , Olympique Lyon , None , 1.50m , - , - , - / - / -
Index Error : Sleeping for 10 seconds before retrying
Index Error : Sleeping for 10 seconds before retrying
Index Error : Sleeping for 10 seconds before retrying


In [132]:
df.to_csv('output/player_info_details.csv')