In [16]:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
from random import randint

In [17]:
# HTTP headers sent with every request; the User-Agent string is a browser-style identifier
# (split across lines for readability; adjacent literals are joined into one string)
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0;Win64) '
        'AppleWebkit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.82 Safari/537.36'
    ),
}

In [18]:
# Load the player links CSV into a dataframe
df_links = pd.read_csv('output/player_links.csv')

# Pull the 'URL' column out as a plain Python list of links
links = df_links.loc[:, 'URL'].to_list()

In [19]:
# Retry policy for fetching a player page
MAX_ATTEMPTS = 5          # stop retrying a link after this many failed requests
RETRY_DELAY_SECONDS = 10  # pause between failed attempts
REQUEST_TIMEOUT = 30      # seconds before a stalled request is abandoned


def _fetch_soup(link):
    """Download ``link`` and return its parsed HTML, or None if every attempt fails.

    Connection errors, timeouts and HTTP error statuses all surface as
    ``requests.exceptions.RequestException``; each failure is reported and,
    while attempts remain, followed by a pause before the next try.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = rq.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx answers as failures instead of parsing an error page
            response.raise_for_status()
        except rq.exceptions.RequestException as error:
            print(f'Request error ({error}) : attempt {attempt} of {MAX_ATTEMPTS}')
            if attempt < MAX_ATTEMPTS:
                print(f'Sleeping for {RETRY_DELAY_SECONDS} seconds before retrying')
                sleep(RETRY_DELAY_SECONDS)
        else:
            return bs(response.text, 'html.parser')
    return None


def _value_after_label(spans, label):
    """Return the stripped text of the <span> that follows the first span containing ``label``.

    Returns None when no span contains ``label`` or when no <span> follows it.
    """
    for span in spans:
        if label in span.text:
            value_span = span.find_next('span')
            return None if value_span is None else value_span.text.strip()
    return None


def _market_value(spans):
    """Return the market value text (e.g. '18.00m'), or None if it cannot be read.

    The value is assembled from the text node right after the first span
    containing '€' plus the text of the next <span> (the 'k'/'m' suffix).
    """
    for span in spans:
        if '€' in span.text:
            amount = span.next_sibling
            suffix_span = span.find_next('span')
            # next_sibling may be None or a tag rather than a text node
            if isinstance(amount, str) and suffix_span is not None:
                return amount.strip() + suffix_span.text.strip()
            return None
    return None


# List to store data of each player in the form of a dictionary
list_of_row_dicts = []

# Loop through each link to scrape player data
for link in links:

    soup = _fetch_soup(link)
    if soup is None:
        # Keep the URL in the output with empty fields rather than aborting the whole run
        print('Page could not be retrieved :', link)
        title_spans = []
        heading = None
    else:
        title_spans = soup.find_all('span')
        heading = soup.find('h1')

    if heading is None:
        name = None
        print('Name not found')
    else:
        # Drop heading tokens that contain digits (presumably the shirt number; confirm against the page)
        name = ' '.join(word for word in heading.text.split() if not any(ch.isdigit() for ch in word))

    date_of_birth = _value_after_label(title_spans, 'Date of birth/Age:')
    if date_of_birth is None:
        print('DoB not found')

    citizenship_text = _value_after_label(title_spans, 'Citizenship:')
    if citizenship_text is None:
        citizenship = None
        print('Citizenship not found')
    else:
        # Multiple citizenships are separated by two non-breaking spaces
        citizenship = citizenship_text.split('\xa0\xa0')

    current_club = _value_after_label(title_spans, 'Current club:')
    if current_club is None:
        print('Current club not found')

    outfitter = _value_after_label(title_spans, 'Outfitter:')
    if outfitter is None:
        print('Outfitter not found')

    mv = _market_value(title_spans)
    if mv is None:
        print('MV not found')

    row_dic = {
        'PLAYER_URL': link,
        'NAME': name,
        'DATE_OF_BIRTH': date_of_birth,
        'CITIZENSHIP': citizenship,
        'CURRENT_CLUB': current_club,
        'OUTFITTER': outfitter,
        'MV': mv,
    }
    list_of_row_dicts.append(row_dic)

    print(name, ',', date_of_birth, ',', citizenship, ',', current_club, ',', outfitter, ',', mv)

    # Random 1-3 second pause between players to space out the requests
    sleep(randint(1, 3))

df = pd.DataFrame(list_of_row_dicts)

Djordje Petrovic , Oct 8, 1999 (25) , ['Serbia'] , RC Strasbourg Alsace , adidas , 18.00m
Outfitter not found
Alaa Bellaarouch , Jan 1, 2002 (23) , ['Morocco'] , RC Strasbourg Alsace , None , 500k
Outfitter not found
Karl-Johan Johnsson , Jan 28, 1990 (35) , ['Sweden'] , RC Strasbourg Alsace , None , 400k
Outfitter not found
Mamadou Sarr , Aug 29, 2005 (19) , ['France', 'Senegal'] , RC Strasbourg Alsace , None , 15.00m
Outfitter not found
Ismaël Doukouré , Jul 24, 2003 (21) , ['France', "Cote d'Ivoire"] , RC Strasbourg Alsace , None , 15.00m
Outfitter not found
Abakar Sylla , Dec 25, 2002 (22) , ["Cote d'Ivoire"] , RC Strasbourg Alsace , None , 12.00m
Outfitter not found
Andrew Omobamidele , Jun 23, 2002 (22) , ['Ireland', 'Nigeria'] , RC Strasbourg Alsace , None , 10.00m
Outfitter not found
Yoni Gomis , Sep 23, 2005 (19) , ['France', 'Senegal'] , RC Strasbourg Alsace , None , 1.00m
Valentín Barco , Jul 23, 2004 (20) , ['Argentina'] , RC Strasbourg Alsace , adidas , 8.00m
Outfitter not

In [20]:
# Persist the scraped player table; index=True is to_csv's default, so the
# row index is written as the first (unnamed) column, exactly as before
df.to_csv(path_or_buf='output/player_info.csv', index=True)