In [57]:
import random
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html

# Map each season label to the URL of its sofifa.com team-ratings page.
# The six URLs are identical apart from the value of the `r` query parameter,
# so only that value is listed per season and the shared parts appear once.
pages_to_scrape = {
    season: (
        "https://sofifa.com/teams?type=all&lg%5B0%5D=13"
        f"&r={r_value}&set=true"
        "&showCol%5B0%5D=oa&showCol%5B1%5D=sa&showCol%5B2%5D=at"
        "&showCol%5B3%5D=md&showCol%5B4%5D=df"
        "&col=oa&sort=desc&hl=en-US"
    )
    # Newest season first, so iteration order matches the scrape order.
    for season, r_value in (
        ("2022-2023", "230054"),
        ("2021-2022", "220069"),
        ("2020-2021", "210064"),
        ("2019-2020", "200061"),
        ("2018-2019", "190075"),
        ("2017-2018", "180084"),
    )
}

# Define a function to check the status code of the page
def check_status_code(page_to_scrape):
    """Print a short, human-readable description of a response's HTTP status.

    Args:
        page_to_scrape: An object exposing a ``status_code`` attribute; the
            notebook passes the value returned by ``requests.get``.

    Returns:
        None. The message is written to stdout; each message ends with a
        newline of its own, so a blank line follows it in the output.
    """
    # Status codes with a dedicated message; anything else is "unknown".
    status_messages = {
        200: 'Page was successfully scraped!\n',
        404: 'Page was not found!\n',
        500: 'Internal server error!\n',
        403: 'Access denied!\n',
    }
    print(status_messages.get(page_to_scrape.status_code, 'Unknown error!\n'))

# User-Agent strings to rotate through to avoid getting blocked.
# All entries share the same platform/engine prefix and differ only in the
# trailing browser tokens, so the prefix is written once.
user_agent_list = [
    f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) {browser_tokens}"
    for browser_tokens in (
        "Chrome/93.0.4577.82 Safari/537.36",
        "Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75",
        "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363",
    )
]

In [60]:
# Global list to store the data: one {"Season", "Team", "Rating"} dict per
# table row. It is re-created in this cell, so re-running the cell starts
# from an empty list instead of appending duplicate rows.
seasonal_team_rating = []

# Seconds to wait on the server before giving up on a request. Without a
# timeout, requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Loop through the pages to scrape
for season, page in pages_to_scrape.items():
    # Pick a random user agent for this request
    headers = {"User-Agent": random.choice(user_agent_list)}

    # Get the page to scrape
    page_to_scrape = requests.get(page, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

    # Report the status, then stop on 4xx/5xx responses: parsing an error
    # page would otherwise fail further down with an unhelpful IndexError.
    check_status_code(page_to_scrape)
    page_to_scrape.raise_for_status()

    # Create a BeautifulSoup object and parse the HTML
    soup = BeautifulSoup(page_to_scrape.text, 'lxml')

    # Convert the BeautifulSoup object into an lxml object so XPath can be used.
    # The serialised soup (not the raw response text) is what the XPath below
    # is evaluated against, so this round-trip is kept as-is.
    tree = html.fromstring(str(soup))

    # Use the full XPath to find the table body holding the data cards
    data = tree.xpath('/html/body/div[1]/div/div[2]/div/table/tbody')
    if not data:
        raise RuntimeError(
            f"No ratings table found for season {season} "
            f"(HTTP {page_to_scrape.status_code}); the page layout may have "
            f"changed or the request may have been blocked: {page}"
        )

    # Get the list of individual data cards (one table row each)
    cards = data[0].xpath('tr')

    # Loop through each data card and extract the data
    print(f"Season: {season}")
    for card in cards:
        # Team name: text of the first child of the row's second child
        team_name = card[1][0].text_content()

        # Team overall rating: text of the row's third child (kept as a string)
        team_overall = card[2].text_content()

        print(f"{team_name}, {team_overall}")

        # Append the data to the global list
        seasonal_team_rating.append({"Season": season, "Team": team_name, "Rating": team_overall})
        

Page was successfully scraped!

Season: 2022-2023
Manchester City, 85
Liverpool, 84
Arsenal, 82
Chelsea, 82
Manchester United, 82
Tottenham Hotspur, 81
Newcastle United, 80
Aston Villa, 79
West Ham United, 78
Leicester City, 78
Wolverhampton Wanderers, 78
Nottingham Forest, 77
Brighton & Hove Albion, 77
Everton, 76
Crystal Palace, 76
Fulham, 76
Leeds United, 75
Southampton, 75
Brentford, 75
AFC Bournemouth, 74
Page was successfully scraped!

Season: 2021-2022
Liverpool, 85
Manchester City, 85
Chelsea, 84
Manchester United, 83
Tottenham Hotspur, 81
Arsenal, 80
Leicester City, 80
West Ham United, 79
Aston Villa, 78
Everton, 78
Wolverhampton Wanderers, 78
Newcastle United, 77
Burnley, 76
Crystal Palace, 76
Leeds United, 76
Brighton & Hove Albion, 76
Southampton, 76
Watford, 74
Brentford, 74
Norwich City, 73
Page was successfully scraped!

Season: 2020-2021
Liverpool, 85
Manchester City, 85
Chelsea, 82
Manchester United, 82
Tottenham Hotspur, 82
Arsenal, 80
Leicester City, 80
Everton, 79
W

In [69]:
# Convert our list of dictionaries into a Pandas DataFrame.
# The columns are named explicitly (same names and order as the dict keys)
# so that an empty result list still produces a frame with the right header.
df = pd.DataFrame(seasonal_team_rating, columns=["Season", "Team", "Rating"])

# Save the DataFrame to a CSV file. The output folder is created first,
# because to_csv() does not create missing parent directories.
output_path = Path("Dataset/seasonal_team_rating.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)