In [1]:
## Imports
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
## Fetching the webpage with `requests`
# Page to scrape: BBC Sport's Premier League top-scorers table
url = "https://www.bbc.com/sport/football/premier-league/top-scorers"
# GET request. Without a timeout `requests` waits indefinitely on a silent server,
# so give up (raising `requests.Timeout`) if no response arrives within 10 seconds.
response = requests.get(url, timeout=10)

In [3]:
# Error check: `raise_for_status()` raises for an unsuccessful HTTP status and
# returns None when there is nothing to report, so "None" printed here means success.
status_check = response.raise_for_status()
print(status_check)

None


In [4]:
# HTTP status code returned by the server (200 = OK, the request succeeded)
response.status_code

200

In [5]:
# `.text` is the response body as a `str`; slice to the first 200 characters
# so the cell output stays short
response.text[:200]

'<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title data-rh="true">Premier League Top Scorers - B'

In [6]:
# Confirm that `.text` is a decoded string (`str`)
type(response.text)

str

In [7]:
# `.content` is the response body as raw `bytes`; slice to the first 200 bytes
# so the cell output stays short
response.content[:200]

b'<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title data-rh="true">Premier League Top Scorers - B'

In [8]:
# Confirm that `.content` is undecoded binary data (`bytes`)
type(response.content)

bytes

In [9]:
# The soup object can be built from either `response.text` or `response.content`:
#   - with `text` (already decoded) we must be certain the encoding is right
#   - with `content` (raw bytes) Beautiful Soup mostly handles the encoding itself,
#     which makes it the less error-prone choice
# Build the soup object with Python's built-in HTML parser
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [10]:
# Uncomment the line below to print the full, pretty-printed HTML source of the page fetched above
# print(soup.prettify())

In [11]:
## HTML parsing with `Beautiful Soup`: a mini project
# Plan:
#   1. scrape data from a public website
#   2. organize the data as a DataFrame
#   3. export the data as an Excel sheet
# One (initially empty) accumulator list per column we want to collect
player_names, team_names, goals = [], [], []
assists, num_matches, shots = [], [], []

In [12]:
# Scrape the top-scorers table and build `df_players`.
# The cell is self-contained and idempotent: it re-fetches the page and rebuilds
# every accumulator list from scratch, so re-running it never duplicates rows.

# Exact `class` attribute strings of the elements we read. They look auto-generated,
# so presumably they change when the site is redeployed — update them here if
# the parsing below stops finding anything.
ROW_CLASS = 'ssrcss-dhlz6k-TableRowBody e1icz100'
PLAYER_NAME_CLASS = 'ssrcss-m6ah29-PlayerName e1n8xy5b1'
TEAM_NAME_CLASS = 'ssrcss-qvpga1-TeamsSummary e1n8xy5b0'
GOALS_CELL_CLASS = 'ssrcss-8k20kk-CellWrapper ef9ipf0'
STAT_CELL_CLASS = 'ssrcss-150z8d-CellWrapper ef9ipf0'

try:
	# the timeout keeps the cell from hanging forever on an unresponsive server
	response = requests.get(url, timeout=10)
	response.raise_for_status()
except requests.RequestException as e:
	# connection problem, timeout or unsuccessful HTTP status: report it and skip parsing
	print(e)
else:
	soup = BeautifulSoup(response.content, 'html.parser')
	table_body = soup.find('tbody')
	if table_body is None:
		# fail with a clear message instead of an AttributeError on None
		raise RuntimeError(f"No <tbody> element found at {url}; the page layout may have changed")
	# `class_` is used because `class` is a reserved keyword in Python
	players = table_body.find_all('tr', class_=ROW_CLASS)

	# start from empty lists on every run so re-executing the cell does not append duplicates
	player_names, team_names, goals = [], [], []
	assists, num_matches, shots = [], [], []

	for player in players:
		# `.get_text(strip=True)` returns the element's text with surrounding whitespace removed
		player_name = player.find('div', class_=PLAYER_NAME_CLASS).get_text(strip=True)
		team_name = player.find('div', class_=TEAM_NAME_CLASS).get_text(strip=True)
		goals_scored = int(player.find('div', class_=GOALS_CELL_CLASS).get_text(strip=True))

		# The remaining numeric cells share one class; each statistic is picked by its
		# position in the row. NOTE(review): indices assume the current column order — confirm.
		stats = player.find_all('div', class_=STAT_CELL_CLASS)
		assists_made = int(stats[0].get_text(strip=True))
		matches_played = int(stats[2].get_text(strip=True))
		shots_taken = int(stats[-3].get_text(strip=True))

		player_names.append(player_name)
		team_names.append(team_name)
		goals.append(goals_scored)
		assists.append(assists_made)
		num_matches.append(matches_played)
		shots.append(shots_taken)

	# column name -> list of values, one entry per scraped table row
	data = {
		'player': player_names,
		'team': team_names,
		'matches': num_matches,
		'goals': goals,
		'assists': assists,
		'shots': shots
	}
	df_players = pd.DataFrame(data)

In [13]:
# Preview only the first ten rows instead of dumping the whole frame
df_players.head(10)

Unnamed: 0,player,team,matches,goals,assists,shots
0,Mohamed Salah,Liverpool,29,27,17,106
1,E. Haaland,Man City,28,21,3,102
2,A. Isak,Newcastle,25,19,5,69
3,C. Wood,Nottm Forest,29,18,3,53
4,B. Mbeumo,Brentford,29,15,5,62
5,C. Palmer,Chelsea,28,14,6,102
6,Y. Wissa,Brentford,26,14,2,63
7,O. Watkins,Aston Villa,29,13,6,72
8,Matheus Cunha,Wolves,26,13,4,86
9,J. Kluivert,Bournemouth,28,12,6,56


In [14]:
# Structural summary: row count, column names, non-null counts, dtypes and memory usage
df_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   player   39 non-null     object
 1   team     39 non-null     object
 2   matches  39 non-null     int64 
 3   goals    39 non-null     int64 
 4   assists  39 non-null     int64 
 5   shots    39 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 2.0+ KB


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_players.describe()

Unnamed: 0,matches,goals,assists,shots
count,39.0,39.0,39.0,39.0
mean,26.153846,10.102564,4.076923,58.153846
std,2.611094,4.627122,2.7471,19.18818
min,19.0,7.0,0.0,31.0
25%,25.0,7.0,3.0,44.0
50%,27.0,8.0,4.0,52.0
75%,28.0,12.0,5.0,64.5
max,29.0,27.0,17.0,106.0


In [16]:
# Export the scraped table to an Excel workbook (path is relative to the working directory);
# index=False leaves the DataFrame's row index out of the sheet.
excel_path = 'BBCSports-Top-Scorers.xlsx'
df_players.to_excel(excel_path, index=False)