In [14]:
# Author: Mahmoud Zahran 

# In this notebook, we're scraping data from basketball-reference.com since their data cannot be easily downloaded. For that,
# we're going to make use of BeautifulSoup and regular expressions.

import requests
from bs4 import BeautifulSoup 
import time
import pandas as pd
import re

In [15]:
# We have multiple url links from which we'd like to retrieve data, and they all have the following format:
# 'https://www.basketball-reference.com/awards/awards_{YEAR}.html#mvp', depending on what year it is.
# Therefore, in this part, we're generating all the url links for all years in the range [1956, 2022].
mvp_awards_urls = [
    'https://www.basketball-reference.com/awards/awards_{year}.html#mvp'.format(year=year)
    for year in range(1956, 2023)
]

# Regular expression for extracting the season (e.g. '1955-56') for which the data is listed.
season_regexp = re.compile(r'\d{4}\s*-\s*\d{2}')

# The parser is selected through BeautifulSoup's second argument (`features`). Naming the parser explicitly
# keeps the parsing independent of which optional parser libraries happen to be installed.
HTML_PARSER = 'html.parser'

# Seconds to wait for the server before giving up on a request (requests waits indefinitely by default).
REQUEST_TIMEOUT_SECONDS = 30

# Pause between page downloads to avoid sending too many requests in a short time (results in a ban).
# NOTE(review): confirm that 2 seconds still satisfies the site's current rate limit.
REQUEST_DELAY_SECONDS = 2


def fetch_page(url):
    """Send a GET request to `url` and return the raw content of the response.

    Raises requests.HTTPError when the server answers with an error status, so that an
    error page is never parsed as if it were an awards page.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.content


def aggregate_tag_texts(tag):
    """Return the text wrapped inside of `tag`."""
    return tag.text


# Using a sample url (the first generated link) to setup the column names for the dataframe we're going to generate.
sample_url = mvp_awards_urls[0]

# Sending a get request to the webpage and retrieving the content.
sample_page = fetch_page(sample_url)

# Using BeautifulSoup as an html parser to parse the webpage's html.
soup = BeautifulSoup(sample_page, HTML_PARSER)

# The column names for the dataframe (concatenating 'Season' onto the text labels scraped from the website).
# The column names are found in th_tags, which are in turn found inside the second tr_tag of the page.
columns = ['Season'] + [th_tag.text for th_tag in soup.find_all('tr')[1].find_all('th')]

# Creating an empty pandas dataframe with the obtained column names.
mvps_w_share_df = pd.DataFrame(columns=columns)

# This is going to aggregate all the data from all the webpages.
data = []

# Now, we go over each url link in the generated list of links, send a request, get the content, parse it,
# extract the season from the page heading, and finally, we retrieve the observations (rows) and add
# them to our variable 'data'.
for mvp_url in mvp_awards_urls:
    mvp_page = fetch_page(mvp_url)
    soup = BeautifulSoup(mvp_page, HTML_PARSER)
    season = season_regexp.search(soup.find('div', id='content').h1.text).group()

    # Finding all the rows/observations in the first table body of the page.
    mvp_rows = soup.find('tbody').find_all('tr')

    # Each observation is the season, followed by the text of the row's th_tag, followed by the text
    # of each of the row's td_tags.
    new_data_w_season = [
        [season, tr_row.find('th').text] + [aggregate_tag_texts(td_tag) for td_tag in tr_row.find_all('td')]
        for tr_row in mvp_rows
    ]

    # Adding the result to our data.
    data += new_data_w_season

    # Sleeping between requests to avoid sending too many of them in a short time (results in a ban).
    time.sleep(REQUEST_DELAY_SECONDS)

In [16]:
# Rebuilding the dataframe so that it holds every scraped observation, labelled with the scraped column names.
mvps_w_share_df = pd.DataFrame(data, columns=columns)

mvps_w_share_df

Unnamed: 0,Season,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,...,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1955-56,1,Bob Pettit,23,STL,33.0,33.0,80,0.413,72,...,25.7,16.2,2.6,,,.429,,.736,13.8,.236
1,1955-56,2,Paul Arizin,27,PHW,21.0,21.0,80,0.263,72,...,24.2,7.5,2.6,,,.448,,.810,12.2,.214
2,1955-56,3,Bob Cousy,27,BOS,11.0,11.0,80,0.138,72,...,18.8,6.8,8.9,,,.360,,.844,6.8,.119
3,1955-56,4,Mel Hutchins,27,FTW,9.0,9.0,80,0.113,66,...,12.0,7.5,2.7,,,.425,,.643,4.4,.095
4,1955-56,5T,Dolph Schayes,27,SYR,2.0,2.0,80,0.025,72,...,20.4,12.4,2.8,,,.387,,.858,11.8,.225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,2021-22,8,Stephen Curry,33,GSW,0.0,4.0,1000,0.004,64,...,25.5,5.2,6.3,1.3,0.4,.437,.380,.923,8.0,.173
1019,2021-22,9,Chris Paul,36,PHO,0.0,2.0,1000,0.002,65,...,14.7,4.4,10.8,1.9,0.3,.493,.317,.837,9.4,.210
1020,2021-22,10T,DeMar DeRozan,32,CHI,0.0,1.0,1000,0.001,76,...,27.9,5.2,4.9,0.9,0.3,.504,.352,.877,8.8,.154
1021,2021-22,10T,Kevin Durant,33,BRK,0.0,1.0,1000,0.001,55,...,29.9,7.4,6.4,0.9,0.9,.518,.383,.910,8.4,.198


In [17]:
# As apparent from the dataframe above, tie rankings are appended with a 'T' at the end of the rank number (e.g. '5T').
# In this part, we're going to remove that 'T' and cast the ranks to integers so that the data can be sorted by rank.

# Regex for finding out if a rank includes a 'T'; the digits before the 'T' are captured.
tie_ranking_regexp = re.compile(r'(\d+)T')


def strip_tie_marker(rank):
    """Return `rank` as an int, dropping the 'T' of a tied rank (e.g. '10T' -> 10).

    `rank` is converted to text before matching, so ranks that were already turned into
    integers pass through unchanged and this cell can be re-run safely.
    """
    tie_match = tie_ranking_regexp.match(str(rank))
    return int(tie_match.group(1)) if tie_match else int(rank)


# Integer version of every rank value (because we're going to use that to sort the data).
int_ranks = [strip_tie_marker(rank) for rank in mvps_w_share_df['Rank']]

# Updating the rank column in the dataframe.
mvps_w_share_df['Rank'] = int_ranks

# Sorting the dataframe by Season then Rank (in descending order for Season and ascending for Rank) to get the top
# rankings for each season (1, 2, 3, etc.), with the most recent seasons being located at the top of the dataframe.
# The sorted result is reassigned rather than sorted in place; the original index labels are preserved.
mvps_w_share_df = mvps_w_share_df.sort_values(by=['Season', 'Rank'], ascending=[False, True])

mvps_w_share_df

Unnamed: 0,Season,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,...,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
1011,2021-22,1,Nikola Jokić,26,DEN,65.0,875.0,1000,0.875,74,...,27.1,13.8,7.9,1.5,0.9,.583,.337,.810,15.2,.296
1012,2021-22,2,Joel Embiid,27,PHI,26.0,706.0,1000,0.706,68,...,30.6,11.7,4.2,1.1,1.5,.499,.371,.814,12.0,.252
1013,2021-22,3,Giannis Antetokounmpo,27,MIL,9.0,595.0,1000,0.595,67,...,29.9,11.6,5.8,1.1,1.4,.553,.293,.722,12.9,.281
1014,2021-22,4,Devin Booker,25,PHO,0.0,216.0,1000,0.216,68,...,26.8,5.0,4.8,1.1,0.4,.466,.383,.868,7.6,.156
1015,2021-22,5,Luka Dončić,22,DAL,0.0,146.0,1000,0.146,65,...,28.4,9.1,8.7,1.2,0.6,.457,.353,.744,7.6,.159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,1955-56,4,Mel Hutchins,27,FTW,9.0,9.0,80,0.113,66,...,12.0,7.5,2.7,,,.425,,.643,4.4,.095
4,1955-56,5,Dolph Schayes,27,SYR,2.0,2.0,80,0.025,72,...,20.4,12.4,2.8,,,.387,,.858,11.8,.225
5,1955-56,5,Bill Sharman,29,BOS,2.0,2.0,80,0.025,72,...,19.9,3.6,4.7,,,.438,,.867,8.8,.157
6,1955-56,7,Tom Gola,23,PHW,1.0,1.0,80,0.013,68,...,10.8,9.1,5.9,,,.412,,.733,6.5,.132


In [18]:
# Writing the dataframe out as a CSV file; the index is not written to the file.
csv_path = 'mvps_w_share_df.csv'
mvps_w_share_df.to_csv(csv_path, index=False)