I will be scraping per-game stats and advanced stats from www.basketball-reference.com for the seasons from 1979/80 up to the current one.
Then I will scrape the MVP winners and DPOY winners from Wikipedia.

In [None]:
# importing the necessary libraries
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [None]:
# URL of the 2022 per-game stats page; it is only used here to read the column names
url = "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"

# download the page and create the soup object; the `with` block closes the
# HTTP response once it has been parsed, and naming the parser explicitly keeps
# the result independent of which optional parsers happen to be installed
with urlopen(url) as html:
    soup = BeautifulSoup(html, "html.parser")

# the first <tr> of the page is taken as the header row: collect the text of its <th> cells
header_row = soup.find_all('tr', limit=1)[0]
headers = [th.get_text() for th in header_row.find_all('th')]

# exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
headers = headers[1:]

In [None]:
# we will scrape stats since 1979/80 season
years = list(range(1980, 2023))

# seed frame that fixes the column layout: the scraped headers plus a 'Year' column
df_per_game = pd.DataFrame(columns=headers)
df_per_game['Year'] = None

# one frame per season is collected here and concatenated once after the loop,
# instead of copying the growing frame on every iteration
season_frames = [df_per_game]

# NOTE(review): this sends one request per season back to back — check whether
# the site throttles rapid requests and add a delay if needed
for year in tqdm(years):
    # URL of the page we are scraping for this season
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)

    # the `with` block closes the HTTP response as soon as the page is parsed
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")

    # skip the first <tr> and read the <td> texts of every other row; a row
    # without <td> cells yields an empty list and therefore an all-None row
    # (those rows are kept here and removed later by the dropna on 'Player')
    rows = soup.find_all('tr')[1:]
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

    stats = pd.DataFrame(player_stats, columns=headers)
    stats['Year'] = year
    season_frames.append(stats)

# the per-season row labels are kept as they are (no ignore_index): the later
# axis=1 concat with the advanced stats lines the two frames up by row label
df_per_game = pd.concat(season_frames)

In [None]:
# do the same for advanced stats: the 2022 page is only used to read the column names
url_advanced = "https://www.basketball-reference.com/leagues/NBA_2022_advanced.html"

# the `with` block closes the HTTP response once it has been parsed, and naming
# the parser explicitly keeps the result independent of the installed parsers
with urlopen(url_advanced) as html:
    soup = BeautifulSoup(html, "html.parser")

# the first <tr> of the page is taken as the header row: collect the text of its <th> cells
header_row = soup.find_all('tr', limit=1)[0]
headers = [th.get_text() for th in header_row.find_all('th')]

# drop the first column, as for the per-game table
headers = headers[1:]

In [None]:
# seed frame that fixes the column layout: the scraped headers plus a 'Year' column
df_advanced = pd.DataFrame(columns=headers)
df_advanced['Year'] = None

# one frame per season is collected here and concatenated once after the loop,
# instead of copying the growing frame on every iteration
season_frames = [df_advanced]

# NOTE(review): this sends one request per season back to back — check whether
# the site throttles rapid requests and add a delay if needed
for year in tqdm(years):
    url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)

    # the `with` block closes the HTTP response as soon as the page is parsed
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")

    # skip the first <tr> and read the <td> texts of every other row; a row
    # without <td> cells yields an empty list and therefore an all-None row
    rows = soup.find_all('tr')[1:]
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

    stats = pd.DataFrame(player_stats, columns=headers)
    stats['Year'] = year
    season_frames.append(stats)

# the per-season row labels are kept as they are (no ignore_index): the later
# axis=1 concat with the per-game stats lines the two frames up by row label
df_advanced = pd.concat(season_frames)

# give every column an explicit name; 'empty1' and 'empty2' label two columns
# that are dropped right below (presumably blank spacer columns — TODO confirm)
df_advanced.columns = [
    'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'empty1',
    'OWS', 'DWS', 'WS', 'WS/48', 'empty2', 'OBPM', 'DBPM', 'BPM', 'VORP',
    'Year',
]

# drop unnecessary columns, which we have in the per-game df
df_advanced = df_advanced.drop(
    columns=['Player','Pos','Age','Tm','G','MP','empty1','empty2','Year']
)

In [None]:
# now we join the per-game and advanced stats together, row by row.
# Both frames were built by stacking one frame per season, so their row labels
# repeat, and pd.concat(axis=1) lines rows up by label. Joining by position
# instead only makes sense when both frames have the same number of rows, so
# that is checked first and reported with a clear message.
if len(df_per_game) != len(df_advanced):
    raise ValueError(
        "per-game and advanced stats have different row counts "
        "({} vs {}); cannot join them row by row".format(len(df_per_game), len(df_advanced))
    )

# fresh 0..n-1 labels on both sides make the side-by-side join purely positional
df = pd.concat(
    [df_per_game.reset_index(drop=True), df_advanced.reset_index(drop=True)],
    axis=1,
)

In [None]:
# tidy up the player name column
# 1) keep only the rows that actually carry a player name
has_player = df['Player'].notna()
df = df.loc[has_player].copy()

# 2) keep only the part of the name that precedes the first '*'
name_parts = df['Player'].str.split('*')
df['Player'] = name_parts.str.get(0)

In [None]:
# let's see what it looks like: preview the first rows of the combined stats frame
df.head()

In [None]:
# now let's get the MVP winners from Wikipedia
# NOTE(review): the winners table is picked by its position (index 6) among the
# tables on the page — confirm the index still matches the page layout
mvp_winners = pd.read_html('https://en.wikipedia.org/wiki/NBA_Most_Valuable_Player_Award#Winners')[6]

# again some data cleaning: keep the part of the season label before the '–'
# and add 1, so the value can be matched against the 'Year' column of the stats
mvp_winners['Season'] = mvp_winners['Season'].str.split('–').str[0].astype(int) + 1

# cut each name at the first marker character; '[' (footnotes) is handled the
# same way as for the DPOY winners, and '(' removes a trailing parenthesised
# suffix (presumably an award tally such as "(2)" — TODO confirm against the page)
for marker in ('*', '^', '[', '('):
    mvp_winners['Player'] = mvp_winners['Player'].str.split(marker).str[0]

# surrounding whitespace would stop the name from matching the stats rows in the merge
mvp_winners['Player'] = mvp_winners['Player'].str.strip()

# we get the winners from season 1979/80 onwards
mvp_winners = mvp_winners[mvp_winners['Season'] > 1979].copy()

# assign 1 which we then merge to the stats df
mvp_winners['MVP'] = 1
mvp_winners.head()

In [None]:
# now we merge the MVP winners df: a left join on player name and year keeps
# every stats row and adds the MVP flag where a winner matches
mvp_flags = mvp_winners[['Player','Season','MVP']].rename(columns={'Season': 'Year'})
df = df.merge(mvp_flags, on=['Player','Year'], how='left')

# data cleaning: rows without a matching winner come back as NaN, so turn them
# into 0 and store the flag as an integer. The result is assigned back to the
# column, because calling fillna(..., inplace=True) on df['MVP'] may act on a
# temporary object and leave df unchanged (pandas Copy-on-Write), which would
# then make the integer conversion fail on the remaining NaN values.
df['MVP'] = df['MVP'].fillna(0).astype(int)

In [None]:
# same for dpoy winners
# NOTE(review): the winners table is picked by its position (index 6) among the
# tables on the page — confirm the index still matches the page layout
dpoy_winners = pd.read_html('https://en.wikipedia.org/wiki/NBA_Defensive_Player_of_the_Year_Award')[6]

# keep the part of the season label before the '–' and add 1, so the value can
# be matched against the 'Year' column of the stats
dpoy_winners['Season'] = dpoy_winners['Season'].str.split('–').str[0].astype(int) + 1

# cut each name at the first marker character; '(' removes a trailing
# parenthesised suffix (presumably an award tally such as "(2)" — TODO confirm
# against the page), which would otherwise stop the name from matching
for marker in ('*', '^', '[', '('):
    dpoy_winners['Player'] = dpoy_winners['Player'].str.split(marker).str[0]

# surrounding whitespace would stop the name from matching the stats rows in the merge
dpoy_winners['Player'] = dpoy_winners['Player'].str.strip()

# we get the winners from season 1979/80 onwards
dpoy_winners = dpoy_winners[dpoy_winners['Season'] > 1979].copy()

# assign 1 which we then merge to the stats df
dpoy_winners['DPOY'] = 1
dpoy_winners.head()

In [None]:
# merge the dpoy winners: a left join on player name and year keeps every
# stats row and adds the DPOY flag where a winner matches
dpoy_flags = dpoy_winners[['Player','Season','DPOY']].rename(columns={'Season': 'Year'})
df = df.merge(dpoy_flags, on=['Player','Year'], how='left')

# rows without a matching winner come back as NaN, so turn them into 0 and
# store the flag as an integer. The result is assigned back to the column,
# because calling fillna(..., inplace=True) on df['DPOY'] may act on a
# temporary object and leave df unchanged (pandas Copy-on-Write), which would
# then make the integer conversion fail on the remaining NaN values.
df['DPOY'] = df['DPOY'].fillna(0).astype(int)

In [None]:
# columns that should hold numbers: the scraped stats, the year and the two award flags
numeric_columns = [
    'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'Year', 'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
    'MVP', 'DPOY',
]

# run pd.to_numeric over each listed column and write the converted columns back
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

In [None]:
# write to csv
import os

# to_csv does not create missing folders, so make sure 'data' exists first
os.makedirs('data', exist_ok=True)

# the row labels are not written (index=False), only the columns
df.to_csv('data/stats.csv', index=False)