# Web Scraping Basketball Reference Data

I will be using the website Basketball-reference.com to scrape player data from multiple seasons. The site provides clean sports data, which makes it easy to scrape and to build my own database from the data gathered there.

Now let's import all the libraries we need to begin web scraping the data from the website.

In [1]:
# import needed libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

We will now create the function to gather data from the website. It takes the year in which the season ends (for example, 2021 for the 2020-2021 season).

In [19]:
# function to gather data from the web parameter is the year
def scrape_nba_data(years):
    """Scrape one season of per-game player statistics from Basketball Reference.

    Args:
        years: Value substituted into the page URL; the notebook passes the
            year in which the season ends (``2021`` requests
            ``NBA_2021_per_game.html``).

    Returns:
        A ``pandas.DataFrame`` built from the table rows of the page, or
        ``None`` when the page cannot be downloaded or parsed or contains
        no table rows. Rows with a missing cell are dropped, empty cells
        become ``0``, every column except ``Player``, ``Pos`` and ``Team``
        is converted to numbers, a ``Year`` column holding ``years`` is
        added and an ``Awards`` column, if present, is removed.
    """
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(years)

    try:
        # The context manager closes the HTTP response as soon as it is parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, features="lxml")
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    table_rows = soup.find_all('tr')
    if not table_rows:
        # Same failure signal as a download error, instead of an IndexError.
        print(f"An error occurred: no table rows found at {url}")
        return None

    # The first row holds the column titles. 'Rk' is removed because it is
    # rendered as a <th> in the data rows and so never shows up among the
    # <td> cells collected below.
    headers = [th.get_text() for th in table_rows[0].find_all('th')]
    headers.remove('Rk')

    rows_data = [
        [td.get_text() for td in row.find_all('td')]
        for row in table_rows[1:]
    ]

    # Rows without <td> cells (such as repeated header rows) come out as
    # all-missing and are removed by dropna.
    df = pd.DataFrame(rows_data, columns=headers)
    df = df.dropna(axis=0).reset_index(drop=True)

    # Replace empty string with '0'
    df = df.replace('', '0')

    # Drop the awards column before the numeric conversion so its free-text
    # values are not needlessly coerced.
    if 'Awards' in df.columns:
        df = df.drop('Awards', axis=1)

    # Convert numeric columns; values that cannot be parsed become NaN.
    numeric_columns = df.columns.drop(['Player', 'Pos', 'Team'])
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Add 'Year' column
    df['Year'] = years

    return df

Now that we have created the function to gather data, let's test it out by looking at the 2020-2021 season.

In [20]:
# Test to make sure the function works: scrape the 2020-2021 season
# (URL year 2021) and display the resulting DataFrame
NBA_season_2020_2021 = scrape_nba_data(2021)
NBA_season_2020_2021

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Stephen Curry,32,GSW,PG,63,63,34.2,10.4,21.7,0.482,...,0.5,5.0,5.5,5.8,1.2,0.1,3.4,1.9,32.0,2021
1,Bradley Beal,27,WAS,SG,60,60,35.8,11.2,23.0,0.485,...,1.2,3.5,4.7,4.4,1.2,0.4,3.1,2.3,31.3,2021
2,Damian Lillard,30,POR,PG,67,67,35.8,9.0,19.9,0.451,...,0.5,3.7,4.2,7.5,0.9,0.3,3.0,1.5,28.8,2021
3,Joel Embiid,26,PHI,C,51,51,31.1,9.0,17.6,0.513,...,2.2,8.4,10.6,2.8,1.0,1.4,3.1,2.4,28.5,2021
4,Giannis Antetokounmpo,26,MIL,PF,61,61,33.0,10.3,18.0,0.569,...,1.6,9.4,11.0,5.9,1.2,1.2,3.4,2.8,28.1,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,Miye Oni,23,UTA,SG,8,0,5.1,0.0,0.6,0.000,...,0.4,0.0,0.4,0.0,0.1,0.0,0.0,1.0,0.0,2021
942,Matt Thomas,26,UTA,SG,3,0,2.3,0.0,0.7,0.000,...,0.0,0.3,0.3,0.0,0.0,0.0,0.0,0.3,0.0,2021
943,Isaac Bonga,21,WAS,SF,4,0,2.5,0.0,1.3,0.000,...,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,2021
944,Anthony Gill,28,WAS,PF,4,0,8.3,0.0,0.8,0.000,...,0.3,0.8,1.0,0.0,0.0,0.0,0.8,1.8,0.0,2021


In [8]:
pip install mysql-connector-python

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Now to create a database in order to hold all the data we just got from Basketball Reference

In [30]:
import getpass
import os

import mysql.connector

# Establish the connection. Credentials are read from the environment
# (MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST); when no password is set there,
# it is requested interactively so that no secret is stored in the notebook.
database = mysql.connector.connect(
    user=os.environ.get('MYSQL_USER', 'root'),
    password=os.environ.get('MYSQL_PASSWORD') or getpass.getpass('MySQL password: '),
    host=os.environ.get('MYSQL_HOST', 'localhost'),
)

# Create a cursor object
cursor = database.cursor()

# Create the new database
cursor.execute("CREATE DATABASE IF NOT EXISTS nba_database")

# Use the new database
cursor.execute("USE nba_database")

Now we will create the table that we will use to store the data.

In [31]:
# Remove any existing player_stats table so the CREATE TABLE statement
# further down does not fail when the notebook is re-run
cursor.execute("DROP TABLE IF EXISTS player_stats")

In [32]:
# List the DataFrame's column names so the table schema can be matched to them
NBA_season_2020_2021.columns

Index(['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year'],
      dtype='object')

In [33]:
# Create a new table with one column per DataFrame column. The DataFrame
# columns whose names contain '%' (FG%, 3P%, 2P%, eFG%, FT%) are stored
# under *_PCT names here.
cursor.execute("""CREATE TABLE player_stats
                (Player VARCHAR(50),
                Pos VARCHAR(10),
                Age INT,
                Team VARCHAR(10),
                G INT,
                GS INT,
                MP FLOAT,
                FG FLOAT,
                FGA FLOAT,
                FG_PCT FLOAT,
                3P FLOAT,
                3PA FLOAT,
                3P_PCT FLOAT,
                2P FLOAT,
                2PA FLOAT,
                2P_PCT FLOAT,
                EFG_PCT FLOAT,
                FT FLOAT,
                FTA FLOAT,
                FT_PCT FLOAT,
                ORB FLOAT,
                DRB FLOAT,
                TRB FLOAT,
                AST FLOAT,
                STL FLOAT,
                BLK FLOAT,
                TOV FLOAT,
                PF FLOAT,
                PTS FLOAT,
                Year INT)
               """)

Finally, we can start adding values to our new table by creating a function that takes a pandas DataFrame and inserts every row into the database.

In [34]:
# Create a function to insert data into the table with a parameter of a dataframe
def insert_data(nba_season):
    """Insert every row of a season DataFrame into the ``player_stats`` table.

    Relies on the notebook-level ``cursor`` and ``database`` objects. All
    rows are sent with a single ``executemany`` call and then committed.

    Args:
        nba_season: DataFrame containing the columns named as keys of
            ``column_map`` below (the layout produced by
            ``scrape_nba_data``).
    """
    # DataFrame column -> table column, in table order. The '%' columns are
    # stored under *_PCT names in the table.
    column_map = {
        'Player': 'Player', 'Pos': 'Pos', 'Age': 'Age', 'Team': 'Team',
        'G': 'G', 'GS': 'GS', 'MP': 'MP',
        'FG': 'FG', 'FGA': 'FGA', 'FG%': 'FG_PCT',
        '3P': '3P', '3PA': '3PA', '3P%': '3P_PCT',
        '2P': '2P', '2PA': '2PA', '2P%': '2P_PCT',
        'eFG%': 'EFG_PCT',
        'FT': 'FT', 'FTA': 'FTA', 'FT%': 'FT_PCT',
        'ORB': 'ORB', 'DRB': 'DRB', 'TRB': 'TRB',
        'AST': 'AST', 'STL': 'STL', 'BLK': 'BLK',
        'TOV': 'TOV', 'PF': 'PF', 'PTS': 'PTS',
        'Year': 'Year',
    }
    table_columns = ", ".join(column_map.values())
    placeholders = ", ".join(["%s"] * len(column_map))
    query = f"INSERT INTO player_stats({table_columns}) VALUES({placeholders})"

    # astype(object) turns NumPy scalars into plain Python values for the
    # driver; NaN (left behind by the numeric coercion while scraping) is
    # sent as NULL instead of an invalid 'nan' literal.
    season_values = nba_season[list(column_map)].astype(object)
    records = [
        tuple(None if pd.isna(value) else value for value in record)
        for record in season_values.itertuples(index=False, name=None)
    ]

    if records:
        cursor.executemany(query, records)

    # commit the transaction
    database.commit()
    # Consume any remaining unread results
    while cursor.nextset():
        pass


Now to insert the player stats from the 2020-2021 season into the database.

In [35]:
# Load the scraped 2020-2021 season into the player_stats table
insert_data(NBA_season_2020_2021)

Test to make sure we were able to correctly insert data into the database by looking at the first 5 rows.

In [36]:
# Read back five rows to confirm the insert worked
query = """
    SELECT * 
    FROM player_stats
    LIMIT 5
    """

cursor.execute(query)

# fetchall() retrieves every row produced by the query
rows = cursor.fetchall()

# Print one row per line
for row in rows:
    print(row)

('Stephen Curry', 'PG', 32, 'GSW', 63, 63, 34.2, 10.4, 21.7, 0.482, 5.3, 12.7, 0.421, 5.1, 9.0, 0.569, 0.605, 5.7, 6.3, 0.916, 0.5, 5.0, 5.5, 5.8, 1.2, 0.1, 3.4, 1.9, 32.0, 2021)
('Bradley Beal', 'SG', 27, 'WAS', 60, 60, 35.8, 11.2, 23.0, 0.485, 2.2, 6.2, 0.349, 9.0, 16.8, 0.535, 0.532, 6.8, 7.7, 0.889, 1.2, 3.5, 4.7, 4.4, 1.2, 0.4, 3.1, 2.3, 31.3, 2021)
('Damian Lillard', 'PG', 30, 'POR', 67, 67, 35.8, 9.0, 19.9, 0.451, 4.1, 10.5, 0.391, 4.9, 9.4, 0.519, 0.554, 6.7, 7.2, 0.928, 0.5, 3.7, 4.2, 7.5, 0.9, 0.3, 3.0, 1.5, 28.8, 2021)
('Joel Embiid', 'C', 26, 'PHI', 51, 51, 31.1, 9.0, 17.6, 0.513, 1.1, 3.0, 0.377, 7.9, 14.6, 0.541, 0.545, 9.2, 10.7, 0.859, 2.2, 8.4, 10.6, 2.8, 1.0, 1.4, 3.1, 2.4, 28.5, 2021)
('Giannis Antetokounmpo', 'PF', 26, 'MIL', 61, 61, 33.0, 10.3, 18.0, 0.569, 1.1, 3.6, 0.303, 9.2, 14.4, 0.636, 0.6, 6.5, 9.5, 0.685, 1.6, 9.4, 11.0, 5.9, 1.2, 1.2, 3.4, 2.8, 28.1, 2021)


In [37]:
query = """ 
        SELECT * 
        FROM player_stats
        LIMIT 5
        """

# pandas.read_sql_query only officially supports SQLAlchemy connectables
# (and sqlite3) and emits a UserWarning for a raw mysql-connector
# connection, so the query is run on the existing cursor and the frame is
# built from the fetched rows; column names come from cursor.description.
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
print(df)

                  Player Pos  Age Team   G  GS    MP    FG   FGA  FG_PCT  ...  \
0          Stephen Curry  PG   32  GSW  63  63  34.2  10.4  21.7   0.482  ...   
1           Bradley Beal  SG   27  WAS  60  60  35.8  11.2  23.0   0.485  ...   
2         Damian Lillard  PG   30  POR  67  67  35.8   9.0  19.9   0.451  ...   
3            Joel Embiid   C   26  PHI  51  51  31.1   9.0  17.6   0.513  ...   
4  Giannis Antetokounmpo  PF   26  MIL  61  61  33.0  10.3  18.0   0.569  ...   

   ORB  DRB   TRB  AST  STL  BLK  TOV   PF   PTS  Year  
0  0.5  5.0   5.5  5.8  1.2  0.1  3.4  1.9  32.0  2021  
1  1.2  3.5   4.7  4.4  1.2  0.4  3.1  2.3  31.3  2021  
2  0.5  3.7   4.2  7.5  0.9  0.3  3.0  1.5  28.8  2021  
3  2.2  8.4  10.6  2.8  1.0  1.4  3.1  2.4  28.5  2021  
4  1.6  9.4  11.0  5.9  1.2  1.2  3.4  2.8  28.1  2021  

[5 rows x 30 columns]


  df = pd.read_sql_query(query, database)


Now that we can see that it worked we will now be able to gather more data from Basketball reference and store into the database for future analytical work on the data. 

In [38]:
# Display the 2020-2021 DataFrame again for reference
NBA_season_2020_2021

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Stephen Curry,32,GSW,PG,63,63,34.2,10.4,21.7,0.482,...,0.5,5.0,5.5,5.8,1.2,0.1,3.4,1.9,32.0,2021
1,Bradley Beal,27,WAS,SG,60,60,35.8,11.2,23.0,0.485,...,1.2,3.5,4.7,4.4,1.2,0.4,3.1,2.3,31.3,2021
2,Damian Lillard,30,POR,PG,67,67,35.8,9.0,19.9,0.451,...,0.5,3.7,4.2,7.5,0.9,0.3,3.0,1.5,28.8,2021
3,Joel Embiid,26,PHI,C,51,51,31.1,9.0,17.6,0.513,...,2.2,8.4,10.6,2.8,1.0,1.4,3.1,2.4,28.5,2021
4,Giannis Antetokounmpo,26,MIL,PF,61,61,33.0,10.3,18.0,0.569,...,1.6,9.4,11.0,5.9,1.2,1.2,3.4,2.8,28.1,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,Miye Oni,23,UTA,SG,8,0,5.1,0.0,0.6,0.000,...,0.4,0.0,0.4,0.0,0.1,0.0,0.0,1.0,0.0,2021
942,Matt Thomas,26,UTA,SG,3,0,2.3,0.0,0.7,0.000,...,0.0,0.3,0.3,0.0,0.0,0.0,0.0,0.3,0.0,2021
943,Isaac Bonga,21,WAS,SF,4,0,2.5,0.0,1.3,0.000,...,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,2021
944,Anthony Gill,28,WAS,PF,4,0,8.3,0.0,0.8,0.000,...,0.3,0.8,1.0,0.0,0.0,0.0,0.8,1.8,0.0,2021


Let's gather data from the last 15 NBA seasons (2009-2010 through 2023-2024), including the current season.

In [39]:
# Gathering data using the function we created to scrape and clean the data.
# The argument is the year in which each season ends, so 2010..2024 covers
# the 2009-2010 through 2023-2024 seasons. scrape_nba_data returns None when
# a page cannot be fetched, so a failed season shows up as None here.
NBA_season_2023_2024 = scrape_nba_data(2024)
NBA_season_2022_2023 = scrape_nba_data(2023)
NBA_season_2021_2022 = scrape_nba_data(2022)
# 2020-2021 was already scraped in the test cell; this fetches it again
NBA_season_2020_2021 = scrape_nba_data(2021)
NBA_season_2019_2020 = scrape_nba_data(2020)
NBA_season_2018_2019 = scrape_nba_data(2019)
NBA_season_2017_2018 = scrape_nba_data(2018)
NBA_season_2016_2017 = scrape_nba_data(2017)
NBA_season_2015_2016 = scrape_nba_data(2016)
NBA_season_2014_2015 = scrape_nba_data(2015)
NBA_season_2013_2014 = scrape_nba_data(2014)
NBA_season_2012_2013 = scrape_nba_data(2013)
NBA_season_2011_2012 = scrape_nba_data(2012)
NBA_season_2010_2011 = scrape_nba_data(2011)
NBA_season_2009_2010 = scrape_nba_data(2010)

Now we will insert the player statistics we gathered for the remaining seasons into the database we created earlier. The 2020-2021 season was already inserted above, so it is not inserted again here.

In [40]:
# Here we are inserting the data into the database with the function we created.
# NBA_season_2020_2021 is not in this list: an insert_data call for it appears
# earlier in the notebook, so repeating it here would presumably duplicate
# those rows.
insert_data(NBA_season_2023_2024)
insert_data(NBA_season_2022_2023)
insert_data(NBA_season_2021_2022)
insert_data(NBA_season_2019_2020)
insert_data(NBA_season_2018_2019)
insert_data(NBA_season_2017_2018)
insert_data(NBA_season_2016_2017)
insert_data(NBA_season_2015_2016)
insert_data(NBA_season_2014_2015)
insert_data(NBA_season_2013_2014)
insert_data(NBA_season_2012_2013)
insert_data(NBA_season_2011_2012)
insert_data(NBA_season_2010_2011)
insert_data(NBA_season_2009_2010)


In [41]:
# Count the rows in the table to make sure the data was inserted.
query = """ 
        SELECT COUNT(*)
        FROM player_stats
        """

# Execute the query
cursor.execute(query)

# Fetch the single result row; its first element is the count
rows = cursor.fetchone()

print("Number of rows in the table: ", rows[0])


Number of rows in the table:  12920


Now that we know the data is stored in the database correctly, we can continue by making some visualizations to get some insight from all the data we just collected. Afterwards, we will move to Tableau to create a dashboard that answers some analytical questions.

The first question I would like to answer is whether there is any correlation between a player's age and their performance (points per game, assists, rebounds, etc.).

First, import the libraries needed.

In [42]:
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
sns.set()

In [43]:
# Let's get the oldest player of the 2020-2021 season: keep the row(s)
# whose Age equals the maximum Age in the DataFrame
old_player = NBA_season_2020_2021[NBA_season_2020_2021['Age'] == NBA_season_2020_2021['Age'].max()]
print(old_player)


            Player  Age Team Pos  G  GS   MP   FG  FGA  FG%  ...  ORB  DRB  \
563  Udonis Haslem   40  MIA   C  1   0  3.0  2.0  2.0  1.0  ...  0.0  1.0   

     TRB  AST  STL  BLK  TOV   PF  PTS  Year  
563  1.0  0.0  0.0  0.0  0.0  0.0  4.0  2021  

[1 rows x 30 columns]


We can't use this player because he appeared in only one game. Let's instead look at someone who averages at least 20 minutes per game to get better results.

In [44]:
# Get the oldest player with at least 20 minutes played per game:
# filter on MP, sort by Age in descending order and keep the first row
old_player = NBA_season_2020_2021[NBA_season_2020_2021['MP'] >= 20.0].sort_values('Age', ascending=False).head(1)
print(old_player)

             Player  Age Team Pos   G  GS    MP   FG  FGA    FG%  ...  ORB  \
531  Andre Iguodala   37  MIA  SF  63   5  21.3  1.5  3.9  0.383  ...  0.6   

     DRB  TRB  AST  STL  BLK  TOV   PF  PTS  Year  
531  2.9  3.5  2.3  0.9  0.6  1.1  1.4  4.4  2021  

[1 rows x 30 columns]


In [45]:
# Get the youngest player with at least 20 minutes played per game:
# same filter as for the oldest player, but sorted by Age in ascending order
youngest_player = NBA_season_2020_2021[NBA_season_2020_2021['MP'] >= 20.0].sort_values('Age', ascending=True).head(1)
print(youngest_player)

               Player  Age Team Pos   G  GS    MP   FG  FGA    FG%  ...  ORB  \
292  Patrick Williams   19  CHI  PF  71  71  27.9  3.6  7.4  0.483  ...  0.9   

     DRB  TRB  AST  STL  BLK  TOV   PF  PTS  Year  
292  3.7  4.6  1.4  0.9  0.6  1.4  1.8  9.2  2021  

[1 rows x 30 columns]


In [46]:
# Statistics shown on the x-axis of the comparison chart
stats = ['PTS', 'AST', 'TRB']

# Player names, used as the legend entries
old_player_name = old_player['Player'].values[0]
youngest_player_name = youngest_player['Player'].values[0]

# One flat array of values per player, ordered like `stats`
old_plaer_stats = old_player[stats].values.flatten()
young_player_stats = youngest_player[stats].values.flatten()

# Build the figure from the two bar series, oldest player first
player_bars = [
    go.Bar(name=old_player_name, x = stats, y = old_plaer_stats),
    go.Bar(name=youngest_player_name, x = stats, y = young_player_stats),
]
fig = go.Figure(data=player_bars)

# Draw the two series side by side rather than stacked
fig.update_layout(barmode='group')

# Render the chart
fig.show()

Just by looking at the graph, we can see that the younger player has better stats than the older one, but the two players also differ in minutes and games played.

Next, using the 2022-2023 season, we are going to look at the oldest and youngest players who play the same position and compare them to see whether there is any difference between the two.

In [47]:
def _age_extreme(season, position, *, ascending):
    """Return the first row of ``season`` for ``position`` after sorting by Age.

    Only rows with at least 20.0 in the ``MP`` column are considered.
    ``ascending=True`` gives the youngest such player, ``ascending=False``
    the oldest. The result is a DataFrame with at most one row.
    """
    regulars = season[(season['Pos'] == position) & (season['MP'] >= 20.0)]
    return regulars.sort_values('Age', ascending=ascending).head(1)


# Get data for youngest and oldest PG
youngest_pg = _age_extreme(NBA_season_2022_2023, 'PG', ascending=True)
oldest_pg = _age_extreme(NBA_season_2022_2023, 'PG', ascending=False)

# Get data for the youngest and oldest SG
youngest_sg = _age_extreme(NBA_season_2022_2023, 'SG', ascending=True)
oldest_sg = _age_extreme(NBA_season_2022_2023, 'SG', ascending=False)

# Get data for youngest and oldest SF
youngest_sf = _age_extreme(NBA_season_2022_2023, 'SF', ascending=True)
oldest_sf = _age_extreme(NBA_season_2022_2023, 'SF', ascending=False)

# Get the data for the youngest and oldest PF
youngest_pf = _age_extreme(NBA_season_2022_2023, 'PF', ascending=True)
oldest_pf = _age_extreme(NBA_season_2022_2023, 'PF', ascending=False)

# Get the data for the youngest and oldest C
youngest_c = _age_extreme(NBA_season_2022_2023, 'C', ascending=True)
oldest_c = _age_extreme(NBA_season_2022_2023, 'C', ascending=False)



In [48]:
# Positions, in the order used by every list below
positions = ['PG', 'SG', 'SF', 'PF', 'C']

# One-row frames per position, ordered like `positions`
youngest_by_position = [youngest_pg, youngest_sg, youngest_sf, youngest_pf, youngest_c]
oldest_by_position = [oldest_pg, oldest_sg, oldest_sf, oldest_pf, oldest_c]

# Name and age of the youngest player at each position
youngest_names = [frame['Player'].values[0] for frame in youngest_by_position]
youngest_ages = [frame['Age'].values[0] for frame in youngest_by_position]

# Name and age of the oldest player at each position
oldest_names = [frame['Player'].values[0] for frame in oldest_by_position]
oldest_ages = [frame['Age'].values[0] for frame in oldest_by_position]

# One bar series per group; the player's name is printed on each bar
fig = go.Figure()
fig.add_trace(go.Bar(name='Youngest', x=positions, y=youngest_ages, text=youngest_names, textposition='auto'))
fig.add_trace(go.Bar(name='Oldest', x=positions, y=oldest_ages, text=oldest_names, textposition='auto'))

# Place the two series side by side for each position
fig.update_layout(barmode='group')

# Render the chart
fig.show()

Here we just wanted to see the age difference between the oldest and the youngest player at each position. Now let's look at how the stats compare for each player.

In [49]:
# One-row frames in output order: the five youngest players (PG, SG, SF,
# PF, C) followed by the five oldest in the same position order
selected_players = [
    youngest_pg, youngest_sg, youngest_sf, youngest_pf, youngest_c,
    oldest_pg, oldest_sg, oldest_sf, oldest_pf, oldest_c,
]

# Assemble the table column by column; each value is taken from the single
# row of the corresponding frame
player_columns = {'Position': ['PG', 'SG', 'SF', 'PF', 'C'] * 2}
for column in ['Player', 'Age', 'PTS', 'AST', 'TRB']:
    player_columns[column] = [frame[column].values[0] for frame in selected_players]
player_columns['Type'] = ['Youngest'] * 5 + ['Oldest'] * 5
players_df = pd.DataFrame(player_columns)

# Create a bar chart for the players
fig = go.Figure()

# Add one bar series per stat; bars are coloured by whether the player is
# the youngest or the oldest at the position
type_colors = {'Youngest': 'blue', 'Oldest': 'orange'}
for stat in ['Age', 'PTS', 'AST', 'TRB']:
    fig.add_trace(go.Bar(
        name=stat,
        x=players_df['Player'],
        y=players_df[stat],
        marker_color=players_df['Type'].map(type_colors),
    ))

# Group the series per player and set the chart title
fig.update_layout(barmode='group', title_text='Stats for Youngest and Oldest Players')

# Render the chart
fig.show()

Just by looking at the graph, we can't really tell whether the age difference has an impact on player stats at each position: some older players perform better and some younger players perform better.

Now we want to gather the top 10 scorers from each season we have scraped and store them in a CSV file that can be imported into Tableau to create interactive dashboards and gain more insights from the data.

In [50]:
def _top10_scorers(season):
    """Return the first ten rows of ``season`` after sorting by PTS, highest first."""
    ranked = season.sort_values('PTS', ascending=False)
    return ranked.head(10)


# Top ten scorers of every scraped season, oldest season first
nba_season_2009_2010_top10 = _top10_scorers(NBA_season_2009_2010)
nba_season_2010_2011_top10 = _top10_scorers(NBA_season_2010_2011)
nba_season_2011_2012_top10 = _top10_scorers(NBA_season_2011_2012)
nba_season_2012_2013_top10 = _top10_scorers(NBA_season_2012_2013)
nba_season_2013_2014_top10 = _top10_scorers(NBA_season_2013_2014)
nba_season_2014_2015_top10 = _top10_scorers(NBA_season_2014_2015)
nba_season_2015_2016_top10 = _top10_scorers(NBA_season_2015_2016)
nba_season_2016_2017_top10 = _top10_scorers(NBA_season_2016_2017)
nba_season_2017_2018_top10 = _top10_scorers(NBA_season_2017_2018)
nba_season_2018_2019_top10 = _top10_scorers(NBA_season_2018_2019)
nba_season_2019_2020_top10 = _top10_scorers(NBA_season_2019_2020)
nba_season_2020_2021_top10 = _top10_scorers(NBA_season_2020_2021)
nba_season_2021_2022_top10 = _top10_scorers(NBA_season_2021_2022)
nba_season_2022_2023_top10 = _top10_scorers(NBA_season_2022_2023)
nba_season_2023_2024_top10 = _top10_scorers(NBA_season_2023_2024)


In [51]:
# Stack the per-season top-10 tables, oldest season first, into one
# DataFrame with a fresh 0..n-1 index
top10_tables = [
    nba_season_2009_2010_top10,
    nba_season_2010_2011_top10,
    nba_season_2011_2012_top10,
    nba_season_2012_2013_top10,
    nba_season_2013_2014_top10,
    nba_season_2014_2015_top10,
    nba_season_2015_2016_top10,
    nba_season_2016_2017_top10,
    nba_season_2017_2018_top10,
    nba_season_2018_2019_top10,
    nba_season_2019_2020_top10,
    nba_season_2020_2021_top10,
    nba_season_2021_2022_top10,
    nba_season_2022_2023_top10,
    nba_season_2023_2024_top10,
]
all_seasons_top10 = pd.concat(top10_tables, ignore_index=True)

Now to create the csv file that we will use in Tableau

In [52]:
# Write the combined table to top10_players.csv without the index column
all_seasons_top10.to_csv('top10_players.csv', index=False)