### Coding 2 Final Project
## basketball-reference.com
### Nicolas Fernandez
The task is to scrape the game logs from the past three full seasons (2020-21, 2021-22, 2022-23) for the top 70 points-per-game scorers of the current (2023-24) NBA season (https://www.basketball-reference.com/leagues/NBA_2024_per_game.html).

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from scrapethat import *

In [2]:
# Build the game-log URLs (three seasons each) for this season's top 70 scorers
t = read_html('https://www.basketball-reference.com/leagues/NBA_2024_per_game.html')

# Getting player partial links for full link construction later.
# The '.html' extension is removed with an explicit suffix check:
# str.strip('.html') would strip any of the characters '.', 'h', 't', 'm', 'l'
# from BOTH ends of the link instead of removing the extension.
player_partial_links = [
    x[:-len('.html')] if x.endswith('.html') else x
    for x in get_links(t.select('th+ .left a'))
]

# Getting PPG for sorting top 70 players, converting to float
player_ppg = [float(x) for x in get_texts(t.select('.right:nth-child(30)'))]

# Creating dictionary from both lists (link -> PPG).
# NOTE(review): if the same link occurs on several rows (presumably players
# listed once per team after a trade - confirm), dict() keeps the PPG of the
# LAST row for that link.
player_dict = dict(zip(player_partial_links, player_ppg))

# Sorting by PPG, highest first, and keeping the top 70 scorers
top_70_scorers = dict(sorted(player_dict.items(), key=lambda x: x[1], reverse=True)[:70])

# Getting player links for top 70 players (dict iteration yields the keys)
top_70_links = list(top_70_scorers)

# Construct links for all three seasons for each player in list:
# range(1, 4) yields the year suffixes 2021, 2022 and 2023
all_season_links = []
for player in top_70_links:
    all_season_links.extend(
        f'https://www.basketball-reference.com{player}/gamelog/202{x}' for x in range(1, 4)
    )

In [3]:
# Function that scrapes one game-log page and returns its contents as a df
def season_game_logs(url):
    """Scrape a single game-log page and return its rows as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the game-log page. Its last four characters are stored
        in the ``season`` column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: season, name, height, weight, game, game_date,
        age, min, fgm, fga, fg_percent, three_pm, three_pa, three_percent,
        ftm, fta, ft_percent, orb, drb, trb, ast, stl, blk, tov, pf, pts,
        plus_minus. No type conversion is applied to the scraped values.
    """
    # Trace which page is being processed
    print(url)

    page = read_html(url)

    # Page-level values; pandas repeats these scalars on every row
    columns = {
        # Season taken from the end of the url
        'season': url[-4:],
        # Heading text up to the first ' 2'
        'name': page.find('h1').get_text().strip().split(' 2')[0],
        # First match of the height selector
        'height': get_texts(page.select('p span:nth-child(1)'))[0],
        # Second match of the weight selector, with the 'lb' unit removed
        'weight': get_texts(page.select('span+ span:nth-child(2)'))[1].replace('lb', ''),
    }

    # Game numbers including games not played
    columns['game'] = get_texts(page.select('th.right+ .right'))

    # Game dates
    columns['game_date'] = get_texts(page.select('.right.iz+ .left a , .tooltip+ .left'))

    # Player age on the specific date: the last scraped entry is discarded
    # and only the part before the first '-' (the year) is kept
    raw_ages = get_texts(page.select('.left+ .right'))[:-1]
    columns['age'] = [entry.split('-')[0] for entry in raw_ages]

    # Per-game stat columns paired with the nth-child position they are read
    # from (position 29 is not scraped)
    stat_positions = (
        ('min', 10),            # minutes played
        ('fgm', 11),            # field goals made
        ('fga', 12),            # field goals attempted
        ('fg_percent', 13),     # field goal percentage
        ('three_pm', 14),       # three pointers made
        ('three_pa', 15),       # three pointers attempted
        ('three_percent', 16),  # three pointer percentage
        ('ftm', 17),            # free throws made
        ('fta', 18),            # free throws attempted
        ('ft_percent', 19),     # free throw percentage
        ('orb', 20),            # offensive rebounds
        ('drb', 21),            # defensive rebounds
        ('trb', 22),            # total rebounds
        ('ast', 23),            # assists
        ('stl', 24),            # steals
        ('blk', 25),            # blocks
        ('tov', 26),            # turnovers
        ('pf', 27),             # personal fouls
        ('pts', 28),            # points
        ('plus_minus', 30),     # plus-minus
    )

    # Columns from which literal '+/-' entries are removed after scraping
    label_filtered = ('drb', 'plus_minus')

    for column, position in stat_positions:
        values = get_texts(page.select(f'.center+ td.center , .right:nth-child({position})'))
        if column in label_filtered:
            values = [value for value in values if value != '+/-']
        columns[column] = values

    # Dict insertion order defines the dataframe's column order
    return pd.DataFrame(columns)

In [4]:
# Scraping every season link in order; the result is a list with one
# dataframe per link
top_70_df = [season_game_logs(link) for link in all_season_links]

https://www.basketball-reference.com/players/e/embiijo01/gamelog/2021
https://www.basketball-reference.com/players/e/embiijo01/gamelog/2022
https://www.basketball-reference.com/players/e/embiijo01/gamelog/2023
https://www.basketball-reference.com/players/d/doncilu01/gamelog/2021
https://www.basketball-reference.com/players/d/doncilu01/gamelog/2022
https://www.basketball-reference.com/players/d/doncilu01/gamelog/2023
https://www.basketball-reference.com/players/a/antetgi01/gamelog/2021
https://www.basketball-reference.com/players/a/antetgi01/gamelog/2022
https://www.basketball-reference.com/players/a/antetgi01/gamelog/2023
https://www.basketball-reference.com/players/g/gilgesh01/gamelog/2021
https://www.basketball-reference.com/players/g/gilgesh01/gamelog/2022
https://www.basketball-reference.com/players/g/gilgesh01/gamelog/2023
https://www.basketball-reference.com/players/d/duranke01/gamelog/2021
https://www.basketball-reference.com/players/d/duranke01/gamelog/2022
https://www.basketba

In [5]:
# Stacking the per-page dataframes row-wise into one dataframe with a fresh
# 0..n-1 index
nba_top_scorers_df = pd.concat(top_70_df, axis=0, ignore_index=True)

# Writing the combined dataframe to csv without the index column
nba_top_scorers_df.to_csv('top_scorers_data.csv', index=False)