**Scrape basketball-reference.com for Player Data**

In [30]:
# import needed libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [32]:
# Create an empty list for player data from all players from all states
all_states_data = []
# These columns will be used label the dataframe columns
columns = ['url', 'name', 'years', 'gp', 'mp', 'fg', 'fga', '3p', '3pa', 'ft', 'fta', 
           'orb', 'trb', 'ast', 'stl', 'blk', 'to', 'pf', 'pts', 'birthday', 'city', 'state']
# These are the data stat on the website that have the same format
data_stats = ['years', 'g', 'mp', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 
              'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'birth_date', 'birth_city']

In [33]:
# In order to successfully compile the database, we must in one list of states at a time
# Alternate between commenting in/out each states list while reinitialize the variable 
# And running the for state in states loop
# Otherwise we risk a 429 Error for too many requests

states = ['AL']
# states = ['HI','ID','IL','IN','IA','KS','KY','LA','ME','MD']
# states = ['MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ']
# states =['NM','NY','NC','ND','OH','OK','OR','PA','RI','SC']
# states = ['SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']

print(f'Ready to scrape the following states: {states}')

Ready to scrape the following states: ['AL']


In [40]:
# This loop iterates through the selected list of states
for state in states:
        
        url = f'https://www.basketball-reference.com/friv/birthplaces.fcgi?country=US&state={state}'
        html = urlopen(url)
        soup = BeautifulSoup(html, features='lxml')
        
        tbody = soup('tbody')
        all_rows = soup('tr')

        for row in all_rows[2:]:
                print(row)
                # Remove header breaks in website table
                for tr in row.select('.thead'):
                      tr.decompose()

                # data_rows = row.findAll('tr')
                
                # for data_row in data_rows:
                    # Create a list for each player's data
                player_data = []
                # The player name is in a hyperlink, so we must go into an a tag to get it
                name = row.find('a').text
                # Find the href extension for the player
                player_url = row.find('a').get('href')
                # Concatenate the base url for the website and the player extension
                total_url = f'https://www.basketball-reference.com{player_url}'
                # Each .append() adds player data to the list
                player_data.append(total_url) 
                player_data.append(name)
                # This for loop allows us to gather many data-stat values
                for stat in data_stats:
                    # This grabs the current data-stat from the list
                    data = row.find('td', attrs={'data-stat': stat}).text
                    player_data.append(data) 
                player_data.append(state)
                # This appends the players data to the all_states_data list
                all_states_data.append(player_data)
        print(f'Player data from {state} successfully scraped')

print('Done compiling, go to next list if needed')       

<tr><th class="right" csk="1" data-stat="ranker" scope="row">1</th><td class="left" csk="Ansley,Michael" data-append-csv="anslemi01" data-stat="player"><a href="/players/a/anslemi01.html">Michael Ansley</a></td><td class="right" data-stat="years">3</td><td class="right" data-stat="year_min">1990</td><td class="right" data-stat="year_max">1992</td><td class="right" data-stat="g">149</td><td class="right" data-stat="mp">2143</td><td class="right" data-stat="fg">383</td><td class="right" data-stat="fga">746</td><td class="right iz" data-stat="fg3">0</td><td class="right iz" data-stat="fg3a">0</td><td class="right" data-stat="ft">260</td><td class="right" data-stat="fta">360</td><td class="right" data-stat="orb">311</td><td class="right" data-stat="trb">621</td><td class="right" data-stat="ast">67</td><td class="right" data-stat="stl">51</td><td class="right" data-stat="blk">24</td><td class="right" data-stat="tov">85</td><td class="right" data-stat="pf">284</td><td class="right" data-stat

AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
# Create Dataframe
all_states_df = pd.DataFrame(all_states_data)
all_states_df.columns = columns
all_states_df

In [None]:
# Check datatypes
all_states_df.dtypes

In [None]:
# Convert data strings to numbers
cols = ['years', 'gp', 'mp', 'fg', 'fga', '3p', '3pa', 'ft', 'fta', 
           'orb', 'trb', 'ast', 'stl', 'blk', 'to', 'pf', 'pts']
all_states_df[cols] = all_states_df[cols].apply(pd.to_numeric, errors='coerce')
all_states_df.dtypes

In [None]:
# Create .json file to store data
out = all_states_df.to_json(orient='records')

with open('all_states.json', 'w') as f:
    f.write(out)