**Scrape basketball-reference.com for Player Data**

In [None]:
# import needed libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Accumulator holding one list of field values per player, across all states
all_states_data = []

# Labels applied to the dataframe columns, in the order the values are stored
columns = [
    'Player_url', 'Player_name',
    'Years', 'GP', 'MP',
    'FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS',
    'Birthday', 'City',
    'State',
]

# data-stat attribute values of the table cells that are all read the same way
data_stats = [
    'years', 'g', 'mp',
    'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta',
    'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
    'birth_date', 'birth_city',
]

In [None]:
# To compile the full database, scrape one batch of states at a time:
# leave exactly one `states` assignment below uncommented, re-run this cell to
# reinitialize the variable, then run the `for state in states` loop.
# Requesting every state in a single run risks a 429 (too many requests) error.

states = 'AL AK AZ AR CA CO CT DC DE FL GA'.split()
# states = 'HI ID IL IN IA KS KY LA ME MD'.split()
# states = 'MA MI MN MS MO MT NE NV NH NJ'.split()
# states = 'NM NY NC ND OH OK OR PA RI SC'.split()
# states = 'SD TN TX UT VT VA WA WV WI WY'.split()

print(f'Ready to scrape the following states: {states}')

In [None]:
# Scrape the birthplace table of every state in the current batch and append
# one list per player to all_states_data, laid out in the order of `columns`.
for state in states:
    url = f'https://www.basketball-reference.com/friv/birthplaces.fcgi?country=US&state={state}'
    # Parse inside the with-block so the HTTP response is always closed
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features='lxml')

    for row in soup.find_all('tbody'):
        # Remove header breaks in website table
        for tr in row.select('.thead'):
            tr.decompose()

        for data_row in row.find_all('tr'):
            # The player name and profile path both live in the row's hyperlink
            player_link = data_row.find('a')
            if player_link is None:
                # No hyperlink means there is no player to record in this row
                continue

            name = player_link.text
            # Find the href extension for the player
            player_url = player_link.get('href')
            # Concatenate the base url for the website and the player extension
            total_url = f'https://www.basketball-reference.com{player_url}'

            # Create a list for each player's data
            player_data = [total_url, name]
            # Gather every data-stat value that shares the same cell format
            for stat in data_stats:
                cell = data_row.find('td', attrs={'data-stat': stat})
                # An absent cell becomes '' so the row stays aligned with `columns`
                player_data.append(cell.text if cell is not None else '')
            player_data.append(state)
            # This appends the players data to the all_states_data list
            all_states_data.append(player_data)
    print(f'Player data from {state} successfully scraped')

print('Done compiling, go to next list if needed')

In [None]:
# Print the raw list of scraped player rows collected so far
print(all_states_data)

In [None]:
# Build the dataframe with its column labels in one step; passing `columns`
# here also yields a correctly labelled empty frame when nothing was scraped,
# instead of a zero-column frame that cannot be relabelled afterwards
all_states_df = pd.DataFrame(all_states_data, columns=columns)

In [None]:
# Label the dataframe columns with the names listed in `columns`
all_states_df.columns = columns
# Bare expression as the last line of the cell, so the notebook displays the dataframe
all_states_df

In [None]:
# Serialize the dataframe as a JSON array with one object per row
out = all_states_df.to_json(orient='records')

# Explicit encoding so the output file does not depend on the platform default
with open('all_states.json', 'w', encoding='utf-8') as f:
    f.write(out)