In [None]:
# Import dependencies
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import requests
from api_key import api_key

In [None]:
# Accumulator for the scrape: one list of values per player, across all states
all_states_data = []

# Labels for the dataframe columns, listed in the same order the scraping
# cell appends values to each player's record
columns = [
    'url', 'name',
    'years', 'gp', 'mp',
    'fg', 'fga', 'threep', 'threepa', 'ft', 'fta',
    'orb', 'trb', 'ast', 'stl', 'blk', 'to', 'pf', 'pts',
    'birthday', 'birthyear',
    'city', 'state', 'lat', 'lon',
]

# `data-stat` attribute values of the table cells that share the same format
# and can therefore all be read in a single loop
data_stats = [
    'years', 'g', 'mp',
    'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta',
    'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
    'birth_date',
]

# Two-letter codes substituted into the birthplace page URL (50 states plus DC)
states = [
    'AL', 'AK', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
]

In [None]:
# Scrape each state's birthplace table from basketball-reference.com and append one
# record per player to all_states_data: profile url, name, every data_stats cell,
# birth year, birth city, state, and the geocoded latitude/longitude of the city.
# NOTE: records are appended to the list created in the setup cell, so re-running
# this cell without re-running that one duplicates every player.

GEOCODE_URL = 'https://api.geoapify.com/v1/geocode/search'
# (city, state) -> (lat, lon). Many players share a birth city, so each distinct
# city is sent to the geocoding API only once.
geocode_cache = {}


def geocode_city(city, state):
    """Look up the coordinates of a city through the Geoapify search endpoint.

    Args:
        city: Birth city exactly as it appears in the scraped table.
        state: Two-letter state code the city belongs to.

    Returns:
        A ``(lat, lon)`` tuple, or ``(None, None)`` when the request is rejected
        or the response contains no results. Returning an explicit "missing"
        value keeps a failed lookup from reusing another player's coordinates.
    """
    key = (city, state)
    if key in geocode_cache:
        return geocode_cache[key]

    # Let requests build the query string so that spaces and other special
    # characters in the city name are URL-encoded rather than rewritten by hand.
    response = requests.get(
        GEOCODE_URL,
        params={'city': city, 'state': state, 'format': 'json', 'apiKey': api_key},
        timeout=30,
    )
    if not response.ok:
        # Not cached, so a later player from the same city triggers a retry
        return (None, None)

    results = response.json().get('results')
    coords = (results[0]['lat'], results[0]['lon']) if results else (None, None)
    geocode_cache[key] = coords
    return coords


# This loop iterates through the selected list of states
for state in states:
    url = f'https://www.basketball-reference.com/friv/birthplaces.fcgi?country=US&state={state}'
    # The context manager closes the HTTP response once the page has been parsed
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features='lxml')

    for t in soup.find_all('tbody'):
        # Remove header breaks in website table
        for tr in t.select('.thead'):
            tr.decompose()

        for row in t.find_all('tr'):
            # The player name and the profile link both live in the row's <a> tag
            link = row.find('a')
            if link is None:
                # No player link in this row, so there is nothing to record
                continue
            name = link.text
            # Concatenate the base url for the website and the player extension
            player_url = link.get('href')
            total_url = f'https://www.basketball-reference.com{player_url}'

            city = row.find('td', attrs={'data-stat': 'birth_city'}).text
            # Unresolved cities yield None here; the later fillna(0) cell turns
            # those into 0 in the final dataframe.
            lat, lon = geocode_city(city, state)

            # The website gives us the birthday as a string ending in the year;
            # fall back to 0 when the cell is empty or does not end in four digits
            birth_date = row.find('td', attrs={'data-stat': 'birth_date'}).text
            year_text = birth_date[-4:]
            birth_year = int(year_text) if year_text.isdecimal() else 0

            # Assemble the record in the same order as the `columns` labels
            player_data = [total_url, name]
            player_data.extend(
                row.find('td', attrs={'data-stat': stat}).text for stat in data_stats
            )
            player_data.extend([birth_year, city, state, lat, lon])
            # This appends the player's data to the all_states_data list
            all_states_data.append(player_data)
    print(f'Player data from {state} successfully scraped')

print('Done compiling!')

In [None]:
# Preview the scrape instead of printing every record into the cell output
print(f'{len(all_states_data)} player records scraped')
all_states_data[:5]

In [None]:
# Create Dataframe
# The labels are passed to the constructor rather than assigned to .columns
# afterwards, so an empty scrape still yields a frame with the expected columns
# instead of raising a length-mismatch error.
all_states_df = pd.DataFrame(all_states_data, columns=columns)
all_states_df

In [None]:
# Check the datatype pandas assigned to each column of the freshly built frame
all_states_df.dtypes

In [None]:
# Cast the scraped stat columns from text to numbers; any value that cannot be
# parsed becomes NaN instead of raising
cols = [
    'years', 'gp', 'mp',
    'fg', 'fga', 'threep', 'threepa', 'ft', 'fta',
    'orb', 'trb', 'ast', 'stl', 'blk', 'to', 'pf', 'pts',
]
all_states_df[cols] = all_states_df[cols].apply(
    lambda series: pd.to_numeric(series, errors='coerce')
)
# Show the resulting dtypes to confirm the conversion
all_states_df.dtypes

In [None]:
# Replace every missing value in the frame with 0
all_states_df = all_states_df.fillna(value=0)


In [None]:
# Display the frame after missing values have been filled
all_states_df

In [None]:
# Serialise the frame as a JSON array holding one object per row
out = all_states_df.to_json(orient='records')

# Write the serialised records to a .json file in the current working directory
with open('full_states_nba.json', mode='w') as json_file:
    json_file.write(out)