In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Original data is available from season 1946-7
# I decided to collect data from the first season with shot clock
from_year = 1955
# NOTE: range() below excludes to_year, so the last season fetched is to_year - 1.
to_year = 2018

# Names for the first eleven <td> cells of every player row, in table order.
player_columns = [
    'Player',
    'Position',
    'Height',
    'Weight',
    'Age',
    'Team',
    'GP',
    'Seasons',
    'pre draft team',
    'Draft status',
    'Nationality',
]

# Column-oriented accumulator: one list per output column, all kept the same length.
players = {column: [] for column in ['Season short', 'Season'] + player_columns}

for year in range(from_year, to_year):
    # A season is labelled by its end year, e.g. year 1955 -> "1954-1955".
    season_short = str(year)
    season = str(year - 1) + "-" + season_short
    players_url = 'https://basketball.realgm.com/nba/players/{}'.format(season_short)

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops us from silently parsing an HTTP error page.
    page = requests.get(players_url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    tables = soup.find_all('table')
    if not tables:
        raise ValueError('No table found at {}'.format(players_url))

    # The first <tr> of the first table is skipped (treated as the header row).
    rows = tables[0].find_all('tr')[1:]
    for row in rows:
        cols = row.find_all('td')
        if len(cols) < len(player_columns):
            # Not a complete player row; indexing it would raise IndexError.
            continue
        players['Season short'].append(season_short)
        players['Season'].append(season)
        for column, cell in zip(player_columns, cols):
            # Tag.string may be None (e.g. an empty cell); it is stored as-is
            # and has to be handled by the later cleaning steps.
            players[column].append(cell.string)

df_players = pd.DataFrame(players)

In [None]:
# Column-oriented accumulator for the per-season game counts.
seasons_games = {
    'Season short': [],
    'Games': []
}

wiki_seasons_url = 'https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_seasons'

# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops us from silently parsing an HTTP error page.
page = requests.get(wiki_seasons_url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')
tables = soup.find_all('table')
if not tables:
    raise ValueError('No table found at {}'.format(wiki_seasons_url))

# The first two <tr> elements of the first table are skipped (treated as header rows).
rows = tables[0].find_all('tr')[2:]
for row in rows:
    cols = row.find_all('td')
    if len(cols) < 10:
        # Row does not have the cells read below; indexing would raise IndexError.
        continue
    # NOTE(review): the first four characters of cell 3 are used as the join key;
    # presumably a 4-digit year in the same convention as the players'
    # 'Season short' (season end year) - TODO confirm against the live table.
    seasons_games['Season short'].append(cols[3].text[:4])
    # When there is a range of games we take the higher one: the slice keeps the
    # two characters just before the last character of the cell text
    # (presumably the last character is a trailing newline - TODO confirm).
    games_text = cols[9].string
    # Tag.string may be None (e.g. a cell with nested markup); keep the row and
    # record a missing value instead of raising TypeError on the slice.
    seasons_games['Games'].append(games_text[-3:-1] if games_text else None)

df_seasons_games = pd.DataFrame(seasons_games)

In [None]:
# Attach each season's game count to every player row (left join keeps all
# players), convert the two count columns to numbers (unparseable values
# become NaN), then derive how many of the season's games the player missed.
df_players_combined = (
    df_players
    .merge(df_seasons_games, how='left', on='Season short')
    .assign(
        GP=lambda frame: pd.to_numeric(frame['GP'], errors='coerce'),
        Games=lambda frame: pd.to_numeric(frame['Games'], errors='coerce'),
    )
    .assign(**{'Games missed': lambda frame: frame['Games'] - frame['GP']})
)

In [None]:
# Height in CM
def parse_height_to_cm(height):
    """Convert a "feet-inches" height string (e.g. "6-10") to whole centimetres.

    Parameters
    ----------
    height : str
        Height written as "<feet>-<inches>". Only the first two "-"-separated
        fields are used.

    Returns
    -------
    int or None
        The height in centimetres, truncated (not rounded) to an integer, or
        None when the value is not a string or does not contain two numeric
        fields (e.g. None, NaN, "", "-").
    """
    if not isinstance(height, str):
        # Missing values arrive as None or float NaN; neither has .split().
        return None
    ht_ = height.split('-')
    if len(ht_) < 2:
        return None
    try:
        feet_ = float(ht_[0])
        inch_ = float(ht_[1])
        # 12 inches per foot, 2.54 cm per inch; int() truncates toward zero.
        return int((12 * feet_ + inch_) * 2.54)
    except (ValueError, OverflowError):
        # Non-numeric fields (e.g. the "-" placeholder) or non-finite numbers.
        return None

# Only strings longer than one character are parsed; missing heights
# (None / NaN, which have no len()) and one-character placeholders map to None.
df_players_combined['Height [cm]'] = df_players_combined['Height'].apply(
    lambda x: parse_height_to_cm(x) if (isinstance(x, str) and len(x) > 1) else None)

In [None]:
# Weight in KG
# Non-numeric weights become NaN (errors='coerce'), never None.
df_players_combined['Weight'] = pd.to_numeric(df_players_combined['Weight'], errors='coerce')
# Vectorised conversion; NaN propagates through the division, so no explicit
# missing-value check is needed. 2.2 is the (approximate) pounds-per-kilogram
# factor used throughout this notebook.
df_players_combined['Weight [kg]'] = df_players_combined['Weight'] / 2.2

In [None]:
# BMI - Body Mass Index: weight [kg] / (height [m])^2.
# Height is stored in centimetres, so (height_cm ** 2) / 10000 is the height
# in metres squared. Computed on whole columns instead of row by row; rows
# with a missing weight or height yield NaN.
df_players_combined['BMI'] = df_players_combined['Weight [kg]'] / (
    (df_players_combined['Height [cm]'] ** 2) / 10000)

In [None]:
# Persist the combined table as CSV in the working directory, without the
# DataFrame index column.
df_players_combined.to_csv(path_or_buf='NBA_players_history.csv', index=False)