In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [19]:
def nba_players_stats(initial_season, final_season):
    """Scrape per-game player statistics for a span of seasons into one DataFrame.

    Args:
        initial_season: First season to scrape (inclusive).
        final_season: Season at which scraping stops (exclusive, exactly as
            in ``range(initial_season, final_season)``).

    Returns:
        The DataFrame returned by ``clean_nba_stats_df``: one row per player
        row selected by ``season_total`` on each season page, with the
        season and the player id appended to every row.

    Raises:
        ValueError: If the span contains no seasons, because there would be
            no page to read the column header from.
    """
    if initial_season >= final_season:
        raise ValueError(
            'final_season is exclusive and must be greater than initial_season '
            f'(got initial_season={initial_season}, final_season={final_season})'
        )

    stats_list = []
    for season in range(initial_season, final_season):
        soup = parse_request(request_data_from_season(season))
        for player in season_total(find_table_body(soup)):
            player_stats = individual_stats(player)
            player_id = primary_key(player)
            # Season and Id go last, matching the two names that
            # generate_df_columns appends to the header.
            player_stats.append(season)
            player_stats.append(player_id)
            stats_list.append(player_stats)

    # Column names are read from the header of the last page fetched.
    columns = generate_df_columns(find_table_header(soup))
    df = pd.DataFrame(stats_list, columns=columns)
    return clean_nba_stats_df(df)


def request_data_from_season(season):
    """Download the per-game statistics page for one season.

    Args:
        season: Value interpolated into the ``NBA_{season}_per_game.html`` URL.

    Returns:
        The ``requests`` response object for the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html'
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail here, with the real HTTP status, instead of handing an error page
    # to the parser and crashing later on a missing <table>.
    response.raise_for_status()
    return response


def parse_request(request):
    """Build a BeautifulSoup tree from the text of an HTTP response.

    Args:
        request: Object exposing the page markup as ``.text``.

    Returns:
        The ``BeautifulSoup`` object created with the ``html.parser`` backend.
    """
    return BeautifulSoup(request.text, features="html.parser")


def find_table_body(soup):
    """Return the ``tbody`` element of the first ``table`` found in ``soup``."""
    return soup.find('table').find('tbody')


def find_table_header(soup):
    """Return the ``thead`` element of the first ``table`` found in ``soup``."""
    return soup.find('table').find('thead')


def generate_df_columns(table_header):
    """Derive the DataFrame column names from the table header.

    The text of every ``th`` cell is collected, the first one is discarded,
    and ``'Season'`` and ``'Id'`` are added at the end.

    Args:
        table_header: Element whose ``th`` descendants hold the column titles.

    Returns:
        A list of column names.
    """
    titles = [cell.text for cell in table_header.find_all('th')]
    return titles[1:] + ['Season', 'Id']


def season_total(table_body):
    """Return every ``tr`` in ``table_body`` whose class is ``full_table``."""
    row_filter = {'class': 'full_table'}
    return table_body.find_all('tr', attrs=row_filter)


def primary_key(player):
    """Extract the player id from the link in a row's ``player`` cell.

    The id is the fourth ``/``-separated piece of the link's ``href``,
    truncated at its first ``.``.

    Args:
        player: Table row containing a ``td`` with ``data-stat="player"``.

    Returns:
        The id string.
    """
    name_cell = player.find('td', attrs={'data-stat':'player'})
    href = name_cell.find('a')['href']
    file_name = href.split('/')[3]
    return file_name.partition('.')[0]


def individual_stats(player):
    """Return the text of every ``td`` cell in a player row, in order."""
    values = []
    for cell in player.find_all('td'):
        values.append(cell.text)
    return values


def clean_nba_stats_df(df):
    """Clean the scraped statistics table.

    Steps:
      * drop the ``GS`` column;
      * remove ``*`` characters from both ends of ``Player``;
      * convert every column located after ``G`` (except the last two) to
        ``float64``, treating empty strings and missing values as 0;
      * convert ``Age``, ``G`` and ``Season`` to ``int64``, treating empty
        strings as 0;
      * move the last column to the front.

    Args:
        df: Raw frame whose last two columns are the season and the player
            id, and which contains ``GS``, ``Player``, ``Age`` and ``G``.

    Returns:
        A new, cleaned DataFrame; the input frame is not modified.
    """
    df = df.drop('GS', axis=1)  # a lot of missing information on 'GS' column
    df['Player'] = df['Player'].str.strip('*')

    # Find the game-stat columns relative to 'G' rather than by a fixed
    # offset: once 'GS' is gone a hard-coded start of 6 skips 'MP'.
    first_stat = df.columns.get_loc('G') + 1
    stat_cols = list(df.columns[first_stat:-2])
    if stat_cols:
        # Assigning by column label replaces the columns, so they end up as
        # real float64 columns instead of floats stored in object columns.
        df[stat_cols] = (
            df[stat_cols].replace('', '0').astype('float64').fillna(0.0)
        )

    int_cols = ['Age', 'G', 'Season']
    df[int_cols] = df[int_cols].replace('', 0).astype('int64')  # Non-game stats to int

    # Move the last column (the player id) to the front.
    cols = list(df.columns)
    return df[[cols[-1]] + cols[:-1]]


In [20]:
# The end of the span is exclusive, so this scrapes seasons 1980 through 2021.
stats = nba_players_stats(initial_season=1980, final_season=2022)
stats

Unnamed: 0,Id,Player,Pos,Age,Tm,G,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,abdulka01,Kareem Abdul-Jabbar,C,32,LAL,82,38.3,10.2,16.9,0.604,...,2.3,8.5,10.8,4.5,1.0,3.4,3.6,2.6,24.8,1980
1,abernto01,Tom Abernethy,PF,25,GSW,67,18.2,2.3,4.7,0.481,...,0.9,1.9,2.9,1.3,0.5,0.2,0.6,1.8,5.4,1980
2,adamsal01,Alvan Adams,C,25,PHO,75,28.9,6.2,11.7,0.531,...,2.1,6.0,8.1,4.3,1.4,0.7,2.9,3.2,14.9,1980
3,architi01,Tiny Archibald,PG,31,BOS,80,35.8,4.8,9.9,0.482,...,0.7,1.7,2.5,8.4,1.3,0.1,3.0,2.7,14.1,1980
4,awtrede01,Dennis Awtrey,C,31,CHI,26,21.5,1.0,2.3,0.450,...,1.1,3.3,4.4,1.5,0.5,0.6,1.0,2.5,3.3,1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17678,wrighde01,Delon Wright,SG-PG,28,TOT,63,27.7,3.8,8.2,0.463,...,1.0,3.2,4.3,4.4,1.6,0.5,1.3,1.2,10.2,2021
17679,youngth01,Thaddeus Young,PF,32,CHI,68,24.3,5.4,9.7,0.559,...,2.5,3.8,6.2,4.3,1.1,0.6,2.0,2.2,12.1,2021
17680,youngtr01,Trae Young,PG,22,ATL,63,33.7,7.7,17.7,0.438,...,0.6,3.3,3.9,9.4,0.8,0.2,4.1,1.8,25.3,2021
17681,zelleco01,Cody Zeller,C,28,CHO,48,20.9,3.8,6.8,0.559,...,2.5,4.4,6.8,1.8,0.6,0.4,1.1,2.5,9.4,2021


In [24]:
# Persist the scraped per-game statistics without the row index.
stats.to_csv(path_or_buf='players_game_stats.csv', index=False)

In [12]:
# Letters used to build the per-initial player index URLs. 'x' is left out;
# presumably the site has no index page for it -- TODO confirm.
initials = list('abcdefghijklmnopqrstuvwyz')

In [13]:
# Collect [Id, Player, Height, Weight] for every player linked from the
# per-initial index pages. Height and weight are kept as the digit strings
# found in the "(…cm, …kg)" part of the player's info paragraph.
# NOTE(review): this sends one request per player with no throttling; the
# site may rate-limit such traffic -- confirm and add a delay if needed.
data = []
for initial in initials:
    index_response = requests.get(
        f'https://www.basketball-reference.com/players/{initial}/', timeout=30
    )
    # Stop with the real HTTP status rather than parsing an error page.
    index_response.raise_for_status()
    index_soup = BeautifulSoup(index_response.text, features='html.parser')
    index_rows = index_soup.find('table').find('tbody').find_all('tr')
    links = [row.find('th').find('a')['href'] for row in index_rows]

    for link in links:
        player_response = requests.get(
            'https://www.basketball-reference.com' + link, timeout=30
        )
        player_response.raise_for_status()
        player_soup = BeautifulSoup(player_response.text, features='html.parser')

        meta = player_soup.find('div', attrs={'id':'meta'})
        name = meta.find('h1').text.strip('\n')
        infos = [paragraph.text for paragraph in meta.find_all('p')]
        # Keep only paragraphs mentioning both units, minus non-breaking spaces.
        matching = [
            text.replace(u'\xa0', u'')
            for text in infos
            if ('cm' in text) and ('kg' in text)
        ]
        try:
            # Text between '(' and ')' of the first match, split on ','.
            cm_kg = matching[0].split('(')[1].strip(') ').split(',')
        except IndexError:
            # No metric height/weight on this page: skip the player (best effort).
            continue

        # Same id derivation as primary_key(): fourth path piece up to the first '.'.
        id_value = link.split('/')[3].split('.')[0]
        cm = cm_kg[0].strip('cm')
        kg = cm_kg[1].strip('kg')
        data.append([id_value, name, cm, kg])

In [22]:
# One row per scraped player; Height and Weight hold the values collected in `data`.
height_weight = pd.DataFrame(
    data=data,
    columns=['Id', 'Player', 'Height', 'Weight'],
)
height_weight

Unnamed: 0,Id,Player,Height,Weight
0,abdelal01,Alaa Abdelnaby,208,108
1,abdulza01,Zaid Abdul-Aziz,206,106
2,abdulka01,Kareem Abdul-Jabbar,218,102
3,abdulma02,Mahmoud Abdul-Rauf,185,73
4,abdulta01,Tariq Abdul-Wahad,198,101
...,...,...,...,...
5013,zizican01,Ante Žižić,208,120
5014,zoetji01,Jim Zoet,216,108
5015,zopfbi01,Bill Zopf,185,77
5016,zubaciv01,Ivica Zubac,213,108


In [25]:
# Persist the height/weight table without the row index.
height_weight.to_csv(path_or_buf='players_height_weight.csv', index=False)