# Webscrape all NBA players and team history

### Set-up

In [12]:
# Import packages
import pandas as pd
import numpy as np
import string
from bs4 import BeautifulSoup
import requests
import time
import os
import sys

# Directory
# Project root that the export cell writes under. Set the NBA_PROJECT_DIR
# environment variable to run on another machine; when it is unset the
# original author's path is used, so existing behaviour is unchanged.
WORKING_DIRECTORY = os.environ.get(
    "NBA_PROJECT_DIR",
    "/Users/peterchristenson/Desktop/Projects/NBA Champions Exploratory",
)

### Get links to player profiles for each player

In [2]:
# Pages with player names use first letter of last names
alphabet = list(string.ascii_lowercase)

# Collect one plain record per player and build the dataframe once at the end.
# Calling pd.concat inside the loop copies every accumulated row again for each
# new player, which is quadratic in the number of players.
player_records = []

# Loop through alphabet, get names and links for all players
for letter in alphabet:
    # Request URL, call BeautifulSoup
    webpage = "https://en.hispanosnba.com/players/nba-all/" + letter
    # Timeout so a stalled connection raises instead of hanging the notebook
    response = requests.get(webpage, timeout=30)
    if response.status_code == 404: # Some letters may not have pages, skip them
        continue
    # Any other HTTP error is not a player listing; stop with a clear error
    # rather than trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get relevant part of webpage (first table with class "tblprm")
    players = soup.find_all("table", {"class": "tblprm"})[0]
    player_name_links = players.find_all("a")

    # Each anchor carries the player name in "title" and the profile path in "href"
    for player in player_name_links:
        player_records.append({'name': player["title"], 'link': player["href"]})

    # Set time buffer so as to not trigger any issues visiting website
    time.sleep(1)

# Explicit columns keep the schema stable even if no players were found
player_name_link_df = pd.DataFrame(player_records, columns=['name', 'link'])

In [15]:
# Preview dataframe
# Renumber the rows from zero, then store a 1-based player id in an "index"
# column (row position + 1).
player_name_link_df = player_name_link_df.reset_index(drop=True)
player_name_link_df['index'] = player_name_link_df.index + 1
player_name_link_df

Unnamed: 0,name,link,index
0,Alaa Abdelnaby,/players/alaa-abdelnaby,1
1,Zaid Abdul-Aziz,/players/zaid-abdul-aziz,2
2,Kareem Abdul-Jabbar,/players/kareem-abdul-jabbar,3
3,Mahmoud Abdul-Rauf,/players/mahmoud-abdul-rauf,4
4,Tariq Abdul-Wahad,/players/tariq-abdul-wahad,5
...,...,...,...
5000,Jim Zoet,/players/jim-zoet,5001
5001,Bill Zopf,/players/bill-zopf,5002
5002,Brian Zoubek,/players/brian-zoubek,5003
5003,Ivica Zubac,/players/ivica-zubac,5004


### Get player team information

In [None]:
# Build one row per (player, team-season) link found on each player's stats page.
# Rows are collected as plain dicts and turned into a dataframe once at the end,
# instead of calling pd.concat for every player.
player_team_records = []
nrows_player_name_link_df = player_name_link_df.shape[0]

# Walk the three needed columns together rather than doing a .loc lookup per field
player_rows = zip(
    player_name_link_df["link"],
    player_name_link_df["name"],
    player_name_link_df["index"],
)
for player_page_link, player_name, player_index in player_rows:
    try:
        # BeautifulSoup
        # Timeout so one stalled request cannot hang the whole run; a timeout is
        # recorded as an ERROR row like any other failure below
        response = requests.get(
            "https://en.hispanosnba.com" + player_page_link + "/stats",
            timeout=30,
        )
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get teams played for and years played: anchors inside the first
        # row-header cell of the "tbljug" table whose href contains "teams"
        table = soup.find("table", {"class":"tbljug"}).find_all("th", {"scope":"row"})
        seasons_team = table[0].find_all("a")
        teams = [anchor["title"] for anchor in seasons_team if "teams" in anchor["href"]]

        # One OK row per team-season (a player with no matching links adds no rows)
        player_team_records.extend(
            {'index': player_index, 'name': player_name, 'status': "OK", 'team_year': team}
            for team in teams
        )
    except Exception:
        # Best effort: any request or parsing failure is recorded as a single
        # ERROR row for the player. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt stop this long-running loop.
        player_team_records.append(
            {'index': player_index, 'name': player_name, 'status': "ERROR", 'team_year': None}
        )

    # Pause between requests so as to not trigger any issues visiting website
    time.sleep(1)

player_team_df = pd.DataFrame(
    player_team_records,
    columns=["index", "name", "status", "team_year"],
)

In [17]:
# Preview dataframe
# Renumber rows from zero and put the columns in a fixed display order
column_order = ["index", "name", "status", "team_year"]
player_team_df = player_team_df.reset_index(drop=True)[column_order]
player_team_df

Unnamed: 0,index,name,status,team_year
0,1,Alaa Abdelnaby,OK,Portland Trail Blazers 1990-91
1,1,Alaa Abdelnaby,OK,Portland Trail Blazers 1991-92
2,1,Alaa Abdelnaby,OK,Milwaukee Bucks 1992-93
3,1,Alaa Abdelnaby,OK,Boston Celtics 1992-93
4,1,Alaa Abdelnaby,OK,Boston Celtics 1993-94
...,...,...,...,...
26924,5004,Ivica Zubac,OK,Los Angeles Clippers 2019-20
26925,5004,Ivica Zubac,OK,Los Angeles Clippers 2020-21
26926,5004,Ivica Zubac,OK,Los Angeles Clippers 2021-22
26927,5004,Ivica Zubac,OK,Los Angeles Clippers 2022-23


In [19]:
# Check status
# Number of rows for each scrape outcome ("OK" / "ERROR")
status_groups = player_team_df.groupby("status")
status_groups["status"].count()

status
ERROR      153
OK       26776
Name: status, dtype: int64

In [21]:
# Errors
# Rows whose scrape failed. Assumption (not verified here): all of these are
# players that never played; a failed request is recorded the same way.
is_error = player_team_df["status"].eq("ERROR")
error_players = player_team_df.loc[is_error]
error_players

Unnamed: 0,index,name,status,team_year
134,22,Josh Adams,ERROR,
315,57,Bryce Alford,ERROR,
438,81,Alade Aminu,ERROR,
881,149,Brandon Ashley,ERROR,
929,156,Darion Atkins,ERROR,
...,...,...,...,...
26806,4973,Mike Young,ERROR,
26820,4975,Patric Young,ERROR,
26855,4983,Rade Zagorac,ERROR,
26871,4986,Nick Zeisloft,ERROR,


In [22]:
# Export
# Create the output folder if needed, since to_csv fails when the target
# directory does not exist. The row index is still written as the first
# column, as before.
intermediate_dir = os.path.join(WORKING_DIRECTORY, "intermediate")
os.makedirs(intermediate_dir, exist_ok=True)
player_team_df.to_csv(os.path.join(intermediate_dir, "players_seasons_teams.csv"))