In [1]:
# pandas for data manipulation and analysis
import pandas as pd
# numpy to manipulate multi-dimensional arrays and matrices
import numpy as np
# re - regex to provides regular expression matching operations
import re
# the OS module in Python provides a way of using operating system dependent functionality
import os
# the requests module allows you to send HTTP requests using Python
import requests
# python-dotenv to read the key-value pair from . env file and adds them to environment variable.
from dotenv import load_dotenv
#Beautiful Soup is a Python library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup
# The selenium package is used to automate web browser interaction from Python
from selenium import webdriver
# to simplify management of binary drivers for different browsers
#import chromedriver_binary 
from webdriver_manager.chrome import ChromeDriverManager

# Import Dataset from Kaggle

In [2]:
# Load the NBA players dataset from the local inputs folder; the file is decoded as latin-1
df = pd.read_csv('inputs/datasets_Nba_Players.csv', encoding = 'latin-1')

In [3]:
# Fix the misspelled column label: 'collage' -> 'college'
df = df.rename(columns={'collage': 'college'})

In [4]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0.1,Unnamed: 0,Player,height,weight,college,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, excluding NaN values. Transposed so that each column becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,3922.0,1960.5,1132.328206,0.0,980.25,1960.5,2940.75,3921.0
height,3921.0,198.704922,9.269761,160.0,190.0,198.0,206.0,231.0
weight,3921.0,94.783219,12.039515,60.0,86.0,95.0,102.0,163.0
born,3921.0,1962.37975,20.33491,1913.0,1948.0,1964.0,1979.0,1997.0


In [6]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(3922, 8)

In [7]:
# Column labels of the DataFrame (after the 'collage' -> 'college' rename)
df.columns

Index(['Unnamed: 0', 'Player', 'height', 'weight', 'college', 'born',
       'birth_city', 'birth_state'],
      dtype='object')

In [8]:
# Count the missing values per column and show the five columns with the most
df.isna().sum().sort_values(ascending=False).head(5)

birth_state    483
birth_city     470
college        349
born             1
weight           1
dtype: int64

In [9]:
# Remove rows where all values are None - not needed here
#df = df.dropna(how='all')

# API
https://probasketballapi.com/docs/v2/teams

In [10]:
# Read the key-value pairs from the .env file into the environment variables
load_dotenv()

True

In [11]:
# API key taken from the APIKEY environment variable (None when it is not set)
apikey = os.environ.get("APIKEY")

In [12]:
#print(apikey)

In [13]:
# Send a request to the endpoint. A status code starting with 2 means everything is fine;
# otherwise look at the HTTP status "cat" error pictures to see what they say!
url = 'http://api.probasketballapi.com/player'
res = requests.post(url)
res

<Response [200]>

In [14]:
def player_name():
    """Ask the user for an NBA player's first and last name.

    Returns:
        tuple: ``(first_name, last_name)`` exactly as typed at the two prompts.
    """
    prompts = ('Insert the name of an NBA player',
               'Insert the last_name of an NBA player')
    # The list is built left to right, so the first name is asked before the last name
    answers = [input(prompt) for prompt in prompts]
    return tuple(answers)

def call_player(player_name):
    """Fetch one player's record from the API endpoint in ``url`` as a one-row table.

    Args:
        player_name: ``(first_name, last_name)`` pair, e.g. the tuple returned
            by the ``player_name()`` prompt helper.

    Returns:
        pandas.DataFrame: a single-row frame (index 0) built from the first
        record of the JSON response.

    Raises:
        requests.RequestException: if the HTTP request itself fails or times out.
        ValueError: if the response contains no usable record for that name.
    """
    first_name, last_name = player_name[0], player_name[1]
    query = {'api_key': f'{apikey}', 'first_name': f'{first_name}', 'last_name': f'{last_name}'}
    # A timeout keeps the notebook from hanging forever on an unresponsive server
    r = requests.post(url, data=query, timeout=30)
    try:
        player_api = r.json()[0]
        table_api = pd.DataFrame(player_api, index=[0])
    except (ValueError, LookupError, TypeError) as err:
        # Non-JSON body, empty result list, or an unexpected payload shape.
        # The previous handler called call_player() without its argument, which
        # raised an unrelated TypeError; fail with an explicit message instead.
        raise ValueError(
            f'Player not in database. Try another one: {first_name} {last_name}'
        ) from err
    return table_api.head()
# Ask for both players first, then query the API for each of them in the same order
player1_name, player2_name = player_name(), player_name()
player1_table, player2_table = call_player(player1_name), call_player(player2_name)

Insert the name of an NBA playerLeBron
Insert the last_name of an NBA playerJames
Insert the name of an NBA playerJames
Insert the last_name of an NBA playerHarden


In [15]:
# Show the API record of each player, one table after the other
display(player1_table, player2_table)

Unnamed: 0,id,team_id,player_name,first_name,last_name,birth_date,position,dk_position,dk_id
0,2544,1610612739,LeBron James,LeBron,James,1984-12-30,Forward,SF,214152


Unnamed: 0,id,team_id,player_name,first_name,last_name,birth_date,position,dk_position,dk_id
0,201935,1610612745,James Harden,James,Harden,1989-08-26,Guard,PG,395388


### With the API we only get a little more information about the players, such as their position on the court and their complete date of birth.
With different queries, however, *I can get information on the player's performance in a specific game.*
The object of the analysis, however, is to compare the stats of two players, so **let's try to get this data through selenium**.

# Merge original dataset with API infos

In [18]:
def merge_info_api(player_name, player_table, players=None):
    """Merge a player's row of the original dataset with the API record.

    Both the original dataset row and the merged result are displayed, and the
    merged result is returned so that it can be stored and reused by the caller.

    Args:
        player_name: ``(first_name, last_name)`` pair; the two parts are joined
            with a space and matched against the 'Player' column.
        player_table: one-row DataFrame with the API record; its 'player_name'
            column is renamed to 'Player' to serve as the merge key.
        players: dataset to search. Defaults to the notebook-level ``df``.

    Returns:
        pandas.DataFrame: the outer merge of both tables on 'Player', reduced
        to the non-redundant columns.
    """
    if players is None:
        players = df
    full_name = f'{player_name[0]} {player_name[1]}'
    query_player = players[players['Player'] == full_name]
    # Rename the column with the common data
    player_table = player_table.rename({'player_name': 'Player'}, axis=1)
    # Merge of the two tables
    query_player_update = pd.merge(query_player, player_table, on='Player', how='outer')
    # Eliminate redundant information
    query_player_update = query_player_update[['Unnamed: 0', 'Player', 'first_name', 'last_name',
    'born', 'birth_date','birth_city', 'college','height', 'weight','position', 'dk_position']]
    # Result compared to the original data
    display(query_player)
    display(query_player_update)
    # Without this return the caller's variables were always None
    return query_player_update
    
# Run the dataset/API merge (and its two displays) for each of the selected players
player1_table_api = merge_info_api(player1_name, player1_table)
player2_table_api = merge_info_api(player2_name, player2_table)

Unnamed: 0.1,Unnamed: 0,Player,height,weight,college,born,birth_city,birth_state
2944,2944,LeBron James,203.0,113.0,,1984.0,Akron,Ohio


Unnamed: 0.1,Unnamed: 0,Player,first_name,last_name,born,birth_date,birth_city,college,height,weight,position,dk_position
0,2944,LeBron James,LeBron,James,1984.0,1984-12-30,Akron,,203.0,113.0,Forward,SF


Unnamed: 0.1,Unnamed: 0,Player,height,weight,college,born,birth_city,birth_state
3357,3357,James Harden,196.0,99.0,Arizona State University,1989.0,Los Angeles,California


Unnamed: 0.1,Unnamed: 0,Player,first_name,last_name,born,birth_date,birth_city,college,height,weight,position,dk_position
0,3357,James Harden,James,Harden,1989.0,1989-08-26,Los Angeles,Arizona State University,196.0,99.0,Guard,PG


# SELENIUM AND WEB SCRAPING

In [None]:
def getPage(url):
    """Open ``url`` in Chrome through Selenium and return the page as a BeautifulSoup tree.

    The page is scrolled to the bottom before its source is read, and the
    browser is always closed afterwards, even when loading the page fails.

    Args:
        url: address of the page to load.

    Returns:
        bs4.BeautifulSoup: the parsed page source.
    """
    # ChromeDriverManager installs a chromedriver binary, which avoids:
    # WebDriverException: Message: 'chromedriver' executable needs to be available in the path.
    # NOTE(review): newer Selenium 4 releases appear to no longer accept the driver
    # path as a positional argument - confirm against the installed version.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get(url)
        # Scroll the page in Selenium down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        html = driver.page_source
    finally:
        # Quit in a finally block so a failed page load does not leak a browser process
        driver.quit()
    # Name the parser explicitly so the result does not depend on which parsers are installed
    return BeautifulSoup(html, "html.parser")

In [None]:
# Example with Lebron James stats: load and parse his player page (player id 2544 in the URL)
soup = getPage("https://stats.nba.com/player/2544/")

In [None]:
#soup

In [None]:
# Different ways of scraping the web page, using CSS selectors or tags
# Ex.1 - just the column names: one list of header words per <thead>
columnas = [thead.get_text().split() for thead in soup.select("thead")]
# Ex.2 - all the values: one list of words per <tbody>, and one per table row
lenght = [tbody.get_text().split() for tbody in soup.select("tbody")]
row = [tr.get_text().split() for tr in soup.select("tbody > tr")]
# `years` is meant to hold the athlete's years of activity, which differ for each
# athlete, so that the same scraping code works for most players.
# NOTE(review): this is the number of whitespace-separated words in the second
# <tbody> - confirm that it matches the number of seasons.
years = len(lenght[1])
print(years)

In [None]:
# Five dictionaries, one per statistics table
Traditional_Splits = {}
Advanced_Splits = {}
Misc_Splits = {}
Scoring_Splits = {}
Usage_Splits = {}

# Drop the scraped rows that hold at most one field, to make filling the dictionaries easier
new_row = [fields for fields in row if len(fields) > 1]

def creacolumnas(columnas, empty_dict):
    """Create the keys of a table dictionary.

    Every name in ``columnas`` becomes a key of ``empty_dict`` mapped to its
    own new empty list. The dictionary is modified in place.
    """
    empty_dict.update((name, []) for name in columnas)
        
def creafilas(row, numpari, empty_dict, n_years=None):
    """Fill the column lists of a table dictionary from the scraped rows.

    The rows of table number ``numpari`` are taken to be the ``n_years``
    consecutive entries of ``row`` starting at ``numpari * n_years``. The i-th
    field of each of those rows is appended to the list stored under the i-th
    key of ``empty_dict``.

    Args:
        row: list of rows, each row being a list of field values.
        numpari: zero-based position of the table among the scraped tables.
        empty_dict: dictionary whose keys were created by ``creacolumnas``;
            it is filled in place.
        n_years: number of rows per table. Defaults to the notebook-level
            ``years`` variable.
    """
    if n_years is None:
        n_years = years
    start = numpari * n_years
    for i, key in enumerate(empty_dict):
        for x in range(n_years):
            try:
                empty_dict[key].append(row[start + x][i])
            except IndexError:
                # Missing row or a row with fewer fields than keys: skip that
                # cell (best effort), but let any other error surface
                continue

# Two types of data depending on the players


On the website, the data are organized differently depending on the player. Active players have 5 tables of statistics, while almost all players who are no longer active have only 4 tables, which report different data (table 4 is not reported because it is not very interesting for statistical purposes).

For this reason there are two types of filtering of the columns, so as not to cause errors when downloading data

## Recent players

<img src="inputs/Lebron.png" width="300">

In [None]:
# Extra cleaning of the header labels that become the keys of each dictionary:
# the first label is always dropped, and some other positions are left out
columnas_tabla1 = columnas[0][1:]
columnas_tabla2 = columnas[2][1:11] + columnas[2][13:]
columnas_tabla3 = (
    columnas[4][1:5]
    + columnas[4][6:7]
    + columnas[4][8:9]
    + columnas[4][10:13]
    + columnas[4][15:16]
    + columnas[4][17:]
)
columnas_tabla4 = columnas[6][1:9] + columnas[6][10:]
# Replace the label at position 7 of the fourth table's keys
columnas_tabla4[7] = '%PTS2PTMR'
columnas_tabla5 = columnas[8][1:]
# Print one of them as an example
print(columnas_tabla1)

In [None]:
# Create the keys and then fill the rows of each table; the position of a
# table in this list is also the index of its block of rows
for numpari, (column_names, target) in enumerate([
    (columnas_tabla1, Traditional_Splits),
    (columnas_tabla2, Advanced_Splits),
    (columnas_tabla3, Misc_Splits),
    (columnas_tabla4, Scoring_Splits),
    (columnas_tabla5, Usage_Splits),
]):
    creacolumnas(column_names, target)
    creafilas(new_row, numpari, target)

In [None]:
# Traditional splits as a DataFrame: one column per dictionary key
table1 = pd.DataFrame.from_dict(Traditional_Splits)
table1.head()

In [None]:
# Advanced splits as a DataFrame: one column per dictionary key
table2 = pd.DataFrame.from_dict(Advanced_Splits)
table2.head()

In [None]:
# Misc splits as a DataFrame: one column per dictionary key
table3 = pd.DataFrame.from_dict(Misc_Splits)
table3.head()

In [None]:
# Scoring splits as a DataFrame: one column per dictionary key
table4 = pd.DataFrame.from_dict(Scoring_Splits)
table4.head()

In [None]:
# Usage splits as a DataFrame: one column per dictionary key
table5 = pd.DataFrame.from_dict(Usage_Splits)
table5.head()

## Career players


<img src="inputs/Jordan.jpeg">

In [None]:
# Example with Michael Jordan stats: load and parse his career page (player id 893 in the URL)
soup2 = getPage("https://stats.nba.com/player/893/career/")

In [None]:
# Different ways of scraping the web page, using CSS selectors or tags
# Ex.1 - just the column names: one list of header words per <thead>
columnas = [thead.get_text().split() for thead in soup2.select("thead")]
# Ex.2 - all the values: one list of words per <tbody>, and one per table row
lenght = [tbody.get_text().split() for tbody in soup2.select("tbody")]
row = [tr.get_text().split() for tr in soup2.select("tbody > tr")]
# `years` is meant to hold the athlete's years of activity, which differ for each
# athlete, so that the same scraping code works for most players.
# NOTE(review): this is the number of whitespace-separated words in the second
# <tbody> - confirm that it matches the number of seasons.
years = len(lenght[1])
print(years)

In [None]:
# Header labels of the three career tables, used unmodified as dictionary keys.
# NOTE(review): only the even positions 0, 2 and 4 of `columnas` are used -
# presumably the odd ones are not stat tables; confirm against the page.
columnas_tabla1_carrer =columnas[0]
columnas_tabla2_carrer =columnas[2]
columnas_tabla3_carrer =columnas[4]
# Print one of them as an example
print(columnas_tabla3_carrer)

In [None]:
# Three dictionaries, one per career table
Career_Regular_Season_Stats = {}
Career_Playoffs_Stats = {}
Career_All_Star_Stats = {}
# Drop the scraped rows that hold at most one field, to make filling the dictionaries easier
new_row = [fields for fields in row if len(fields) > 1]

In [None]:
# Create the keys and then fill the rows of each career table; the position of
# a table in this list is also the index of its block of rows
for numpari, (column_names, target) in enumerate([
    (columnas_tabla1_carrer, Career_Regular_Season_Stats),
    (columnas_tabla2_carrer, Career_Playoffs_Stats),
    (columnas_tabla3_carrer, Career_All_Star_Stats),
]):
    creacolumnas(column_names, target)
    creafilas(new_row, numpari, target)

In [None]:
# Career regular-season stats as a DataFrame: one column per dictionary key
table1_carrer = pd.DataFrame.from_dict(Career_Regular_Season_Stats)
table1_carrer.head()

In [None]:
# Career playoffs stats as a DataFrame: one column per dictionary key
table2_carrer = pd.DataFrame.from_dict(Career_Playoffs_Stats)
table2_carrer.head()

In [None]:
# Career All-Star stats as a DataFrame: one column per dictionary key
table3_carrer = pd.DataFrame.from_dict(Career_All_Star_Stats)
table3_carrer.head()

# Merge dataset with Selenium infos

In [None]:
# Keep the listed statistics columns and cast them to float so that their averages can be computed
table1 = table1.loc[:, ['GP','MIN','PTS','FGM','FGA','FG%','3PM','3PA','3P%','FTM','FTA','FT%',
        'OREB','DREB','REB','AST','TOV','STL','BLK','PF']].astype('float64')
# The new selenium dataset: a single row with the mean of every column over all scraped rows
table1_mean_stats = table1.mean().to_frame().T
table1_mean_stats

In [None]:
# `first_name` / `last_name` only exist inside player_name(), and `query_player_update`
# only inside merge_info_api(), so everything this cell needs is rebuilt here from
# the notebook-level variables.
# NOTE(review): table1 was scraped from the hard-coded player/2544 page, so this
# label is only correct when player 1 is that same player - confirm.
first_name, last_name = player1_name
table1_mean_stats['Player'] = f'{first_name} {last_name}'
# Row(s) of the original dataset for this player
query_player = df[df['Player'] == f'{first_name} {last_name}']
# Dataset row enriched with the API fields: same rename, merge and column
# selection as in merge_info_api()
query_player_update = pd.merge(
    query_player,
    player1_table.rename({'player_name': 'Player'}, axis=1),
    on='Player',
    how='outer',
)[['Unnamed: 0', 'Player', 'first_name', 'last_name',
   'born', 'birth_date', 'birth_city', 'college', 'height', 'weight', 'position', 'dk_position']]
# Merge of the two tables
query_player_final = pd.merge(query_player_update, table1_mean_stats, on='Player', how='outer')

In [None]:
# Result compared to the original data.
# `query_player` is a local variable of merge_info_api(), so the original rows are
# selected from `df` here through the player name(s) present in the final table.
display(df[df['Player'].isin(query_player_final['Player'])])
display(query_player_final)

In [None]:
# Keep the listed statistics columns of the career table and cast them to float
# so that their averages can be computed
table1_carrer = table1_carrer.loc[:, ['GP','MIN','PTS','FGM','FGA','FG%','3PM','3PA','3P%','FTM','FTA','FT%',
        'OREB','DREB','REB','AST','TOV','STL','BLK','PF']].astype('float64')

In [None]:
# A single row with the mean of every career statistics column
table1_carrer_mean_stats = table1_carrer.mean().to_frame().T
table1_carrer_mean_stats