In [1]:
# pandas for data manipulation and analysis
import pandas as pd
# numpy to manipulate multi-dimensional arrays and matrices
import numpy as np
# re provides regular expression matching operations
import re
# the OS module in Python provides a way of using operating system dependent functionality
import os
# the requests module allows you to send HTTP requests using Python
import requests
# python-dotenv reads key-value pairs from a .env file and adds them to the environment variables
from dotenv import load_dotenv
#Beautiful Soup is a Python library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup
# The selenium package is used to automate web browser interaction from Python
from selenium import webdriver
# to simplify management of binary drivers for different browsers
#import chromedriver_binary 
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def import_dataset(path='inputs/datasets_Nba_Players.csv', encoding='latin-1'):
    """Load the NBA players CSV and fix the misspelled 'collage' column name.

    Args:
        path: Location of the CSV file. Defaults to the dataset shipped in
            ``inputs/`` so existing zero-argument calls keep working.
        encoding: Text encoding passed to ``pd.read_csv``. Defaults to
            'latin-1', the encoding the notebook has always used for this file.

    Returns:
        pandas.DataFrame: the CSV contents with the column 'collage' renamed
        to 'college' (other columns are left untouched).
    """
    players = pd.read_csv(path, encoding=encoding)
    return players.rename(columns={'collage': 'college'})

In [3]:
def save_api_key_in_env():
    """Load the variables from a .env file and return the APIKEY entry.

    Returns:
        The value of the "APIKEY" environment variable after ``load_dotenv()``
        has run, or None when no such variable is set.
    """
    load_dotenv()
    return os.environ.get("APIKEY")

In [4]:
def url_web_api():
    """Return the endpoint URL of the ProBasketball API player resource.

    This function only hands back the URL; it does not contact the server.
    An exploratory ``requests.post(url)`` used to live here so the status code
    could be eyeballed (2xx means OK), but its response was never used and the
    call had no timeout, so it has been dropped. Check the status on the real
    request made in ``call_player`` instead.

    Returns:
        str: the player endpoint URL.
    """
    return 'http://api.probasketballapi.com/player'

In [5]:
def player_name():
    """Ask on stdin for a player's first name, then for the last name.

    Returns:
        tuple: ``(first_name, last_name)`` exactly as typed by the user.
    """
    prompts = (
        'Insert the name of an NBA player',
        'Insert the last_name of an NBA player',
    )
    # The prompts are asked in order: first name, then last name.
    answers = [input(prompt) for prompt in prompts]
    return answers[0], answers[1]

In [6]:
def call_player(player_name):
    """Look up one player in the web API and return it as a one-row DataFrame.

    Reads the module-level ``apikey`` and ``url`` variables set by earlier cells.

    Args:
        player_name: ``(first_name, last_name)`` pair, e.g. the tuple returned
            by ``player_name()``.

    Returns:
        pandas.DataFrame: a single row (index 0) built from the first record
        of the JSON response.

    Raises:
        LookupError: the response contained no usable player record.
        requests.RequestException: the HTTP request itself failed or timed out.
    """
    first_name, last_name = player_name[0], player_name[1]
    query = {'api_key': f'{apikey}', 'first_name': f'{first_name}', 'last_name': f'{last_name}'}
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    r = requests.post(url, data=query, timeout=30)
    try:
        player_api = r.json()[0]
    except (ValueError, IndexError, KeyError, TypeError) as err:
        # The old handler was a bare `except:` that retried with `call_player()`
        # (no argument), which could only ever raise TypeError. Re-prompting here
        # is not possible either (the parameter shadows the `player_name`
        # function) and would leave the caller holding a stale name, so report
        # the failure and let the caller ask for another player.
        raise LookupError('Player not in database. Try another one') from err
    return pd.DataFrame(player_api, index=[0])

In [58]:
def merge_info_api(player_name, player_table):
    """Combine a player's CSV row(s) with the record fetched from the web API.

    Reads the module-level ``df`` (the CSV dataset).

    Args:
        player_name: ``(first_name, last_name)`` pair; joined with a single
            space to match the 'Player' column of ``df``.
        player_table: DataFrame from the API holding a 'player_name' column.

    Returns:
        pandas.DataFrame: outer merge of both sources on 'Player', reduced to
        the identity and biography columns listed below.
    """
    full_name = f'{player_name[0]} {player_name[1]}'
    csv_rows = df.loc[df['Player'] == full_name]
    # Give the API frame the same key column name as the CSV before joining.
    api_rows = player_table.rename(columns={'player_name': 'Player'})
    combined = csv_rows.merge(api_rows, on='Player', how='outer')
    # Keep one copy of each piece of information; the rest is redundant.
    kept_columns = [
        'Unnamed: 0', 'Player', 'first_name', 'last_name',
        'born', 'birth_date', 'birth_city', 'college',
        'height', 'weight', 'position', 'dk_position',
    ]
    return combined[kept_columns]

In [8]:
def getPage(url):
    """Fetch a page with Selenium-driven Chrome and return it parsed by BeautifulSoup.

    ``ChromeDriverManager().install()`` supplies the driver path, which avoids
    "WebDriverException: 'chromedriver' executable needs to be available in
    the path". After loading, the page is scrolled to the bottom before its
    source is read.

    Args:
        url: Address of the page to load.

    Returns:
        bs4.BeautifulSoup: parsed document built from ``driver.page_source``.
    """
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        html = driver.page_source
    finally:
        # Always close the browser, even when loading or scripting fails;
        # otherwise every failed call leaves a Chrome process behind.
        driver.quit()
    # NOTE(review): no parser is named, so bs4 picks one from what is installed;
    # consider pinning it so results are the same on every machine.
    return BeautifulSoup(html)

In [9]:
# Different ways of scraping the web page, using CSS selectors or tags
# Ex.1 - just the column names
def scraping_columnas(soup):
    """Collect the whitespace-separated tokens of every ``thead`` element.

    Args:
        soup: Parsed document exposing ``select``.

    Returns:
        list: one list of tokens per ``thead`` match, in document order.
    """
    header_tokens = []
    for header in soup.select("thead"):
        header_tokens.append(header.text.split())
    return header_tokens
def scraping_lenght(soup):
    """Collect the whitespace-separated tokens of every ``tbody`` element.

    Each ``tbody`` contributes a single flat list holding all values of that
    table body.

    Args:
        soup: Parsed document exposing ``select``.

    Returns:
        list: one list of tokens per ``tbody`` match, in document order.
    """
    body_tokens = []
    for body in soup.select("tbody"):
        body_tokens.append(body.text.split())
    return body_tokens
def scraping_row(soup):
    """Collect the whitespace-separated tokens of every table-body row.

    Args:
        soup: Parsed document exposing ``select``.

    Returns:
        list: one list of tokens per ``tbody > tr`` match, in document order.
    """
    row_tokens = []
    for table_row in soup.select("tbody > tr"):
        row_tokens.append(table_row.text.split())
    return row_tokens
# The `years` variable stores how many seasons an athlete has been active,
# which differs from athlete to athlete. This keeps the scraping generic enough to work for most players.

In [10]:
def cleaning_variable(row):
    """Drop entries that hold fewer than two items.

    Removing the empty and single-item entries makes the later
    dictionary-building step simpler.

    Args:
        row: Iterable of sized entries (e.g. token lists, one per table row).

    Returns:
        list: the entries whose length is greater than 1, in original order.
    """
    return [entry for entry in row if len(entry) > 1]

def creacolumnas(columnas, empty_dict):
    """Register every column name as a key of ``empty_dict``.

    Each key is bound to its own fresh empty list; a key that already exists
    is reset to an empty list. The dictionary is modified in place.

    Args:
        columnas: Iterable of column names.
        empty_dict: Dictionary that receives the keys.
    """
    empty_dict.update((name, []) for name in columnas)
        
def creafilas(row, numpari, empty_dict, years):
    """Fill the column lists of ``empty_dict`` from a block of scraped rows.

    The i-th key of ``empty_dict`` receives the i-th item of each row in the
    window ``row[numpari * years : numpari * years + years]``. Positions that
    do not exist (window past the end of ``row``, or a row shorter than the
    number of keys) are skipped, so ``years`` may safely overestimate the
    number of rows.

    Args:
        row: Sequence of rows, each an indexable sequence of cell values.
        numpari: Zero-based index of the window (table) to read.
        empty_dict: Dictionary of column name -> list, appended to in place.
        years: Maximum number of rows in one window.
    """
    offset = numpari * years
    for i, value in enumerate(empty_dict.keys()):
        for x in range(years):
            try:
                empty_dict[value].append(row[x + offset][i])
            except IndexError:
                # Only a missing row/cell is expected here; anything else
                # (including Ctrl-C) must not be silently swallowed.
                continue

In [11]:
def cleaning_tabla1(columnas, number):
    """Return the first header's tokens with the leading ``number`` items removed.

    This is an extra cleaning step for the keys that go into each dictionary.

    Args:
        columnas: Sequence of header token lists; only the first is used.
        number: How many leading tokens to skip.

    Returns:
        The tail of ``columnas[0]`` starting at position ``number``.
    """
    first_header = columnas[0]
    return first_header[number:]

In [12]:
def create_selenium_player(number):
    """Scrape a stats.nba.com player page into a DataFrame.

    Args:
        number: Value appended to ``https://stats.nba.com/player/``; the
            notebook passes either a numeric id or a string such as
            ``'<id>/career/'``.

    Returns:
        pandas.DataFrame: one column per header token, filled from the
        scraped table rows.
    """
    soup = getPage(f"https://stats.nba.com/player/{number}")
    print(number)
    headers = scraping_columnas(soup)
    bodies = scraping_lenght(soup)
    rows = cleaning_variable(scraping_row(soup))
    # Token count of the second tbody, used as the upper bound on rows to read;
    # creafilas skips positions that do not exist.
    row_limit = len(bodies[1])
    # 'career' pages keep every header token; other pages drop the first one.
    skip = 0 if 'career' in str(number) else 1
    stats = {}
    creacolumnas(cleaning_tabla1(headers, skip), stats)
    creafilas(rows, 0, stats, row_limit)
    return pd.DataFrame(stats)

In [13]:
def merge_selenium_table(table):
    """Average the per-game statistic columns into a single-row DataFrame.

    Args:
        table: DataFrame containing (at least) the statistic columns listed
            below, with values convertible to float.

    Returns:
        pandas.DataFrame: one row (index 0) holding the mean of each
        statistic column.
    """
    stat_columns = [
        'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM',
        'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF',
    ]
    # Values must be floats before they can be averaged.
    numeric = table[stat_columns].astype(float)
    # Series of means -> one-column frame -> transposed into a single row.
    return numeric.mean().to_frame().T

In [34]:
def create_new_columns_to_merge_dataset(players_name, table):
    """Return a copy of ``table`` with a 'Player' column holding the full name.

    The input frame is left untouched, so re-running the cell or re-displaying
    the original frame gives the same result every time.

    Args:
        players_name: ``(first_name, last_name)`` pair; joined with one space.
        table: DataFrame to tag with the player's name.

    Returns:
        pandas.DataFrame: new frame with 'Player' set on every row (added as
        the last column, or overwritten if it already exists).
    """
    return table.assign(Player=f'{players_name[0]} {players_name[1]}')

In [49]:
def merge_info_selenium(table_before_selenium, table_selenium):
    """Outer-join the biography table with the scraped statistics on 'Player'.

    Args:
        table_before_selenium: DataFrame with a 'Player' column (left side).
        table_selenium: DataFrame with a 'Player' column (right side).

    Returns:
        pandas.DataFrame: the outer merge of both frames keyed on 'Player'.
    """
    return table_before_selenium.merge(table_selenium, on='Player', how='outer')

In [15]:
# Load the players CSV into the module-level `df`; merge_info_api reads it as a global.
df = import_dataset()

In [16]:
# Read the API key from the environment (.env); call_player reads `apikey` as a global.
apikey = save_api_key_in_env()

In [17]:
# Endpoint of the player API; call_player posts its query to this global `url`.
url = url_web_api()

In [18]:
# Player 1: ask for the name interactively, then fetch the matching API record.
players1_name = player_name()
players1_table = call_player(players1_name)

Insert the name of an NBA playerLeBron
Insert the last_name of an NBA playerJames


In [19]:
# Player 2: ask for the name interactively, then fetch the matching API record.
players2_name = player_name()
players2_table = call_player(players2_name)

Insert the name of an NBA playerJames
Insert the last_name of an NBA playerHarden


In [59]:
# Combine player 1's CSV row with the API record (outer merge on 'Player').
player1_table_api = merge_info_api(players1_name, players1_table)

In [60]:
# Combine player 2's CSV row with the API record (outer merge on 'Player').
player2_table_api = merge_info_api(players2_name, players2_table)

# Selenium

In [22]:
# 2544 is inserted into https://stats.nba.com/player/{number}; it is hard-coded
# and not derived from players1_name, so keep the two in sync by hand.
table_player1 = create_selenium_player(2544)

[WDM] - Current google-chrome version is 81.0.4044
[WDM] - Get LATEST driver version for 81.0.4044


 


[WDM] - Driver [/home/must4in3/.wdm/drivers/chromedriver/linux64/81.0.4044.138/chromedriver] found in cache


2544


In [23]:
# Preview the scraped rows for player 1 (one row per season).
table_player1.head()

Unnamed: 0,Year,TEAM,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,2019-20,LAL,60,34.9,25.7,9.8,19.6,49.8,2.2,6.4,...,7.9,10.6,4.0,1.2,0.5,1.8,52.3,42,13,8.1
1,2018-19,LAL,55,35.2,27.4,10.1,19.9,51.0,2.0,5.9,...,8.5,8.3,3.6,1.3,0.6,1.7,52.0,32,8,2.1
2,2017-18,CLE,82,36.9,27.5,10.5,19.3,54.2,1.8,5.0,...,8.6,9.1,4.2,1.4,0.9,1.7,54.1,52,18,1.3
3,2016-17,CLE,74,37.8,26.4,9.9,18.2,54.8,1.7,4.6,...,8.6,8.7,4.1,1.2,0.6,1.8,51.3,42,13,6.5
4,2015-16,CLE,76,35.6,25.3,9.7,18.6,52.0,1.1,3.7,...,7.4,6.8,3.3,1.4,0.6,1.9,47.1,28,3,8.1


In [25]:
# The '/career/' suffix makes create_selenium_player keep every header token.
# NOTE(review): the rows scraped for id 893 are WAS/CHI seasons 1995-96..2002-03
# with ages 33-40, which does not fit players2_name ('James Harden', born 1989
# per the merged table). The id looks wrong for this player - confirm before
# trusting the player 2 statistics.
table_player2 = create_selenium_player('893/career/')

[WDM] - Current google-chrome version is 81.0.4044
[WDM] - Get LATEST driver version for 81.0.4044


 


[WDM] - Driver [/home/must4in3/.wdm/drivers/chromedriver/linux64/81.0.4044.138/chromedriver] found in cache


893/career/


In [26]:
# Preview the rows scraped from the career page; unlike table_player1 it has
# 'Season', 'AGE' and 'GS' columns.
table_player2.head()

Unnamed: 0,Season,TEAM,AGE,GP,GS,MIN,PTS,FGM,FGA,FG%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF
0,2002-03,WAS,40,82,67,36.9,20.0,8.3,18.6,44.5,...,4.0,82.1,0.9,5.2,6.1,3.8,1.5,0.5,2.1,2.1
1,2001-02,WAS,39,60,53,34.8,22.9,9.2,22.1,41.6,...,5.6,79.0,0.8,4.8,5.7,5.2,1.4,0.4,2.7,2.0
2,1997-98,CHI,35,82,82,38.8,28.7,10.7,23.1,46.5,...,8.8,78.4,1.6,4.2,5.8,3.5,1.7,0.5,2.3,1.8
3,1996-97,CHI,34,82,82,37.9,29.6,11.2,23.1,48.6,...,7.0,83.3,1.4,4.5,5.9,4.3,1.7,0.5,2.0,1.9
4,1995-96,CHI,33,82,82,37.7,30.4,11.2,22.6,49.5,...,8.0,83.4,1.8,4.8,6.6,4.3,2.2,0.5,2.4,2.4


# Merge dataset with Selenium infos

In [39]:
# Collapse player 1's per-season rows into a single row of column means.
table_player1_selenium = merge_selenium_table(table_player1)

In [40]:
# Show the single averaged row for player 1.
table_player1_selenium.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF
0,74.0,38.282353,27.070588,9.876471,19.588235,50.588235,1.494118,4.335294,34.382353,5.823529,7.958824,73.176471,1.194118,6.282353,7.452941,7.435294,3.511765,1.588235,0.741176,1.835294


In [41]:
# Collapse player 2's per-season rows into a single row of column means.
table_player2_selenium = merge_selenium_table(table_player2)

In [42]:
# Show the single averaged row for player 2.
table_player2_selenium.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF
0,71.466667,37.566667,29.453333,11.093333,22.7,48.826667,0.54,1.62,28.393333,6.713333,8.066667,83.093333,1.513333,4.593333,6.12,5.14,2.686667,2.293333,0.833333,2.593333


In [43]:
# Add the 'Player' key column so the averaged stats can be merged on it.
table_player1_selenium = create_new_columns_to_merge_dataset(players1_name, table_player1_selenium)

In [44]:
# Add the 'Player' key column so the averaged stats can be merged on it.
# NOTE(review): these stats were scraped with id 893 (seasons 1995-96..2002-03),
# which does not seem to match players2_name - confirm the id.
table_player2_selenium = create_new_columns_to_merge_dataset(players2_name, table_player2_selenium)

In [47]:
# Check that the 'Player' column is now present for player 1.
table_player1_selenium.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,Player
0,74.0,38.282353,27.070588,9.876471,19.588235,50.588235,1.494118,4.335294,34.382353,5.823529,...,73.176471,1.194118,6.282353,7.452941,7.435294,3.511765,1.588235,0.741176,1.835294,LeBron James


In [48]:
# Check that the 'Player' column is now present for player 2.
table_player2_selenium.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,Player
0,71.466667,37.566667,29.453333,11.093333,22.7,48.826667,0.54,1.62,28.393333,6.713333,...,83.093333,1.513333,4.593333,6.12,5.14,2.686667,2.293333,0.833333,2.593333,James Harden


In [63]:
# Join each player's biography row (CSV + API) with the averaged scraped
# statistics on 'Player', then preview player 1's final row.
table_player1_final = merge_info_selenium(player1_table_api, table_player1_selenium)
table_player2_final = merge_info_selenium(player2_table_api, table_player2_selenium)
table_player1_final.head()

Unnamed: 0.1,Unnamed: 0,Player,first_name,last_name,born,birth_date,birth_city,college,height,weight,...,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF
0,2944,LeBron James,LeBron,James,1984.0,1984-12-30,Akron,,203.0,113.0,...,7.958824,73.176471,1.194118,6.282353,7.452941,7.435294,3.511765,1.588235,0.741176,1.835294


In [64]:
# Preview player 2's final merged row.
table_player2_final.head()

Unnamed: 0.1,Unnamed: 0,Player,first_name,last_name,born,birth_date,birth_city,college,height,weight,...,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF
0,3357,James Harden,James,Harden,1989.0,1989-08-26,Los Angeles,Arizona State University,196.0,99.0,...,8.066667,83.093333,1.513333,4.593333,6.12,5.14,2.686667,2.293333,0.833333,2.593333
