# Dada la página de un jugador, extrae los datos usando pandas, bs4 y requests.

Ejemplo Omar Linares: "https://www.beisbolcubano.cu/estadisticas/Jugador?idJugador=2301"

In [1]:
# Import packages
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [43]:
# Auxiliary functions

# Filter the data
def filter_data(data, keywords):
    """
    Keep only the rows whose `Etapa` value contains every keyword.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named `Etapa`.
    keywords : iterable of str
        Patterns that must all be found in `Etapa`. They are handed to
        `Series.str.contains` unchanged, so they are interpreted as regular
        expressions (the pandas default).

    Returns
    -------
    pandas.DataFrame
        The rows of `data` matching all keywords. Rows whose `Etapa` is
        missing never match. With no keywords, `data` is returned as is.
    """
    for keyword in keywords:
        # na=False: a missing `Etapa` counts as "no match". Without it the
        # mask contains NaN and boolean indexing raises.
        data = data[data['Etapa'].str.contains(keyword, na=False)]
    return data

In [51]:
# Example player used to try out the scraper: Omar Linares
nombre = "Omar Linares"
# Player id as it appears in the `idJugador` query parameter of the player page URL
idJugador = "2301"

def retrive_player_stats(idJugador, nombre, relevant_tables=(52, 73, 91)):
    """
    Scrape a player's page and return the rows whose `Etapa` mentions both
    "Serie Nacional" and "Regular".

    Parameters
    ----------
    idJugador : str
        Player id, inserted into the `idJugador` query parameter of the URL
        and stored in the `idJugador` column of the result.
    nombre : str
        Player name, stored verbatim in the `Nombre` column of the result.
    relevant_tables : sequence of int, optional
        Positions, within the list returned by `pd.read_html`, of the tables
        to combine. Defaults to the positions previously hard-coded here
        (52, 73 and 91).

    Returns
    -------
    pandas.DataFrame
        One row per table row that passed the filter, with `idJugador` and
        `Nombre` as the first two columns and `Etapa` renamed to `Temporada`.

    Raises
    ------
    IndexError
        If the page yields fewer tables than `relevant_tables` refers to.
    """
    url = f'https://www.beisbolcubano.cu/estadisticas/Jugador?idJugador={idJugador}'
    tables = pd.read_html(url)

    try:
        selected = [tables[i] for i in relevant_tables]
    except IndexError as err:
        # Same exception type as before, but with a message that points at
        # the likely cause instead of a bare "list index out of range".
        raise IndexError(
            f"Expected tables at positions {list(relevant_tables)} but the page "
            f"only yielded {len(tables)} tables; the page layout may have changed."
        ) from err

    # Place the tables side by side. The alignment is on the row index, and
    # join='inner' keeps only the row labels present in every table.
    stats = pd.concat(selected, axis=1, join='inner')
    # Use the first row as column names
    stats.columns = stats.iloc[0]
    # Drop the first row
    stats = stats.drop(stats.index[0])
    # Eliminate duplicate columns, keeping the first occurrence of each name
    stats = stats.loc[:, ~stats.columns.duplicated(keep='first')]
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one (SettingWithCopyWarning).
    stats = filter_data(stats, ["Serie Nacional", "Regular"]).copy()
    # Reduce "Etapa" to its first run of digits, i.e. the year of the season
    # (e.g. 2019-2020 -> 2019). Raw string: `\d` is not a valid str escape.
    stats['Etapa'] = stats['Etapa'].str.extract(r'(\d+)', expand=False)
    # Add `Nombre` and `idJugador` columns
    stats['Nombre'] = nombre
    stats['idJugador'] = idJugador
    # Reorder columns so that `idJugador` and `Nombre` are the first two columns
    stats = stats[['idJugador', 'Nombre'] + [col for col in stats.columns if col not in ['Nombre', 'idJugador']]]

    # Rename "Etapa" => "Temporada"
    stats = stats.rename(columns={'Etapa': 'Temporada'})

    return stats

# Fetch and display the filtered stats for the example player defined above
retrive_player_stats(idJugador, nombre)

Unnamed: 0,idJugador,Nombre,Temporada,Equipo,JJ,CB,VB,CA,H,AVE,...,IJ,O,A,E,TL,FAVE,DP,TP,PB,FR
2,2301,Omar Linares,1991,VEG,45,193,140,44,54,0.386,...,1145,52,111,7,170,0.959,12,0,0,3.84
4,2301,Omar Linares,1992,PRI,59,258,184,63,82,0.446,...,1408,47,127,6,180,0.967,15,0,0,3.34
7,2301,Omar Linares,1993,PRI,55,241,180,58,68,0.378,...,1256,36,111,3,150,0.98,13,0,0,3.16
10,2301,Omar Linares,1994,PRI,58,255,189,63,66,0.349,...,1313,40,111,7,158,0.956,15,0,0,3.11
13,2301,Omar Linares,1995,PRI,58,259,181,63,73,0.403,...,1252,43,88,9,140,0.936,12,0,0,2.83
17,2301,Omar Linares,1996,PRI,32,139,106,29,40,0.377,...,594,19,49,6,74,0.919,8,0,0,3.09
21,2301,Omar Linares,1997,PRI,45,182,138,38,52,0.377,...,1011,25,68,10,103,0.903,7,0,0,2.48
23,2301,Omar Linares,1998,PRI,25,113,87,18,25,0.287,...,516,10,37,6,53,0.887,6,0,0,2.46
25,2301,Omar Linares,1999,PRI,77,322,246,41,78,0.317,...,1947,33,135,12,180,0.933,14,0,0,2.33
27,2301,Omar Linares,2000,PRI,58,260,182,47,71,0.39,...,1383,36,135,6,177,0.966,15,0,0,3.34


In [52]:
# Validate the scraper against a different player (id 3568, Alexander Malleta)
retrive_player_stats("3568", "Alexander Malleta")

Unnamed: 0,idJugador,Nombre,Temporada,Equipo,JJ,CB,VB,CA,H,AVE,...,IJ,O,A,E,TL,FAVE,DP,TP,PB,FR
1,3568,Alexander Malleta,1995,IND,16,57,52,7,16,0.308,...,111,40,2,0,42,1.000,4,0,0,10.22
2,3568,Alexander Malleta,1996,IND,33,111,98,19,24,0.245,...,586,213,4,5,222,.977,21,0,0,10.0
5,3568,Alexander Malleta,1998,MET,30,96,84,9,20,0.238,...,524,119,0,2,121,.983,7,0,0,6.13
6,3568,Alexander Malleta,2000,IND,65,214,192,31,45,0.234,...,821,159,6,2,167,.988,13,0,0,5.43
8,3568,Alexander Malleta,2001,IND,35,78,69,9,21,0.304,...,6,0,0,0,0,-,0,0,0,0.0
10,3568,Alexander Malleta,2002,MET,85,332,282,51,88,0.312,...,138,9,1,2,12,.833,0,0,0,1.96
13,3568,Alexander Malleta,2003,MET,82,327,281,39,81,0.288,...,24,0,0,0,0,-,0,0,0,0.0
16,3568,Alexander Malleta,2004,IND,89,392,321,49,106,0.33,...,54,21,0,0,21,1.000,3,0,0,10.5
20,3568,Alexander Malleta,2005,IND,76,302,238,51,75,0.315,...,360,121,6,1,128,.992,12,0,0,9.52
22,3568,Alexander Malleta,2006,IND,89,373,282,41,80,0.284,...,69,28,0,0,28,1.000,4,0,0,10.96


TODO:
- [ ] Hay temporadas donde el jugador jugó en dos equipos. 
  - Solución: Adicionar las estadísticas contables y recalcular los promedios. Cambiar el nombre del equipo a "Multi" o algo así.

# Obtener los ids de los jugadores

In [56]:
# Scrape the "labor_vida" statistics page
url = "https://www.beisbolcubano.cu/estadisticas/labor_vida?act=0"
# Download with requests and parse with BeautifulSoup.
# The timeout keeps the cell from hanging indefinitely if the server never
# answers, and raise_for_status() stops us from parsing an HTTP error page
# as if it were the statistics page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

In [57]:
# Echo the parsed document so the notebook displays the page's HTML for inspection
soup


<!DOCTYPE html>

<html lang="es">
<head><link href="/DXR.axd?r=1_74,1_68,1_73,1_254,1_253,1_210-OyCbp" rel="stylesheet" type="text/css"/><link href="/DXR.axd?r=1_207,1_209,1_206-OyCbp" rel="stylesheet" type="text/css"/>
<!-- PAGE TITLE -->
<title>
	SNB | Labor de por Vida
</title><meta content="text/html;charset=utf-8" http-equiv="Content-Type"/><meta content="document" name="resource-type"/><meta content="1 days" name="revisit-after"/><meta content="Internet" name="classification"/><meta content="all" name="robots"/><meta content="all" name="googlebot"/><meta content="Global" name="distribution"/><meta content="General" name="rating"/><meta content="es" name="language"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><meta content="SITIO WEB OFICIAL BEISBOL CUBANO, INDER CUBA" name="description"/><meta content="SITIO WEB OFICIAL BEISBOL CUBANO, INDER CUBA, Deporte, Inder, Educacion Fisica, Cultura Fisica, Deporte Cubano" name="keywords"/>
<style>
        .tab_D

In [60]:
# NOTE(review): scratch calculation; what the numbers represent is not documented in this notebook
(80 * 12) / 16

60.0

In [61]:
# NOTE(review): scratch calculation; what the numbers represent is not documented in this notebook
60 * 4

240

In [62]:
# NOTE(review): scratch calculation; what the numbers represent is not documented in this notebook
(80 * 12)

960

In [64]:
# NOTE(review): scratch calculation; what the numbers represent is not documented in this notebook
1000 / 8 

125.0