In [2]:
%pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime

In [5]:
def sys_message(texto: str, cor: str = 'vermelho') -> None:
    '''Print a message to stdout wrapped in an ANSI color escape sequence.

    Args:
        texto: Message to print.
        cor: Color name: 'vermelho' (red, the default), 'verde' (green) or
            'amarelo' (yellow). Unknown names fall back to red.
    '''
    # ANSI foreground color codes; add entries here to support more colors.
    cores = {
        'vermelho': '\033[31m',
        'verde': '\033[32m',
        'amarelo': '\033[33m',
    }
    reset = '\033[0m'  # switches the terminal back to its default color
    print(cores.get(cor, cores['vermelho']) + texto + reset)
# Usage example: prints the text wrapped in the red ANSI escape codes
sys_message("Este texto está vermelho")

[31mEste texto está vermelho[0m


In [6]:
def get_soup(url: str, timeout: float = 10) -> BeautifulSoup | None:
    '''Download ``url`` and return it parsed as a BeautifulSoup object.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed page, or None (after printing a message) when ``url`` is
        not a string, the request fails, or the response status is not 200.
    '''
    if not isinstance(url, str):
        sys_message('URL must be a string')
        return None

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        page = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors, timeouts, malformed URLs, ...: report and keep going.
        sys_message(f'Request failed: {error}')
        return None

    if page.status_code != 200:
        sys_message(f'Error {page.status_code}')
        return None
    return BeautifulSoup(page.content, 'html.parser')

In [7]:
def verify_page(url: str) -> BeautifulSoup | None:
    '''Fetch ``url`` and return the parsed page only when it is usable.

    Returns None when ``get_soup`` returns None, or (after printing
    'Data not found') when the page contains an ``h1`` element with the
    class ``Error404__Title``.
    '''
    soup = get_soup(url)
    if soup is None:
        return None

    not_found_title = soup.find('h1', class_='Error404__Title')
    if not_found_title is not None:
        sys_message('Data not found')
        return None

    return soup

In [8]:
def get_games(url: str) -> list | None:
    '''Get the IDs of all games from a specific URL'''
    
    page = verify_page(url)
    
    if page is None:
        return None
    elif not page.find('h4', class_='n5 tc pv6 clr-gray-05') is None:
        sys_message('No data found')
        return None
    
    links = page.find_all('a', class_='AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100 mr2')
    ids = [l['href'].replace('/futebol/partida-estatisticas/_/jogoId/','') for l in links if 'partida-estatisticas' in l['href']]
    if not ids:
        sys_message('No links found')
        return None
    return ids

In [9]:
def get_all_games(from_date: int, to_date: int = 0) -> dict | None:
    '''
    Gets all links to games within a specific date range.
    If no end date is given, it is considered the current date.
    Use from_date < to_date .
    '''
    today = int(datetime.date.today().strftime('%Y%m%d'))
    if to_date == 0:
        to_date = today
    elif from_date > today or to_date > today:
        sys_message('Date is greater than today. This function returns only past games for security reasons.')
        return None

    if from_date > to_date:
        sys_message('Initial date is greater than end date')
        return None
    
    pages = {}
    for date in range(from_date, to_date+1):
        url = f'https://www.espn.com.br/futebol/resultados/_/data/{date}/liga/bra.1'
        print(f'Getting data from {url}:')
        links = get_games(url)
        if links is not None:
            print(f'{len(links)} links found')
        pages[date] = links
    return pages

In [10]:
# Demo: collect game IDs for every day from 2024-10-23 up to today (one live request per day)
print(get_all_games(20241023))

Getting data from https://www.espn.com.br/futebol/resultados/_/data/20241023/liga/bra.1:
[31mNo data found[0m
Getting data from https://www.espn.com.br/futebol/resultados/_/data/20241024/liga/bra.1:
1 links found
Getting data from https://www.espn.com.br/futebol/resultados/_/data/20241025/liga/bra.1:
[31mNo data found[0m
Getting data from https://www.espn.com.br/futebol/resultados/_/data/20241026/liga/bra.1:
8 links found
Getting data from https://www.espn.com.br/futebol/resultados/_/data/20241027/liga/bra.1:
[31mNo data found[0m
Getting data from https://www.espn.com.br/futebol/resultados/_/data/20241028/liga/bra.1:
2 links found
{20241023: None, 20241024: ['699530'], 20241025: None, 20241026: ['699654', '699655', '699656', '699659', '699650', '699651', '699657', '699652'], 20241027: None, 20241028: ['699653', '699658']}


In [11]:
def get_datas_from_estatisticas(id: str) -> dict | None:
    '''Get the stats from a game by the page Estatisticas'''

    if type(id) != str:
        sys_message('ID must be a string')
        return None

    url = f'https://www.espn.com.br/futebol/partida-estatisticas/_/jogoId/{id}'
    page = verify_page(url)
    
    team1 = {}
    team2 = {}

    # Finding team names
    aux = page.find_all('h2', class_='ScoreCell__TeamName ScoreCell__TeamName--displayName db')
    aux = [name.text for name in aux]
    
    team1['time'], team2['time'] = aux[0], aux[1]

    # Finding gols
    aux = page.find_all('div', class_='Gamestrip__Score relative tc w-100 fw-heavy-900 h2 clr-gray-01')
    aux = [gol.text for gol in aux]
    for i in range(len(aux)):
        index = aux[i].find('V') # Removing extra information
        if index != -1:
            aux[i] = aux[i][:index]
    team1['gols'], team2['gols'] = aux[0], aux[1]

    # Finding team stats
    labels = ['chute a gol', 'chute', 'faltas', 'amarelos', 'vermelhos', 'escanteios', 'defesas']
    aux = page.find_all('span', class_='bLeWt ZfQkn JoGSb hsDdd ICQCm')
    aux = [stat.text for stat in aux]
    
    count = 0
    for i in range(0, len(labels)*2, 2):
        if labels[count] in ['chute a gol', 'chute', 'defesas']: # Other information can be extracted from the game's plays
            team1[labels[count]], team2[labels[count]] = aux[i], aux[i+1]
        count += 1
    
    # Finding possession
    aux = page.find('span', class_='bLeWt ZfQkn JoGSb VZTD pgHdv uHRs')
    team1['posse'] = aux.text.replace('%','')

    aux = page.find('span', class_='bLeWt ZfQkn JoGSb VZTD nljvg')
    team2['posse'] = aux.text.replace('%','')

    # Finding general information about the game
    general_inf = {} 

    aux = page.find('div', class_='ScoreCell__GameNote di')
    general_inf['campeonato'] = aux.text

    aux = page.find('div', class_='n6 clr-gray-03 GameInfo__Location__Name--noImg')
    general_inf['estadio'] = aux.text

    aux = page.find('div', class_='n8 GameInfo__Meta')
    general_inf['data'] = aux.find('span').text

    aux = page.find('span', class_='Location__Text')
    general_inf['local'] = aux.text

    aux = page.find('div', class_='Attendance__Numbers')
    general_inf['audiencia'] = aux.text.replace('Attendance:','')

    aux = page.find('li', class_='GameInfo__List__Item')
    general_inf['arbitro'] = aux.text

    general_inf['mandante'] = team1
    general_inf['visitante'] = team2

    return general_inf
    

In [12]:
# Demo: flatten the nested match dict into a single-row DataFrame
# (nested team keys become "mandante.*" / "visitante.*" columns)
result = pd.json_normalize(get_datas_from_estatisticas('699615'))
result

Unnamed: 0,campeonato,estadio,data,local,audiencia,arbitro,mandante.time,mandante.gols,mandante.chute a gol,mandante.chute,mandante.defesas,mandante.posse,visitante.time,visitante.gols,visitante.chute a gol,visitante.chute,visitante.defesas,visitante.posse
0,2024 Brasileiro Serie A,Castelão,"20:00, 21 de setembro, 2024","Fortaleza, Brasil",24774,Raphael Claus,Fortaleza,4,7,11,3,36.5,Bahia,1,4,9,3,63.5


In [13]:
'''def get_data_from_comment(index: int, text: str, pattern: str, minute: str) -> None:
    key = f'lance-{index}'
    datas = {key:{'jogador-1':'', 'jogador-2':'', 'time':'', 'tipo':'', 'descricao':'', 'minuto':''}}

    datas[key]['minuto'] = minute
    match(pattern):
        case "Falta cometida":
            point = text.find('(') # Dot where is the team name
            datas[key]['jogador-1'] = text[19:point-1]
            datas[key]['time'] = text[point+1:len(text)-2] # removing the parentheses
            datas[key]['tipo'] = 'FALTA-FEITA'

        case "sofre uma falta":
            point = text.find('(')
            datas[key]['jogador-2'] = text[:point-1]
            datas[key]['time'] = text[point+1:text.find(')')]
            datas[key]['tipo'] = 'FALTA-SOFRIDA'
    
        case "Oportunidade perdida":
            point = text.find('(')
            datas[key]['jogador-1'] = text[21:point-1]
            datas[key]['time'] = text[point+1:text.find(')')]
            datas[key]['tipo'] = 'GOL-PERDIDO'
            description = text[text.find(','):text.find('.')]
            if 'cabeça' in description: # Checking the type of shot
                datas[key]['descricao'] = 'CABECEIO'
            elif 'pé direito' in description: 
                datas[key]['descricao'] = 'CHUTE (pé direito)'
            else:
                datas[key]['descricao'] = 'CHUTE (pé esquerdo)'
            
        case "Escanteio":
            point = text.find('.')
            datas[key]['time'] = text[11:point]
            datas[key]['jogador-2'] = text[point+13:-1]
            datas[key]['tipo'] = 'ESCANTEIO'

        case "Impedimento":
            point = text.find('.')
            datas[key]['jogador-1'] = text[point+2:text.find('tentou')-1]
            datas[key]['jogador-2'] = text[text.find('encontrou')+10:text.find(' em posição')]
            datas[key]['time'] = text[13:point]
            datas[key]['tipo'] = 'IMPEDIMENTO'
            
        case "cartão":
            point = text.find('(')
            datas[key]["jogador-1"] = text[:point-1]
            datas[key]['time'] = text[point+1:text.find(')')]
            if 'por' in text:
                datas[key]['descricao'] = text[text.find('por', point):-1]
            if 'amarelo' in text:
                datas[key]['tipo'] = 'CARTAO-AMARELO'
            elif 'vermelho' in text:
                datas[key]['tipo'] = 'CARTAO-VERMELHO'
                
        case "Gol":
            point = text.find(' (')
            datas[key]["jogador-1"] = text[text.find('.')+2: point]
            if 'Assistência' in text:
                datas[key]['jogador-2'] = text[text.find('Assistência ')+15:-1]
            datas[key]['time'] = text[point+2:text.find(')')]
            descroption = text[text.find(')')+2:text.find('gol.')+3]
            if 'direito' in descroption:
                datas[key]['descricao'] = "pé direito"
            elif 'esquerdo' in descroption:
                datas[key]['descricao'] = "pé esquerdo"
            elif 'cabeça' in descroption:
                datas[key]['descricao'] = "cabeça"
            else:
                datas[key]['descricao'] = text[text.find(')')+2:text.find('gol.')+3]
            datas[key]['tipo'] = 'GOL'
            
        case "Substituição":
            point = text.find('substituindo')
            datas[key]['jogador-1'] = text[text.find('campo')+6:point-1]
            datas[key]['jogador-2'] = text[point+13:text.find('.', point)]
            datas[key]['time'] = text[13:text.find(',')]
            datas[key]['tipo'] = 'SUBSTITUICAO'
            
        # case "Finalização defendida":
        #     pass

        case "lesão":
            point = text.find('(')
            datas[key]['jogador-1'] = text[text.find('lesão de')+9:point-1]
            datas[key]['time'] = text[point+1:text.find(')', point)]
            datas[key]['tipo'] = 'LESAO'

        # case "acréscimo":
        #     pass

        case "Fim do primeiro":
            datas[key]['tipo'] = 'ENCERRAMENTO-1'

        case "Fim do segundo":
            datas[key]['tipo'] = 'ENCERRAMENTO-2'
            
        case _:
            return
    return datas'''

'def get_data_from_comment(index: int, text: str, pattern: str, minute: str) -> None:\n    key = f\'lance-{index}\'\n    datas = {key:{\'jogador-1\':\'\', \'jogador-2\':\'\', \'time\':\'\', \'tipo\':\'\', \'descricao\':\'\', \'minuto\':\'\'}}\n\n    datas[key][\'minuto\'] = minute\n    match(pattern):\n        case "Falta cometida":\n            point = text.find(\'(\') # Dot where is the team name\n            datas[key][\'jogador-1\'] = text[19:point-1]\n            datas[key][\'time\'] = text[point+1:len(text)-2] # removing the parentheses\n            datas[key][\'tipo\'] = \'FALTA-FEITA\'\n\n        case "sofre uma falta":\n            point = text.find(\'(\')\n            datas[key][\'jogador-2\'] = text[:point-1]\n            datas[key][\'time\'] = text[point+1:text.find(\')\')]\n            datas[key][\'tipo\'] = \'FALTA-SOFRIDA\'\n    \n        case "Oportunidade perdida":\n            point = text.find(\'(\')\n            datas[key][\'jogador-1\'] = text[21:point-1]\n        

In [14]:
def get_data_from_comment(index: int, text: str, minute: str) -> None:
    key = index
    datas = {key:{'jogador-1':None, 'jogador-2':None, 'time':None, 'tipo':None, 'descricao':None, 'minuto':''}}

    datas[key]['minuto'] = minute
    if "Falta cometida" in text:
        point = text.find('(') # Dot where is the team name
        datas[key]['jogador-1'] = text[19:point-1]
        datas[key]['time'] = text[point+1:len(text)-2] # removing the parentheses
        datas[key]['tipo'] = 'FALTA-FEITA'

    elif "sofre uma falta" in text:
        point = text.find('(')
        datas[key]['jogador-2'] = text[:point-1]
        datas[key]['time'] = text[point+1:text.find(')')]
        datas[key]['tipo'] = 'FALTA-SOFRIDA'

    elif "Oportunidade perdida" in text:
        point = text.find('(')
        datas[key]['jogador-1'] = text[21:point-1]
        datas[key]['time'] = text[point+1:text.find(')')]
        datas[key]['tipo'] = 'GOL-PERDIDO'
        description = text[text.find(','):text.find('.')]
        if 'cabeça' in description: # Checking the type of shot
            datas[key]['descricao'] = 'CABECEIO'
        elif 'pé direito' in description: 
            datas[key]['descricao'] = 'CHUTE (pé direito)'
        else:
            datas[key]['descricao'] = 'CHUTE (pé esquerdo)'

    elif "Escanteio" in text:
        point = text.find('.')
        datas[key]['time'] = text[11:point]
        datas[key]['jogador-2'] = text[point+13:-1]
        datas[key]['tipo'] = 'ESCANTEIO'

    elif "Impedimento" in text:
        point = text.find('.')
        datas[key]['jogador-1'] = text[point+2:text.find('tentou')-1]
        datas[key]['jogador-2'] = text[text.find('encontrou')+10:text.find(' em posição')]
        datas[key]['time'] = text[13:point]
        datas[key]['tipo'] = 'IMPEDIMENTO'

    elif "cartão" in text:
        point = text.find('(')
        datas[key]["jogador-1"] = text[:point-1]
        datas[key]['time'] = text[point+1:text.find(')')]
        if 'por' in text:
            datas[key]['descricao'] = text[text.find('por', point):-1]
        if 'amarelo' in text:
            datas[key]['tipo'] = 'CARTAO-AMARELO'
        elif 'vermelho' in text:
            datas[key]['tipo'] = 'CARTAO-VERMELHO'

    elif "Gol" in text:
        point = text.find(' (')
        datas[key]["jogador-1"] = text[text.find('.')+2: point]
        if 'Assistência' in text:
            datas[key]['jogador-2'] = text[text.find('Assistência ')+15:-1]
        datas[key]['time'] = text[point+2:text.find(')')]
        descroption = text[text.find(')')+2:text.find('gol.')+3]
        if 'direito' in descroption:
            datas[key]['descricao'] = "pé direito"
        elif 'esquerdo' in descroption:
            datas[key]['descricao'] = "pé esquerdo"
        elif 'cabeça' in descroption:
            datas[key]['descricao'] = "cabeça"
        else:
            datas[key]['descricao'] = text[text.find(')')+2:text.find('gol.')+3]
        datas[key]['tipo'] = 'GOL'

    elif "Substituição" in text:
        point = text.find('substituindo')
        datas[key]['jogador-1'] = text[text.find('campo')+6:point-1]
        datas[key]['jogador-2'] = text[point+13:text.find('.', point)]
        datas[key]['time'] = text[13:text.find(',')]
        datas[key]['tipo'] = 'SUBSTITUICAO'

    elif "lesão" in text:
        point = text.find('(')
        datas[key]['jogador-1'] = text[text.find('lesão de')+9:point-1]
        datas[key]['time'] = text[point+1:text.find(')', point)]
        datas[key]['tipo'] = 'LESAO'

    elif "Fim do primeiro" in text:
        datas[key]['tipo'] = 'ENCERRAMENTO-1'

    elif "Fim do segundo" in text:
        datas[key]['tipo'] = 'ENCERRAMENTO-2'
    
    else:
        return
    
    return datas

In [15]:
# Demo: parse a single yellow-card comment into its structured record
comment = 'Léo Naldi (Vitória) recebe cartão amarelo por uma entrada perigosa.'
print(get_data_from_comment(0,comment,"10'"))

{0: {'jogador-1': 'Léo Naldi', 'jogador-2': None, 'time': 'Vitória', 'tipo': 'CARTAO-AMARELO', 'descricao': 'por uma entrada perigosa', 'minuto': "10'"}}


In [16]:
'''def get_datas_from_comentarios(id: str) -> dict | None:
    

    if type(id) != str:
        sys_message('ID must be a string')
        return None

    url = f'https://www.espn.com.br/futebol/comentario/_/jogoId/{id}'
    page = get_soup(url)

    if page is None:
        return None
    elif not page.find('h1', class_='Error404__Title') is None:
        sys_message('Game not found')
        return None
    
    team1 = {}
    team2 = {}
    
    # Finding team names
    aux = page.find_all('h2', class_='ScoreCell__TeamName ScoreCell__TeamName--displayName db')
    aux = [name.text for name in aux]
    team1['name'], team2['name'] = aux[0], aux[1]
    print(team1['name'], team2['name'])

    # Getting comments
    minutes = page.find_all('div', class_='MatchCommentary__Comment__Timestamp')
    comments = page.find_all('div', class_='MatchCommentary__Comment__GameDetails')
    if len(minutes) != len(comments):
        sys_message('Data inconsistency')
        return None
    
    bids = {} # Dictionary to store the data from comments

    # Getting only relevant comments based on most used words/terms
    pattern_comments = ["Fim do", "Falta cometida", "sofre uma falta", "Oportunidade perdida", 
                        "Escanteio", "Impedimento", "cartão amarelo", "Gol", "Substituição", 
                        "lesão", "cartão vermelho"]

    N = len(pattern_comments)
    for comment, minute in zip(comments, minutes):
        index = len(bids)
        for i in range(N):
            if pattern_comments[i] in comment.text:
                data = get_data_from_comment(index ,comment.text, pattern_comments[i], minute)
                if data is not None:
                    bids.update(data)
                else:
                    sys_message('Error getting the data from comment:')
                    print(comment.text)
                break
        
        #     if team1['name'] in comment.text:
        #         team1['gols'][minutes[comments.index(comment)].text] = comment.text
        #     elif team2['name'] in comment.text:
        #         team2['gols'][minutes[comments.index(comment)].text] = comment.text
        # elif 'cartão amarelo' in comment.text:
        #     print(comment.text)
        # elif 'cartão vermelho' in comment.text:
        #     print(comment.text)
    

print(get_datas_from_comentarios('699647'))'''

'def get_datas_from_comentarios(id: str) -> dict | None:\n    \n\n    if type(id) != str:\n        sys_message(\'ID must be a string\')\n        return None\n\n    url = f\'https://www.espn.com.br/futebol/comentario/_/jogoId/{id}\'\n    page = get_soup(url)\n\n    if page is None:\n        return None\n    elif not page.find(\'h1\', class_=\'Error404__Title\') is None:\n        sys_message(\'Game not found\')\n        return None\n    \n    team1 = {}\n    team2 = {}\n    \n    # Finding team names\n    aux = page.find_all(\'h2\', class_=\'ScoreCell__TeamName ScoreCell__TeamName--displayName db\')\n    aux = [name.text for name in aux]\n    team1[\'name\'], team2[\'name\'] = aux[0], aux[1]\n    print(team1[\'name\'], team2[\'name\'])\n\n    # Getting comments\n    minutes = page.find_all(\'div\', class_=\'MatchCommentary__Comment__Timestamp\')\n    comments = page.find_all(\'div\', class_=\'MatchCommentary__Comment__GameDetails\')\n    if len(minutes) != len(comments):\n        sys_mes

In [17]:
def get_datas_from_comentarios(id: str) -> dict | None:
    '''Get the stats from a game by the page Comentarios'''

    if type(id) != str:
        sys_message('ID must be a string')
        return None

    url = f'https://www.espn.com.br/futebol/comentario/_/jogoId/{id}'
    page = verify_page(url)

    # Getting comments
    minutes = page.find_all('div', class_='MatchCommentary__Comment__Timestamp')
    comments = page.find_all('div', class_='MatchCommentary__Comment__GameDetails')
    if len(minutes) != len(comments):
        sys_message('Data inconsistency')
        return None
    
    bids = {} # Dictionary to store the data from comments

    for comment, minute in zip(comments, minutes):
        index = len(bids)
        data = get_data_from_comment(index, comment.text, minute.text)
        if data is not None:
            bids.update(data)
        # else:
        #     print(comment.text)
        
    return bids

In [18]:
# Demo: one column per parsed play, one row per record field; show the first six rows
result = pd.DataFrame(get_datas_from_comentarios('699647'))
result.head(6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,117,118,119,120,121,122,123,124,125,126
jogador-1,,Jean Carlos,,Danilo Boza,,Gustavo Gómez,Jean Carlos,,Jean Carlos,Ewerthon,...,,,Richard Ríos,Richard Ríos,,,José López,,Ronie Carrillo,
jogador-2,,,Fabinho,,Kaiky Naves,José López,,Raphael Veiga,,,...,Zé Marcos,Mandaca,,,Alan Ruschel,João Lucas,,Gustavo Gómez,,Jádson
time,,Juventude,Palmeiras,Juventude,Juventude,Palmeiras,Juventude,Palmeiras,Juventude,Juventude,...,Palmeiras,Juventude,Palmeiras,Palmeiras,Juventude,Juventude,Palmeiras,Palmeiras,Juventude,Palmeiras
tipo,ENCERRAMENTO-2,FALTA-FEITA,FALTA-SOFRIDA,GOL-PERDIDO,ESCANTEIO,IMPEDIMENTO,CARTAO-AMARELO,FALTA-SOFRIDA,FALTA-FEITA,GOL-PERDIDO,...,ESCANTEIO,FALTA-SOFRIDA,FALTA-FEITA,FALTA-FEITA,FALTA-SOFRIDA,FALTA-SOFRIDA,FALTA-FEITA,FALTA-SOFRIDA,FALTA-FEITA,ESCANTEIO
descricao,,,,CABECEIO,,,por uma entrada perigosa,,,CHUTE (pé direito),...,,,,,,,,,,
minuto,90'+7',90'+7',90'+7',90'+6',90'+6',90'+5',90'+4',90'+4',90'+4',90'+3',...,10',7',7',5',5',5',5',4',4',2'


In [19]:
# def get_lineup(id: str) -> None:
#     '''AnchorLink SoccerLineUpPlayer__Header__Name'''
#     if type(id) != str:
#         sys_message('ID must be a string')
#         return None

#     url = f'https://www.espn.com.br/futebol/escalacoes/_/jogoId/{id}'
#     page = get_soup(url)

#     if page is None:
#         return None
#     elif not page.find('h1', class_='Error404__Title') is None:
#         sys_message('Game not found')
#         return None
    
#     aux = list(page.find_all('div', class_='ResponsiveTable LineUps__PlayersTable'))
#     team1_component = aux[0]
#     team2_component = aux[1]

#     aux = list(team1_component.find_all('a', class_='AnchorLink SoccerLineUpPlayer__Header__Name'))
#     team1_lineUp = [name.text for name in aux]
#     aux = list(team2_component.find_all('a', class_='AnchorLink SoccerLineUpPlayer__Header__Name'))
#     team2_lineUp = [name.text for name in aux]

#     aux = list(page.find_all('div', class_='ResponsiveTable LineUps__SubstitutesTable'))
#     return aux

# result = get_lineup('699647')
# print(len(result))
# print(result)

In [20]:
def get_lineup(id: str) -> dict | None:

    if type(id) != str:
        sys_message('ID must be a string')
        return None

    url = f'https://www.espn.com.br/futebol/escalacoes/_/jogoId/{id}'
    page = verify_page(url)
    
    substitute = 'SoccerLineUpPlayer__Header SoccerLineUpPlayer__Header--subbedIn' # div
    starting_player = 'SoccerLineUpPlayer__Header' # div
    class_names = 'AnchorLink SoccerLineUpPlayer__Header__Name' # a

    # Getting the table with the components of the page
    aux = list(page.find_all('div', class_='ResponsiveTable LineUps__PlayersTable'))
    team1_component = aux[0]
    team2_component = aux[1]

    # Getting the names of the sunstitute players
    team1_substitute, team2_substitute = [], []
    substitutes1 = team1_component.find_all('div', class_=substitute)
    substitutes2 = team2_component.find_all('div', class_=substitute)
    for name1, name2 in zip(substitutes1, substitutes2):
        player1 = name1.find('a', class_=class_names)
        player2 = name2.find('a', class_=class_names)
        team1_substitute.append(player1.text)
        team2_substitute.append(player2.text)
    
    # Getting the names of the starting players
    team1_starting, team2_starting = [], []
    startings1 = team1_component.find_all('div', class_=starting_player)
    startings2 = team2_component.find_all('div', class_=starting_player)
    for name1, name2 in zip(startings1, startings2):
        player1 = name1.find('a', class_=class_names).text
        player2 = name2.find('a', class_=class_names).text
        if player1 not in team1_substitute:
            team1_starting.append(player1)
        if player2 not in team2_substitute:
            team2_starting.append(player2)
    
    # Getting the names of the reserve players
    team1_reserve, team2_reserve = [], []
    aux = page.find_all('div', class_='ResponsiveTable LineUps__SubstitutesTable')
    team1_component = aux[0]
    team2_component = aux[1]

    aux = team1_component.find_all('a', class_=class_names)
    team1_reserve = [name.text for name in aux]
    aux = team2_component.find_all('a', class_=class_names)
    team2_reserve = [name.text for name in aux]

    # Finding team names
    aux = page.find_all('h2', class_='ScoreCell__TeamName ScoreCell__TeamName--displayName db')
    aux = [name.text for name in aux]
    
    team1, team2 = aux[0], aux[1]

    datas = {
        team1: {
            'titulares': team1_starting,
            'substitutos': team1_substitute,
            'reservas': team1_reserve
        },
        team2: {
            'titulares': team2_starting,
            'substitutos': team2_substitute,
            'reservas': team2_reserve
        }
    }
    return datas

In [21]:
# Demo: lineups of game 699647; one column per team, one row per squad group
pd.DataFrame(get_lineup('699647')).head(6)

Unnamed: 0,Juventude,Palmeiras
titulares,"[Claus, Ze Marcos, Danilo Boza, Alan Ruschel, ...","[Weverton, Gustavo Gómez, Vitor Nunes, Caio Pa..."
substitutos,"[Jean, Marcelinho, Gabriel Taliari, Erick, Ewe...","[Kaiky Naves, Mayke, Zé Rafael, Fabinho, Vande..."
reservas,"[Dudu, Lucas Wingert, Davi Goes Silva Ferreira...","[Marcelo Lomba, Lázaro, Rony, Dudu, Romulo, Mi..."


In [22]:
'''Pegar os ids dos times pela tabela: https://www.espn.com.br/futebol/classificacao/_/liga/BRA.1/temporada/2024'''
'''Pegar as informações dos jogadores: https://www.espn.com.br/futebol/time/elenco/_/id/819/liga/BRA.1/temporada/2024'''


'Pegar as informações dos jogadores: https://www.espn.com.br/futebol/time/elenco/_/id/819/liga/BRA.1/temporada/2024'

In [23]:
def get_teams_id(temporada: int = 2024) -> dict | None:
    '''Get the IDs of the teams from the table'''

    url = f'https://www.espn.com.br/futebol/classificacao/_/liga/BRA.1/temporada/{temporada}'
    page = verify_page(url)

    aux = page.find_all('span', class_='hide-mobile')
    id = {}
    for element in aux:
        id[element.text] = element.find('a')['href'].replace('/futebol/time/_/id/','')

    return id

In [24]:
# Demo: team name -> identifier mapping for the default 2024 season
print(get_teams_id())

{'Botafogo': '6086/botafogo', 'Palmeiras': '2029/palmeiras', 'Fortaleza': '6272/fortaleza', 'Flamengo': '819/flamengo', 'Internacional': '1936/internacional', 'São Paulo': '2026/sao-paulo', 'Bahia': '9967/bahia', 'Cruzeiro': '2022/cruzeiro', 'Atlético-MG': '7632/atletico-mg', 'Vasco da Gama': '3454/vasco-da-gama', 'Grêmio': '6273/gremio', 'Criciúma': '9971/criciuma', 'Fluminense': '3445/fluminense', 'Vitória': '3457/vitoria', 'Athletico-PR': '3458/athletico-pr', 'Red Bull Bragantino': '6079/red-bull-bragantino', 'Juventude': '6270/juventude', 'Corinthians': '874/corinthians', 'Cuiabá': '17313/cuiaba', 'Atlético-GO': '10357/atletico-go'}


In [37]:
def get_cast(id: str, temporada: int = 2024) -> dict | None:
    '''Get the cast of the team'''

    url = f'https://www.espn.com.br/futebol/time/elenco/_/id/{id}/liga/BRA.1/temporada/{temporada}'
    page = verify_page(url)

    if page is None:
        return None
    elif not page.find('h1', class_='Error404__Title') is None:
        sys_message('Team not found')
        return None

    line_class = 'Table__TR Table__TR--sm Table__even' # tr
    column_class = 'Table__TD' # td
    class_names = 'AnchorLink' # a

    tags = {'A': 'ATACANTE', 'G': 'GOLEIRO', 'D': 'DEFENSOR', 'M':'MEIO-CAMPO'}
    
    players = {}
    table = page.find_all('tr', class_=line_class)
    for line in table:
        player = line.find('a', class_=class_names).text
        columns = line.find_all('td', class_=column_class)
        players[player] = {
            'posicao': tags[columns[1].text] if columns[1].text in tags else np.nan,
            'idade': columns[2].text if columns[2].text != '--' else np.nan,
            'altura': columns[3].text if columns[3].text != '--' else np.nan,
            'nascionalidade': columns[5].text if columns[5].text != '--' else np.nan,
        }

    return players

In [38]:
# Demo: squad of team 874 (Corinthians in the get_teams_id output); one column per player
pd.DataFrame(get_cast('874')).head()

Unnamed: 0,Felipe Longo,Hugo Souza,Matheus Donelli,Cadu,Matheus Corrêa,Mateuzinho,Félix Torres,Caetano,André Ramalho,Diego Palacios,...,Pedro Henrique,Giovane,Talles Magno,Pedro Raúl,Kayke Ferrari,Guilherme Inácio,Guilherme Henrique De Oliveira Morais,Luiz Fernando,Kauã,Memphis Depay
posicao,GOLEIRO,GOLEIRO,GOLEIRO,GOLEIRO,GOLEIRO,DEFENSOR,DEFENSOR,DEFENSOR,DEFENSOR,DEFENSOR,...,ATACANTE,ATACANTE,ATACANTE,ATACANTE,ATACANTE,ATACANTE,ATACANTE,ATACANTE,ATACANTE,ATACANTE
idade,19,25,22,19,18,24,27,25,32,25,...,34,20,22,27,20,17,17,17,17,30
altura,1.88 m,1.98 m,1.88 m,1.93 m,1.88 m,1.75 m,1.88 m,1.83 m,1.83 m,1.7 m,...,1.8 m,1.83 m,1.85 m,1.93 m,,,1.73 m,,,1.78 m
nascionalidade,Brasil,Brasil,Brasil,Brasil,Brasil,Brasil,Equador,Brasil,Brasil,Equador,...,Brasil,Brasil,Brasil,Brasil,Brasil,Brasil,Brasil,Brasil,Brasil,Holanda
