In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [7]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace (spaces, tabs, newlines) is removed as
    part of the same operation.
    """
    # A no-argument split() already discards leading/trailing whitespace and
    # treats consecutive whitespace characters as one separator.
    words = text.split()
    return ' '.join(words)

In [8]:
def overs_to_balls(overs_str):
    """Convert a cricket overs figure such as ``'3.4'`` into a number of balls.

    The digit after the point is a count of balls, not a decimal fraction:
    ``'3.4'`` means 3 overs and 4 balls, i.e. ``3 * 6 + 4 == 22``.

    Args:
        overs_str: The overs figure as text (``'4'``, ``'3.4'``). Surrounding
            whitespace is ignored. ``None``, an empty string and the ``'-'``
            placeholder are treated as zero overs. Non-string values are
            converted with ``str()`` first.

    Returns:
        int: Total balls represented by the figure.

    Raises:
        ValueError: If the text is not made of digits around at most one
            point (for example ``'abc'`` or ``'1.2.3'``).
    """
    if overs_str is None:
        return 0

    figure = str(overs_str).strip()
    # Scorecards can leave the cell blank or show a dash; count that as zero
    # rather than failing on int('').
    if figure in ('', '-'):
        return 0

    whole_overs, _, extra_balls = figure.partition('.')
    # "or 0" tolerates a missing side of the point, e.g. '4.' or '.3'.
    return int(whole_overs or 0) * 6 + int(extra_balls or 0)

In [None]:
def _new_player_record(player_name, team_name, batted='No', bowled='No'):
    """Return a fresh per-player stats record with every counter at zero."""
    return {
        'Playername': player_name,
        'Team': team_name,
        'Batted': batted,
        'Bowled': bowled,
        'TotalRuns_bat': 0,
        'Balls_bats': 0,
        'Outs_bat': 0,
        'FiftyPlusRun_bat': 0,
        'Century_bat': 0,
        'Catches': 0,
        'Stumping': 0,
        'Fours_Bat': 0,
        'Sixes_Bat': 0,
        'StrikeRate_Bat': 0.0,
        'LegalBallsBowled_bowl': 0,
        'TotalRunsConceded_bowl': 0,
        'DotBallsBowled_bowl': 0,
        'ScoringBallsBowled': 0,
        'EconomyRate': 0.0,
        'OversBowled': 0.0,
        'NobollsBowled': 0,
        'Wides_bowl': 0,
        'InningsWickets': 0,
        'Maidens_bowl': 0,
        'FourWickets': 0,
        'FiveWickets': 0,
        'Fours_Bowl': 0,
        'Sixes_Bowl': 0,
        'Innings_Bowl': 0,
    }


def _to_int(cell_text):
    """Parse a cell holding a non-negative integer; anything else counts as 0."""
    cell_text = cell_text.strip()
    return int(cell_text) if cell_text.isdigit() else 0


def _to_float(cell_text):
    """Parse a cell holding a non-negative decimal; anything else counts as 0.0."""
    cell_text = cell_text.strip()
    # Drop at most one point so text like '1.2.3' is rejected here instead of
    # reaching float() and raising.
    return float(cell_text) if cell_text.replace('.', '', 1).isdigit() else 0.0


def _player_rows(table, min_cols):
    """Yield ``(player_name, cells)`` for each row of ``table`` that has at
    least ``min_cols`` ``<td>`` cells and a link in its first cell."""
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < min_cols:
            continue
        player_link = cells[0].find('a')
        if player_link is None:
            continue
        yield clean_text(player_link.text), cells


def _fielder_from_dismissal(dismissal, marker):
    """Return the name between ``marker`` ('c ' or 'st ') and ' b ' in a
    dismissal string, or ``None`` when the string lacks either part.

    Any parenthesised suffix on the name is dropped.
    """
    if marker not in dismissal or 'b ' not in dismissal:
        return None
    fielder = dismissal.split(marker)[1].split(' b ')[0].strip()
    return fielder.split('(')[0].strip()


def scrape_cricinfo_scorecard(match_url, timeout=30):
    """Fetch a scorecard page and build one stats record per player.

    Args:
        match_url: URL of the full-scorecard page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[dict]: One record per player found, with the keys produced by
        ``_new_player_record``. Batting figures come from the first two
        batting tables, bowling figures from the first two bowling tables,
        and catches/stumpings from the dismissal text of every batting row.

    Raises:
        Exception: If the response status is not 200.

    NOTE(review): the CSS class strings and the cell positions used below are
    assumptions about the page markup -- confirm against a live page.
    NOTE(review): fielders are matched by exact name equality with
    ``Playername``; if dismissal text abbreviates names, no credit is given.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(match_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the page: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')

    scorecard_tables = soup.find_all('div', class_='ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table')
    bowling_tables = soup.find_all('div', class_='ds-w-full ds-table ds-table-xs')

    # Placeholder names, replaced when the page title reads "<A> vs <B>, ...".
    teams = ['Team 1', 'Team 2']
    match_title = soup.find('h1', class_='ds-text-title-l ds-font-bold ds-mb-2')
    if match_title is not None:
        team_names = match_title.text.split(' vs ')
        if len(team_names) >= 2:
            teams = [team_names[0].strip(), team_names[1].split(',')[0].strip()]

    all_players = []

    # --- Batting: first table is taken as team 1's innings, second as team 2's.
    # zip() stops after two tables, matching the two-innings limit applied to
    # the bowling tables and avoiding an IndexError on teams[] for extra tables.
    for table, team_name in zip(scorecard_tables, teams):
        batted_players = set()

        # Seven cells are read per row (indices 0-6); shorter rows are skipped.
        for player_name, cols in _player_rows(table, min_cols=7):
            batted_players.add(player_name)
            record = _new_player_record(player_name, team_name, batted='Yes')

            runs_match = re.search(r'\d+', cols[2].text)
            record['TotalRuns_bat'] = int(runs_match.group()) if runs_match else 0
            record['Balls_bats'] = _to_int(cols[3].text)
            record['Fours_Bat'] = _to_int(cols[4].text)
            record['Sixes_Bat'] = _to_int(cols[5].text)
            record['StrikeRate_Bat'] = _to_float(cols[6].text)

            dismissal = cols[1].text.strip()
            record['Outs_bat'] = 0 if 'not out' in dismissal.lower() else 1

            # A century also counts as a fifty-plus score.
            if record['TotalRuns_bat'] >= 50:
                record['FiftyPlusRun_bat'] = 1
            if record['TotalRuns_bat'] >= 100:
                record['Century_bat'] = 1

            all_players.append(record)

        # Players listed after the table who did not appear in a batting row.
        dnb_section = table.find_next('div', class_='ds-text-tight-s ds-font-regular ds-mb-1 ds-py-1')
        if dnb_section is not None:
            for link in dnb_section.find_all('a'):
                player_name = clean_text(link.text)
                if player_name not in batted_players:
                    all_players.append(_new_player_record(player_name, team_name))

    # --- Bowling: the side batting second bowls in the first innings, so the
    # first bowling table belongs to teams[1] and the second to teams[0].
    for table, team_index in zip(bowling_tables, (1, 0)):
        team_name = teams[team_index]

        # Eight cells are read per row (indices 0-7); shorter rows are skipped.
        for player_name, cols in _player_rows(table, min_cols=8):
            record = next(
                (p for p in all_players
                 if p['Playername'] == player_name and p['Team'] == team_name),
                None,
            )
            if record is None:
                record = _new_player_record(player_name, team_name)
                all_players.append(record)
            record['Bowled'] = 'Yes'
            record['Innings_Bowl'] = 1

            overs = cols[1].text.strip()
            record['OversBowled'] = _to_float(overs)
            record['Maidens_bowl'] = _to_int(cols[2].text)
            record['TotalRunsConceded_bowl'] = _to_int(cols[3].text)
            record['InningsWickets'] = _to_int(cols[4].text)
            record['Wides_bowl'] = _to_int(cols[5].text)
            record['NobollsBowled'] = _to_int(cols[6].text)
            record['EconomyRate'] = _to_float(cols[7].text)

            # Overs count only legal deliveries (wides and no-balls are
            # re-bowled), so the overs figure converts directly to legal balls.
            # OversBowled is 0.0 when the cell was unparseable, in which case
            # overs_to_balls is not called at all.
            record['LegalBallsBowled_bowl'] = overs_to_balls(overs) if record['OversBowled'] else 0

            # A five-wicket haul also counts as a four-wicket haul.
            if record['InningsWickets'] >= 4:
                record['FourWickets'] = 1
            if record['InningsWickets'] >= 5:
                record['FiveWickets'] = 1

    # --- Fielding: done last so bowlers who never batted already have a record.
    for table in scorecard_tables:
        for _batsman, cols in _player_rows(table, min_cols=2):
            dismissal = cols[1].text.strip()
            for marker, stat in (('c ', 'Catches'), ('st ', 'Stumping')):
                fielder = _fielder_from_dismissal(dismissal, marker)
                if fielder is None:
                    continue
                for player in all_players:
                    if player['Playername'] == fielder:
                        player[stat] += 1

    return all_players

In [13]:
def main(
    match_url='https://www.espncricinfo.com/series/ipl-2024-1410320/kolkata-knight-riders-vs-royal-challengers-bengaluru-36th-match-1426272/full-scorecard',
    output_csv='kkr_vs_rcb_players.csv',
):
    """Scrape one match scorecard and save the per-player stats to a CSV file.

    Args:
        match_url: Scorecard page passed to ``scrape_cricinfo_scorecard``.
        output_csv: Path of the CSV file to write.

    Any exception raised while scraping or saving is reported with ``print``
    rather than propagated. Nothing is written when the scraper returns no
    player records.
    """
    # Column order of the saved file.
    columns = [
        'Playername', 'Team', 'Batted', 'Bowled', 'TotalRuns_bat', 'Balls_bats', 'Outs_bat',
        'FiftyPlusRun_bat', 'Century_bat', 'Catches', 'Stumping', 'Fours_Bat', 'Sixes_Bat',
        'StrikeRate_Bat', 'LegalBallsBowled_bowl', 'TotalRunsConceded_bowl', 'DotBallsBowled_bowl',
        'ScoringBallsBowled', 'EconomyRate', 'OversBowled', 'NobollsBowled', 'Wides_bowl',
        'InningsWickets', 'Maidens_bowl', 'FourWickets', 'FiveWickets', 'Fours_Bowl', 'Sixes_Bowl',
        'Innings_Bowl'
    ]

    try:
        players_data = scrape_cricinfo_scorecard(match_url)

        # An empty or missing result would otherwise fail on the column
        # selection below with an unhelpful KeyError.
        if not players_data:
            print("No player data was extracted; nothing was saved.")
            return

        df = pd.DataFrame(players_data)[columns]
        df.to_csv(output_csv, index=False)
        print(f"Data successfully saved to '{output_csv}'")

    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()

An error occurred: Failed to fetch the page: 403
