In [225]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
import requests
from itertools import chain
from tqdm import tqdm_notebook as tqdm
import re
from datetime import datetime
from collections import defaultdict
import json
from fuzzywuzzy import fuzz

In [137]:
def parse_bat_table(tbl):
    """Parse one innings' batting scorecard table into a cleaned DataFrame.

    Parameters
    ----------
    tbl : bs4 Tag
        A batting ``<table>`` element. Its ``<th>`` texts become the column
        names; one of them must be the non-breaking-space header (``'\\xa0'``)
        that holds the dismissal text, and is assumed to be the second column
        (the placeholder rows below rely on that position).

    Returns
    -------
    pandas.DataFrame
        One row per batter (rows whose first cell carries the
        ``batsman-cell`` class) plus one placeholder row per player listed
        after "Did not bat:" in the table footer. All scorecard columns are
        passed through ``clean_column``; two columns are appended:
        ``is_not_out`` (bool) and ``fielding_description`` (raw dismissal text).
    """
    columns = [th.text for th in tbl.find_all('th')]

    data = []
    for row in tbl.find('tbody').find_all('tr'):
        cols = row.find_all('td')
        if not cols:
            # Rows without any <td> have no first cell to inspect.
            continue
        classes = cols[0].get("class")
        if classes is not None and "batsman-cell" in classes:
            data.append([col.text for col in cols])

    # Players who did not bat are listed in the footer. The row is located by
    # its marker text because it is absent when the whole side batted.
    marker = 'Did not bat:'
    tfoot = tbl.find('tfoot')
    footer_rows = tfoot.find_all('tr') if tfoot is not None else []
    for footer_row in footer_rows:
        footer_text = footer_row.text
        if marker not in footer_text:
            continue
        for name in footer_text.split(marker)[1].split(','):
            name = name.strip()
            if name:
                # Same shape as a batter row: name, dismissal text, then
                # empty stats that clean_column later turns into '0'.
                data.append([name, 'not out'] + [None] * (len(columns) - 2))
        break

    # After parsing into a dataframe, clean and return it.
    df = pd.DataFrame(data=data, columns=columns)
    # The dismissal text is kept raw: cleaning would strip the symbols that
    # the fielding parser relies on.
    fielding_description = df['\xa0']
    df = df.drop(columns='\xa0')
    df = df.apply(clean_column, axis=0)
    df['is_not_out'] = fielding_description.apply(
        lambda text: bool(pd.notna(text)) and 'not out' in text
    )
    df["fielding_description"] = fielding_description
    return df

In [149]:
def getBallsFaced(overs):
    """Convert an overs string such as ``'3.2'`` into a number of balls.

    The part before the dot counts six balls each; the part after the dot is
    added as extra balls. A string without a dot is treated as whole overs.
    """
    if '.' not in overs:
        return int(overs) * 6
    whole_overs, extra_balls = overs.split('.')
    return int(whole_overs) * 6 + int(extra_balls)

In [157]:
def parse_bowl_table(tbl, player_list):
    """Parse a bowling scorecard table and pad it with players who did not bowl.

    Parameters
    ----------
    tbl : bs4 Tag
        A bowling ``<table>`` element. Its ``<th>`` texts become the column
        names; they must include ``'BOWLING'`` (assumed to be the first
        column) and ``'O'`` (overs).
    player_list : iterable of str
        Names of every player of the bowling side. Anyone not found in the
        table gets a placeholder row.

    Returns
    -------
    pandas.DataFrame
        One row per name, every column passed through ``clean_column``, plus
        a ``ballsBowled`` column derived from ``'O'`` via ``getBallsFaced``.
    """
    columns = [th.text for th in tbl.find_all('th')]

    data = []
    for row in tbl.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        if cells:  # rows without <td> cells carry no figures
            data.append([cell.text for cell in cells])

    bowl_df = pd.DataFrame(data=data, columns=columns).apply(clean_column, axis=0)

    # Compare against the *cleaned* bowler names; comparing against the raw
    # cell text could miss a match and give a bowler a second, empty row.
    bowled = set(bowl_df['BOWLING'])
    # dict.fromkeys de-duplicates while keeping the caller's order.
    missing = [player for player in dict.fromkeys(player_list) if player not in bowled]

    if missing:
        filler_rows = [[player] + [None] * (len(columns) - 1) for player in missing]
        # Cleaned on its own so the frames being concatenated hold '0'
        # strings rather than all-None columns.
        filler = pd.DataFrame(data=filler_rows, columns=columns).apply(clean_column, axis=0)
        df = pd.concat([bowl_df, filler], ignore_index=True)
    else:
        df = bowl_df

    df["ballsBowled"] = df["O"].apply(getBallsFaced)
    return df

In [151]:
def clean_column(col):
    """Normalise the string cells of a scorecard column.

    For every non-missing value: keep the text before the first ``'('``,
    remove every character other than ASCII letters, digits, ``'.'`` and
    space, then trim surrounding whitespace. Missing values become ``'0'``.

    Trimming happens after the symbol removal so that a value such as
    ``'Name †'`` does not end up with a trailing space; this also makes the
    function idempotent.

    Parameters
    ----------
    col : pandas.Series
        Column of strings and/or missing values.

    Returns
    -------
    pandas.Series
        Cleaned column with no missing values.
    """
    def _clean(value):
        if pd.isna(value):
            return value
        text = value.split('(')[0]
        text = re.sub(r'[^A-Za-z0-9. ]+', '', text)
        return text.strip()

    return col.apply(_clean).fillna('0')

In [227]:
def search_player(queryName, threshold, player_list):
    """Return the entry of ``player_list`` that best matches ``queryName``.

    Every candidate is scored with ``fuzz.partial_ratio``. The first
    candidate with the highest score is returned, unless that score is below
    ``threshold`` or there are no candidates at all, in which case the
    string ``"NA"`` is returned.

    Parameters
    ----------
    queryName : str
        Name (possibly partial) to look up.
    threshold : int or float
        Minimum score the best candidate must reach.
    player_list : sequence of str
        Candidate names.

    Returns
    -------
    str
        The best-matching candidate, or ``"NA"``.
    """
    players = list(player_list)
    if not players:
        # max() of an empty sequence would raise; no candidates means no match.
        return "NA"
    ratios = [fuzz.partial_ratio(player, queryName) for player in players]
    # max() over indices returns the first index holding the top score.
    best_index = max(range(len(ratios)), key=ratios.__getitem__)
    if ratios[best_index] < threshold:
        return "NA"
    return players[best_index]

In [218]:
def scrape_match_header(soup):
    """Extract match metadata from the ``div.match-header`` element of a page.

    Parameters
    ----------
    soup : BeautifulSoup (or any object with compatible ``find``/``find_all``)
        Parsed scorecard page.

    Returns
    -------
    dict
        ``mom``       - text of ``div.best-player-name`` with symbols removed,
                        or ``None`` when that element is absent.
        ``matchId``   - digits of the first comma-separated part of
                        ``div.description``; ``None`` if it has fewer than
                        three parts.
        ``matchName`` - ``"<team> vs <team>"`` when exactly two ``div.team``
                        elements with a ``<p>`` are present, else ``''``.
        ``matchDate`` - third description part re-formatted from
                        ``'%b %d %Y'`` to ``'%Y-%m-%d'``; ``None`` when it is
                        missing or not in that format.

    Raises
    ------
    ValueError
        If the page has no ``div.match-header`` element.
    """
    header = soup.find('div', class_='match-header')
    if header is None:
        raise ValueError("match header (div.match-header) not found in page")

    # The best-player element may be absent, so treat it as optional.
    mom_node = header.find('div', class_='best-player-name')
    mom = None
    if mom_node is not None:
        mom = re.sub('[^A-Za-z0-9. ]+', '', mom_node.text).strip()

    desc_node = header.find('div', class_='description')
    desc_splits = desc_node.text.split(',') if desc_node is not None else []

    matchName = ''
    teams = header.find_all('div', class_='team')
    if len(teams) == 2:
        name_nodes = [team.find('p') for team in teams]
        if all(node is not None for node in name_nodes):
            matchName = ' vs '.join(node.text for node in name_nodes)

    matchId = None
    matchDate = None
    if len(desc_splits) >= 3:
        matchId = re.sub('[^0-9]', '', desc_splits[0])
        try:
            parsed = datetime.strptime(desc_splits[2].strip(), '%b %d %Y')
            matchDate = parsed.strftime('%Y-%m-%d')
        except ValueError:
            # Same fallback as a description that is too short to hold a date.
            matchDate = None

    return {
        "mom": mom,
        "matchId": matchId,
        "matchName": matchName,
        "matchDate": matchDate
    }

In [254]:
def parse_fielding_stats(stat_description, player_list, threshold=90):
    """Count catches, stumpings and run-outs per fielder from dismissal texts.

    Each dismissal text is classified by its leading keyword:

    * ``c & b <bowler>``        - one catch for the bowler
    * ``c <fielder> b ...``     - one catch for the fielder
    * ``st <keeper> b ...``     - one stumping for the keeper
    * ``... run out <a>/<b>``   - one run-out for every name after "run out"

    The extracted name is stripped of non-letter characters and resolved to a
    roster entry with ``search_player``; names it cannot resolve are counted
    under the key ``"NA"``. Texts that are not strings, and dismissals that
    name nobody, are ignored.

    Parameters
    ----------
    stat_description : pandas.Series
        Dismissal texts (only ``.values`` is used).
    player_list : list of str
        Roster passed to ``search_player``.
    threshold : int, optional
        Minimum fuzzy-match score passed to ``search_player`` (default 90).

    Returns
    -------
    tuple of (defaultdict, defaultdict, defaultdict)
        ``(catches, stumpings, run_outs)``, each mapping name -> count.
    """
    catches_dict = defaultdict(int)
    stumping_dict = defaultdict(int)
    runout_dict = defaultdict(int)

    def _credit(tally, raw_name):
        # Remove keeper/captain symbols and brackets before matching.
        name = re.sub('[^A-Za-z ]+', '', raw_name).strip()
        if name:
            tally[search_player(name, threshold, player_list)] += 1

    for stat in stat_description.values:
        if not isinstance(stat, str):
            # Missing descriptions (None/NaN) describe no dismissal.
            continue
        stat = stat.strip()

        # Patterns are anchored at the start of the text so that a 'c ' or
        # 'st ' occurring inside a player's name is not taken for a keyword.
        caught_and_bowled = re.match(r'c\s*&\s*b\s+(.+)', stat)
        if caught_and_bowled:
            # Checked first and exclusively: 'c & b' also begins with 'c '.
            _credit(catches_dict, caught_and_bowled.group(1))
            continue

        caught = re.match(r'c\s+(.+?)\s+b\s+\S', stat)
        if caught:
            _credit(catches_dict, caught.group(1))
            continue

        stumped = re.match(r'st\s+(.+?)\s+b\s+\S', stat)
        if stumped:
            _credit(stumping_dict, stumped.group(1))
            continue

        if 'run out' in stat:
            # Several fielders may be listed, separated by '/'.
            for raw_name in stat.split('run out', 1)[1].split('/'):
                _credit(runout_dict, raw_name)

    return catches_dict, stumping_dict, runout_dict

In [213]:
def format_stats(df, header, ct_d, st_d, ro_d):
    """Assemble the per-match result dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per player with the columns ``player_name``, ``R_bat``, ``B``,
        ``4s_bat``, ``6s_bat``, ``W``, ``ballsBowled``, ``0s``, ``R_bowl`` and
        ``M_bowl``; the numeric ones must be convertible with ``int``.
    header : dict
        Must provide ``matchId``, ``matchName``, ``matchDate`` and ``mom``.
    ct_d, st_d, ro_d : mapping
        Player name -> number of catches / stumpings / run-outs.

    Returns
    -------
    dict
        The match fields copied from ``header`` plus ``playerStatList``, a
        list with one batting/bowling/fielding summary per row of ``df``.
    """
    result = {
        "matchId": header["matchId"],
        "matchName": header["matchName"],
        "matchDate": header["matchDate"],
    }

    players = []
    for _, record in df.iterrows():
        # Round-trip through JSON to obtain plain Python values for the row.
        fields = json.loads(record.to_json())
        name = fields["player_name"]
        players.append({
            "playerName": name,
            "battingStats": {
                "runs": int(fields["R_bat"]),
                "ballsFaced": int(fields["B"]),
                "noOfFours": int(fields["4s_bat"]),
                "noOfSixes": int(fields["6s_bat"]),
            },
            "bowlingStats": {
                "noOfWickets": int(fields["W"]),
                "ballsBowled": int(fields["ballsBowled"]),
                "dots": int(fields["0s"]),
                "runsConceded": int(fields["R_bowl"]),
                "maidens": int(fields["M_bowl"]),
            },
            "fieldingStat": {
                "catches": int(ct_d.get(name, 0)),
                "stumpings": int(st_d.get(name, 0)),
                "runOuts": int(ro_d.get(name, 0)),
            },
            # Man of the match is decided by exact name equality.
            "isMOM": bool(header["mom"] == name),
        })

    result["playerStatList"] = players
    return result
        
        
        

In [253]:
def _merge_team_stats(bat_df, bowl_df):
    """Join one side's batting and bowling frames into one row per player."""
    return (
        bat_df
        .merge(bowl_df, left_on='BATTING', right_on='BOWLING', how='inner',
               suffixes=('_bat', '_bowl'))
        .drop(columns='BOWLING')
        .rename(columns={'BATTING': 'player_name'})
    )


def scrape_score_table(url, timeout=30):
    """Download a full-scorecard page and return the match statistics.

    Parameters
    ----------
    url : str
        Address of the scorecard page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The structure produced by ``format_stats``: match id, name and date
        plus one batting/bowling/fielding summary per player.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-success HTTP status.
    ValueError
        If the page does not contain two batting and two bowling tables.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    page = requests.get(url, timeout=timeout)
    # Stop here on an error page instead of failing later while parsing it.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    headers = scrape_match_header(soup)

    batsman_tables = soup.find_all('table', class_='batsman')
    bowling_tables = soup.find_all('table', class_='bowler')
    if len(batsman_tables) < 2 or len(bowling_tables) < 2:
        raise ValueError(
            "expected two batting and two bowling tables, found "
            f"{len(batsman_tables)} and {len(bowling_tables)}"
        )

    team1_bat = parse_bat_table(batsman_tables[0])
    team2_bat = parse_bat_table(batsman_tables[1])
    # A side's bowling figures sit in the other innings' bowling table.
    team1_bowl = parse_bowl_table(bowling_tables[1], team1_bat['BATTING'])
    team2_bowl = parse_bowl_table(bowling_tables[0], team2_bat['BATTING'])

    concated_df = pd.concat(
        [_merge_team_stats(team1_bat, team1_bowl), _merge_team_stats(team2_bat, team2_bowl)],
        ignore_index=True,
    )
    catches_dict, stumping_dict, runout_dict = parse_fielding_stats(
        concated_df['fielding_description'], list(concated_df["player_name"])
    )
    return format_stats(concated_df, headers, catches_dict, stumping_dict, runout_dict)

In [237]:
# Full-scorecard pages of four IPL 2021 matches used as sample inputs.
URL1 = (
    'https://www.espncricinfo.com/series/ipl-2021-1249214/'
    'punjab-kings-vs-delhi-capitals-29th-match-1254086/full-scorecard'
)
URL2 = (
    'https://www.espncricinfo.com/series/ipl-2021-1249214/'
    'rajasthan-royals-vs-sunrisers-hyderabad-28th-match-1254085/full-scorecard'
)
URL3 = (
    'https://www.espncricinfo.com/series/ipl-2021-1249214/'
    'punjab-kings-vs-royal-challengers-bangalore-26th-match-1254083/full-scorecard'
)
URL4 = (
    'https://www.espncricinfo.com/series/ipl-2021-1249214/'
    'mumbai-indians-vs-rajasthan-royals-24th-match-1254081/full-scorecard'
)

In [255]:
# Scrape the scorecard at URL1 (Punjab Kings vs Delhi Capitals, 29th match);
# the returned dict is displayed as the cell output.
scrape_score_table(URL1)

{'matchId': '29',
 'matchName': 'Punjab Kings vs Delhi Capitals',
 'matchDate': '2021-05-02',
 'playerStatList': [{'playerName': 'Prabhsimran Singh',
   'battingStats': {'runs': 12,
    'ballsFaced': 16,
    'noOfFours': 0,
    'noOfSixes': 1},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 0, 'stumpings': 0, 'runOuts': 0},
   'isMOM': False},
  {'playerName': 'Mayank Agarwal',
   'battingStats': {'runs': 99,
    'ballsFaced': 58,
    'noOfFours': 8,
    'noOfSixes': 4},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 1, 'stumpings': 0, 'runOuts': 0},
   'isMOM': True},
  {'playerName': 'Chris Gayle',
   'battingStats': {'runs': 13,
    'ballsFaced': 9,
    'noOfFours': 1,
    'noOfSixes': 1},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0

In [256]:
# Scrape the scorecard at URL2 (Rajasthan Royals vs Sunrisers Hyderabad, 28th match);
# the returned dict is displayed as the cell output.
scrape_score_table(URL2)

{'matchId': '28',
 'matchName': 'Rajasthan Royals vs Sunrisers Hyderabad',
 'matchDate': '2021-05-02',
 'playerStatList': [{'playerName': 'Jos Buttler',
   'battingStats': {'runs': 124,
    'ballsFaced': 64,
    'noOfFours': 11,
    'noOfSixes': 8},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 0, 'stumpings': 0, 'runOuts': 0},
   'isMOM': True},
  {'playerName': 'Yashasvi Jaiswal',
   'battingStats': {'runs': 12,
    'ballsFaced': 13,
    'noOfFours': 2,
    'noOfSixes': 0},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 0, 'stumpings': 0, 'runOuts': 0},
   'isMOM': False},
  {'playerName': 'Sanju Samson',
   'battingStats': {'runs': 48,
    'ballsFaced': 33,
    'noOfFours': 4,
    'noOfSixes': 2},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsCon

In [257]:
# Scrape the scorecard at URL3 (Punjab Kings vs Royal Challengers Bangalore, 26th match);
# the returned dict is displayed as the cell output.
scrape_score_table(URL3)

{'matchId': '26',
 'matchName': 'Punjab Kings vs Royal Challengers Bangalore',
 'matchDate': '2021-04-30',
 'playerStatList': [{'playerName': 'KL Rahul',
   'battingStats': {'runs': 91,
    'ballsFaced': 57,
    'noOfFours': 7,
    'noOfSixes': 5},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 1, 'stumpings': 0, 'runOuts': 0},
   'isMOM': False},
  {'playerName': 'Prabhsimran Singh',
   'battingStats': {'runs': 7,
    'ballsFaced': 7,
    'noOfFours': 1,
    'noOfSixes': 0},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 0, 'stumpings': 0, 'runOuts': 0},
   'isMOM': False},
  {'playerName': 'Chris Gayle',
   'battingStats': {'runs': 46,
    'ballsFaced': 24,
    'noOfFours': 6,
    'noOfSixes': 2},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConce

In [258]:
# Scrape the scorecard at URL4 (Mumbai Indians vs Rajasthan Royals, 24th match);
# the returned dict is displayed as the cell output.
scrape_score_table(URL4)

{'matchId': '24',
 'matchName': 'Rajasthan Royals vs Mumbai Indians',
 'matchDate': '2021-04-29',
 'playerStatList': [{'playerName': 'Jos Buttler',
   'battingStats': {'runs': 41,
    'ballsFaced': 32,
    'noOfFours': 3,
    'noOfSixes': 3},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 1, 'stumpings': 0, 'runOuts': 0},
   'isMOM': False},
  {'playerName': 'Yashasvi Jaiswal',
   'battingStats': {'runs': 32,
    'ballsFaced': 20,
    'noOfFours': 2,
    'noOfSixes': 2},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded': 0,
    'maidens': 0},
   'fieldingStat': {'catches': 0, 'stumpings': 0, 'runOuts': 0},
   'isMOM': False},
  {'playerName': 'Sanju Samson',
   'battingStats': {'runs': 42,
    'ballsFaced': 27,
    'noOfFours': 5,
    'noOfSixes': 0},
   'bowlingStats': {'noOfWickets': 0,
    'ballsBowled': 0,
    'dots': 0,
    'runsConceded'