In [225]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
import requests
from itertools import chain
from tqdm import tqdm_notebook as tqdm
import re
from datetime import datetime
from collections import defaultdict
import json
from fuzzywuzzy import fuzz

In [137]:
def parse_bat_table(tbl):
    columns = tbl.find_all('th')
    columns = [col.text for col in columns]
    rows = tbl.find('tbody').find_all('tr')
    data = []
    for row in rows:
        cols = row.find_all('td')
        classes = cols[0].get("class")
        if classes is not None and "batsman-cell" in classes:
            data.append([col.text for col in cols])
    # Append tfoot and check for did not bat.
    did_not_bat = tbl.find('tfoot').find_all('tr')[1].text.split('Did not bat:')[1].strip('').split(',')
    [data.append([bat, 'not out', None, None, None, None, None, None])for bat in did_not_bat]
    
    # After parsing into a dataframe, clean and return it.
    df =  pd.DataFrame(data=data, columns=columns)
    fielding_description = df['\xa0']
    del df['\xa0']
    df = df.apply(lambda x: clean_column(x), axis=0)
    df['is_not_out'] = fielding_description.apply(lambda x:  True if pd.isna(x)!=True and 'not out' in x else False)
    df["fielding_description"] = fielding_description
    return df

In [149]:
def getBallsFaced(overs):
    numBalls = 0
    if '.' in overs:
        overs, balls = overs.split('.')
        numBalls += int(overs) * 6
        numBalls+= int(balls)
        return numBalls
    return int(overs) * 6

In [157]:
def parse_bowl_table(tbl, player_list):
    columns = tbl.find_all('th')
    columns = [col.text for col in columns]
    rows = tbl.find('tbody').find_all('tr')
    data = []
    for row in rows:
        cols = row.find_all('td')
        data.append([col.text for col in cols])
    bowl_df =  pd.DataFrame(data=data, columns=columns)
    diff_players = list(set(player_list).difference(set(bowl_df['BOWLING'])))
    no_bowl = []
    [no_bowl.append([player, None, None, None, None, None, None, None, None, None, None]) for player in diff_players]
    no_bowl = pd.DataFrame(data = no_bowl, columns = columns)
    df =  pd.concat([bowl_df, no_bowl])
    df = df.apply(lambda x: clean_column(x), axis=0)
    df["ballsBowled"] = df["O"].apply(lambda x: getBallsFaced(x))
    return df

In [151]:
def clean_column(col):
    col = col.apply(lambda x: x.split('(')[0].strip() if pd.isna(x)!=True else x)
    col = col.apply(lambda x: re.sub('[^A-Za-z0-9. ]+', '', x) if pd.isna(x)!=True else x)
    col = col.fillna('0')
    return col

In [227]:
def search_player(queryName, threshold, player_list):
    ratios = [fuzz.partial_ratio(player, queryName) for player in player_list]
    if max(ratios) < threshold:
        return "NA"
    return player_list[ratios.index(max(ratios))]

In [218]:
def scrape_match_header(soup):
    header = soup.find('div', class_='match-header')
    mom = re.sub('[^A-Za-z0-9. ]+', '', header.find('div', class_='best-player-name').text)
    match_description = header.find('div', class_='description').text
    desc_splits = match_description.split(',')
    teams = header.find_all('div', class_='team')
    if len(teams) == 2:
        matchName = ' vs '.join([team.find('p').text for team in teams])
    else:
        matchName = ''
    if len(desc_splits) >= 3:
        return {
            "mom": mom,
            "matchId": re.sub('[^0-9]', '', desc_splits[0]),
            "matchName": matchName,
            "matchDate": datetime.strptime(desc_splits[2].strip(), '%b %d %Y').strftime('%Y-%m-%d')
        }
    else:
        return {
            "mom": mom,
            "matchId": None,
            "matchName": matchName,
            "matchDate": None
        }

In [254]:
def parse_fielding_stats(stat_description, player_list):
    catches_dict = defaultdict()
    stumping_dict = defaultdict()
    runout_dict = defaultdict()
    
    for stat in stat_description.values:
        if 'c & b' in stat:
            name = re.sub('[^A-Za-z ]+', '', stat.split('c & b')[1].strip())
            full_name = search_player(name, 90, player_list)
            catches_dict[full_name] = catches_dict.get(full_name, 0) + 1
        if 'c ' in stat:
            name = re.sub('[^A-Za-z ]+', '', stat.split('c')[1].split('b')[0].strip())
            full_name = search_player(name, 90, player_list)
            catches_dict[full_name] = catches_dict.get(full_name, 0) + 1
        if 'st ' in stat:
            name = re.sub('[^A-Za-z ]+', '', stat.split('st')[1].split('b')[0].strip())
            full_name = search_player(name, 90, player_list)
            stumping_dict[full_name] = stumping_dict.get(full_name, 0) + 1
        if 'run out' in stat:
            names = stat.split('run out')[1].split('/')
            for name in names:
                name = re.sub('[^A-Za-z ]+', '', name).strip()
                full_name = search_player(name, 90, player_list)
                runout_dict[full_name] = runout_dict.get(full_name, 0) + 1
    
    return catches_dict, stumping_dict, runout_dict

In [213]:
def format_stats(df, header, ct_d, st_d, ro_d):
    result = {}
    result["matchId"] = header["matchId"]
    result["matchName"] = header["matchName"]
    result["matchDate"] = header["matchDate"]
    playerStatList = []
    for _, row in df.iterrows():
        r = json.loads(row.to_json())
        playerStat = {}
        playerStat["playerName"] = r["player_name"]
        
        # Formatting Batting Stat.
        
        playerStat["battingStats"] = {
            "runs": int(r["R_bat"]),
            "ballsFaced": int(r["B"]),
            "noOfFours" : int(r["4s_bat"]),
            "noOfSixes": int(r["6s_bat"])
        }
        
        # Formatting Bowling Stat
        playerStat["bowlingStats"] = {
            "noOfWickets": int(r["W"]),
            "ballsBowled": int(r["ballsBowled"]),
            "dots": int(r["0s"]),
            "runsConceded": int(r["R_bowl"]),
            "maidens": int(r["M_bowl"])
        }
        
        # Formatting fielding stat.
        playerStat["fieldingStat"] = {
            "catches": int(ct_d.get(r["player_name"], 0)),
            "stumpings": int(st_d.get(r["player_name"], 0)),
            "runOuts": int(ro_d.get(r["player_name"], 0))
        }
        
        # Is Man of the match 
        playerStat["isMOM"] = True if header["mom"] == r["player_name"] else False
        playerStatList.append(playerStat)
    
    result["playerStatList"] = playerStatList
    return result
        
        
        

In [253]:
def scrape_score_table(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    headers = scrape_match_header(soup)
    batsman_tables = soup.find_all('table', class_='batsman')
    bowling_tables = soup.find_all('table', class_='bowler')
    team1_bat = parse_bat_table(batsman_tables[0])
    team2_bat = parse_bat_table(batsman_tables[1])
    team1_bowl = parse_bowl_table(bowling_tables[1], team1_bat['BATTING'])
    team2_bowl = parse_bowl_table(bowling_tables[0], team2_bat['BATTING'])
    team1_merged = team1_bat.merge(team1_bowl, left_on='BATTING', right_on='BOWLING', how='inner', suffixes=('_bat', '_bowl') )
    del team1_merged['BOWLING']
    team1_merged.rename(columns={'BATTING': 'player_name'}, inplace=True)
    team2_merged = team2_bat.merge(team2_bowl, left_on='BATTING', right_on='BOWLING', how='inner', suffixes=('_bat', '_bowl') )
    del team2_merged['BOWLING']
    team2_merged.rename(columns = {'BATTING': 'player_name'}, inplace=True)
    concated_df =  pd.concat([team1_merged, team2_merged])
    catches_dict, stumping_dict, runout_dict = parse_fielding_stats(concated_df['fielding_description'], list(concated_df["player_name"]))
    stats = format_stats(concated_df, headers, catches_dict, stumping_dict, runout_dict)
    return stats

In [237]:
URL1 = 'https://www.espncricinfo.com/series/ipl-2021-1249214/punjab-kings-vs-delhi-capitals-29th-match-1254086/full-scorecard'
URL2 = 'https://www.espncricinfo.com/series/ipl-2021-1249214/rajasthan-royals-vs-sunrisers-hyderabad-28th-match-1254085/full-scorecard'
URL3 = 'https://www.espncricinfo.com/series/ipl-2021-1249214/punjab-kings-vs-royal-challengers-bangalore-26th-match-1254083/full-scorecard'
URL4 = 'https://www.espncricinfo.com/series/ipl-2021-1249214/mumbai-indians-vs-rajasthan-royals-24th-match-1254081/full-scorecard'

In [250]:
scrape_score_table(URL1)

In [245]:
df

Unnamed: 0,player_name,R_bat,B,M_bat,4s_bat,6s_bat,SR,is_not_out,fielding_description,O,M_bowl,R_bowl,W,ECON,0s,4s_bowl,6s_bowl,WD,NB,ballsBowled
0,Jos Buttler,41,32,,3,3,128.12,False,st †de Kock b Chahar,0.0,0,0,0,0.0,0,0,0,0,0,0
1,Yashasvi Jaiswal,32,20,,2,2,160.0,False,c & b Chahar,0.0,0,0,0,0.0,0,0,0,0,0,0
2,Sanju Samson,42,27,,5,0,155.55,False,b Boult,0.0,0,0,0,0.0,0,0,0,0,0,0
3,Shivam Dube,35,31,,2,2,112.9,False,c & b Bumrah,1.0,0,6,0,6.0,3,0,0,1,0,6
4,David Miller,7,4,,1,0,175.0,True,not out,0.0,0,0,0,0.0,0,0,0,0,0,0
5,Riyan Parag,8,7,,1,0,114.28,True,not out,0.0,0,0,0,0.0,0,0,0,0,0,0
6,Rahul Tewatia,0,0,0.0,0,0,0.0,True,not out,3.0,0,30,0,10.0,4,3,1,1,0,18
7,Chris Morris,0,0,0.0,0,0,0.0,True,not out,4.0,0,33,2,8.25,9,2,2,0,0,24
8,Jaydev Unadkat,0,0,0.0,0,0,0.0,True,not out,4.0,0,33,0,8.25,7,3,1,0,0,24
9,Chetan Sakariya,0,0,0.0,0,0,0.0,True,not out,3.0,0,18,0,6.0,6,1,0,0,0,18


In [134]:
parse_fielding_stats(d['fielding_description'])

[' (Hetmyer', 'Patel)']


(defaultdict(None,
             {'Smith': 1,
              'Hetmyer': 1,
              'Lalit Yadav': 1,
              'Malan': 1,
              'Agarwal': 1}),
 defaultdict(None, {}),
 defaultdict(None, {'Hetmyer': 1, 'Patel': 1}))

In [135]:
d

Unnamed: 0,player_name,R_bat,B,M_bat,4s_bat,6s_bat,SR,is_not_out,fielding_description,O,M_bowl,R_bowl,W,ECON,0s,4s_bowl,6s_bowl,WD,NB
0,Prabhsimran Singh,12,16,,0,1,75.0,False,c Smith b Rabada,0.0,0,0,0,0.0,0,0,0,0,0
1,Mayank Agarwal,99,58,,8,4,170.68,True,not out,0.0,0,0,0,0.0,0,0,0,0,0
2,Chris Gayle,13,9,,1,1,144.44,False,b Rabada,0.0,0,0,0,0.0,0,0,0,0,0
3,Dawid Malan,26,26,,1,1,100.0,False,b Patel,0.0,0,0,0,0.0,0,0,0,0,0
4,Deepak Hooda,1,1,,0,0,100.0,False,run out (Hetmyer/Patel),2.0,0,11,0,5.5,3,0,0,0,0
5,Shahrukh Khan,4,5,,0,0,80.0,False,c Hetmyer b Avesh Khan,0.0,0,0,0,0.0,0,0,0,0,0
6,Chris Jordan,2,3,,0,0,66.66,False,c Lalit Yadav b Rabada,2.0,0,21,1,10.5,4,2,1,0,0
7,Harpreet Brar,4,2,,1,0,200.0,True,not out,3.0,0,19,1,6.33,4,0,0,0,0
8,Riley Meredith,0,0,0.0,0,0,0.0,True,not out,3.4,0,35,1,9.54,7,2,2,3,0
9,Ravi Bishnoi,0,0,0.0,0,0,0.0,True,not out,4.0,0,42,0,10.5,7,3,3,0,0


In [58]:
batting_stats = ['']
for _, row in d.iterrows():
    r = row.to_json()
    

0
{"player_name":"Prabhsimran Singh","R_bat":"12","B":"16","M_bat":"","4s_bat":"0","6s_bat":"1","SR":"75.00","is_not_out":false,"O":"0","M_bowl":"0","R_bowl":"0","W":"0","ECON":"0","0s":"0","4s_bowl":"0","6s_bowl":"0","WD":"0","NB":"0"}
1
{"player_name":"Mayank Agarwal","R_bat":"99","B":"58","M_bat":"","4s_bat":"8","6s_bat":"4","SR":"170.68","is_not_out":true,"O":"0","M_bowl":"0","R_bowl":"0","W":"0","ECON":"0","0s":"0","4s_bowl":"0","6s_bowl":"0","WD":"0","NB":"0"}
2
{"player_name":"Chris Gayle","R_bat":"13","B":"9","M_bat":"","4s_bat":"1","6s_bat":"1","SR":"144.44","is_not_out":false,"O":"0","M_bowl":"0","R_bowl":"0","W":"0","ECON":"0","0s":"0","4s_bowl":"0","6s_bowl":"0","WD":"0","NB":"0"}
3
{"player_name":"Dawid Malan","R_bat":"26","B":"26","M_bat":"","4s_bat":"1","6s_bat":"1","SR":"100.00","is_not_out":false,"O":"0","M_bowl":"0","R_bowl":"0","W":"0","ECON":"0","0s":"0","4s_bowl":"0","6s_bowl":"0","WD":"0","NB":"0"}
4
{"player_name":"Deepak Hooda","R_bat":"1","B":"1","M_bat":"","4s

In [95]:
scrape_score_table(URL2)

{'Jos Buttler': {'R_bat': '124',
  'B': '64',
  'M_bat': '',
  '4s_bat': '11',
  '6s_bat': '8',
  'SR': '193.75',
  'is_not_out': False,
  'O': '0',
  'M_bowl': '0',
  'R_bowl': '0',
  'W': '0',
  'ECON': '0',
  '0s': '0',
  '4s_bowl': '0',
  '6s_bowl': '0',
  'WD': '0',
  'NB': '0'},
 'Yashasvi Jaiswal': {'R_bat': '12',
  'B': '13',
  'M_bat': '',
  '4s_bat': '2',
  '6s_bat': '0',
  'SR': '92.30',
  'is_not_out': False,
  'O': '0',
  'M_bowl': '0',
  'R_bowl': '0',
  'W': '0',
  'ECON': '0',
  '0s': '0',
  '4s_bowl': '0',
  '6s_bowl': '0',
  'WD': '0',
  'NB': '0'},
 'Sanju Samson': {'R_bat': '48',
  'B': '33',
  'M_bat': '',
  '4s_bat': '4',
  '6s_bat': '2',
  'SR': '145.45',
  'is_not_out': False,
  'O': '0',
  'M_bowl': '0',
  'R_bowl': '0',
  'W': '0',
  'ECON': '0',
  '0s': '0',
  '4s_bowl': '0',
  '6s_bowl': '0',
  'WD': '0',
  'NB': '0'},
 'Riyan Parag': {'R_bat': '15',
  'B': '8',
  'M_bat': '',
  '4s_bat': '0',
  '6s_bat': '1',
  'SR': '187.50',
  'is_not_out': True,
  'O': '

In [96]:
scrape_score_table(URL3)

{'KL Rahul': {'R_bat': '91',
  'B': '57',
  'M_bat': '',
  '4s_bat': '7',
  '6s_bat': '5',
  'SR': '159.64',
  'is_not_out': True,
  'O': '0',
  'M_bowl': '0',
  'R_bowl': '0',
  'W': '0',
  'ECON': '0',
  '0s': '0',
  '4s_bowl': '0',
  '6s_bowl': '0',
  'WD': '0',
  'NB': '0'},
 'Prabhsimran Singh': {'R_bat': '7',
  'B': '7',
  'M_bat': '',
  '4s_bat': '1',
  '6s_bat': '0',
  'SR': '100.00',
  'is_not_out': False,
  'O': '0',
  'M_bowl': '0',
  'R_bowl': '0',
  'W': '0',
  'ECON': '0',
  '0s': '0',
  '4s_bowl': '0',
  '6s_bowl': '0',
  'WD': '0',
  'NB': '0'},
 'Chris Gayle': {'R_bat': '46',
  'B': '24',
  'M_bat': '',
  '4s_bat': '6',
  '6s_bat': '2',
  'SR': '191.66',
  'is_not_out': False,
  'O': '0',
  'M_bowl': '0',
  'R_bowl': '0',
  'W': '0',
  'ECON': '0',
  '0s': '0',
  '4s_bowl': '0',
  '6s_bowl': '0',
  'WD': '0',
  'NB': '0'},
 'Nicholas Pooran': {'R_bat': '0',
  'B': '3',
  'M_bat': '',
  '4s_bat': '0',
  '6s_bat': '0',
  'SR': '0.00',
  'is_not_out': False,
  'O': '0',
 