This notebook demonstrates a Python script that extracts Indian Premier League (IPL) data from https://www.espncricinfo.com/ using web-scraping techniques.
It uses the requests library to fetch the HTML content and BeautifulSoup to parse it and extract the desired information.
At the end, the fetched data is saved as CSV files.

Click [link](https://www.espncricinfo.com/records/trophy/team-match-results/indian-premier-league-117) to visit the source page (official ESPNcricinfo site).

In [1]:
from bs4 import BeautifulSoup
import pandas as pd

import requests

In [2]:
def get_tables(url, timeout=30):
    """Scrape one ESPNcricinfo full-scorecard page into five DataFrames.

    Parameters
    ----------
    url : str
        Full-scorecard URL. The match id is read from the text after the
        last '-' of the second-to-last path segment, e.g.
        ``.../gujarat-titans-vs-chennai-super-kings-final-1370353/full-scorecard``
        yields ``'1370353'``. A trailing '/' is tolerated.
    timeout : float, optional
        Seconds to wait for the HTTP response (forwarded to ``requests.get``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(batting_1, bowling_1, batting_2, bowling_2, point_table)``. The
        first four are built from the page's 1st-4th ``<tbody>``/``<thead>``
        pairs; ``point_table`` pairs the 9th ``<tbody>`` with the 8th header
        row. Every frame gets a ``match_id`` column.

    Raises
    ------
    OSError
        The request failed or the server answered with an error status
        (requests' exceptions derive from OSError).
    ValueError
        No match id can be extracted from ``url``.
    IndexError
        The page holds fewer tables than the fixed layout above expects.
    """
    try:
        # Send an HTTP GET request to the website. Without a timeout a stalled
        # connection would block the notebook indefinitely.
        response = requests.get(url, timeout=timeout)
    except OSError:
        # Keep the original hint, but re-raise: continuing without a response
        # only produced an unrelated NameError further down.
        print("Check Your Internet!")
        raise
    # Fail on 4xx/5xx responses instead of trying to parse an error page.
    response.raise_for_status()

    # Parse the HTML code using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # The title is only used for the progress message, so a missing <title>
    # is not treated as fatal.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    segments = url.rstrip('/').split('/')
    if len(segments) < 2:
        raise ValueError(f"Cannot extract a match id from URL: {url!r}")
    match_id = segments[-2].split('-')[-1]

    print("Fetch data for :", title)
    print("With Match_id :", match_id)

    # One list of header texts per <thead> row found in any table.
    main_column = [
        [cell.get_text() for cell in header_row.select('th')]
        for header_row in soup.select('table thead tr')
    ]

    # One list of records per <tbody>; rows with two or fewer cells are skipped.
    main_table = []
    for body in soup.select('table tbody'):
        record = []
        for row in body.select('tr'):
            item_list = [cell.get_text() for cell in row.select('td')]
            if len(item_list) > 2:
                record.append(item_list)
        main_table.append(record)

    # (tbody index, header index) for each returned frame, in return order.
    # The points table uses body 8 with header 7, as in the original layout;
    # presumably one earlier tbody has no matching thead row - TODO confirm.
    table_slots = ((0, 0), (1, 1), (2, 2), (3, 3), (8, 7))
    if len(main_table) < 9 or len(main_column) < 8:
        raise IndexError(
            f"Unexpected page layout for {url!r}: found {len(main_table)} table "
            f"bodies and {len(main_column)} header rows, need at least 9 and 8"
        )

    frames = []
    for body_index, header_index in table_slots:
        frame = pd.DataFrame(main_table[body_index], columns=main_column[header_index])
        frame['match_id'] = match_id
        frames.append(frame)

    batting_1, bowling_1, batting_2, bowling_2, point_table = frames
    return batting_1, bowling_1, batting_2, bowling_2, point_table

In [3]:
# Try get_tables on a single scorecard page; the slug in this URL ends with match id 1370353.
url = "https://www.espncricinfo.com/series/indian-premier-league-2023-1345038/gujarat-titans-vs-chennai-super-kings-final-1370353/full-scorecard"
batting_1, bowling_1, batting_2, bowling_2, point_table = get_tables(url)

Fetch data for : GT vs CSK, Indian Premier League 2023, Final at Ahmedabad, May 28 - 29, 2023 - Full Scorecard
With Match_id : 1370353


In [4]:
# Preview the first seven rows of the first frame returned by get_tables.
batting_1.head(7)

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR,match_id
0,Wriddhiman Saha †,c †Dhoni b Chahar,54,39.0,63.0,5.0,1.0,138.46,1370353
1,Shubman Gill,st †Dhoni b Jadeja,39,20.0,34.0,7.0,0.0,195.0,1370353
2,Sai Sudharsan,lbw b Pathirana,96,47.0,58.0,8.0,6.0,204.25,1370353
3,Hardik Pandya (c),not out,21,12.0,32.0,0.0,2.0,175.0,1370353
4,Rashid Khan,c Gaikwad b Pathirana,0,2.0,2.0,0.0,0.0,0.0,1370353
5,Extras,"(b 1, lb 1, w 2)",4,,,,,,1370353
6,TOTAL,20 Ov (RR: 10.70),214/4,,,,,,1370353


In [5]:
# Load the match-results index from a local CSV. No visible cell in this notebook
# creates the file, so it must already exist in the working directory.
match_results_df = pd.read_csv("ipl_match_results.csv")
match_results_df.head()

Unnamed: 0,team_1,team_2,winner,margin,ground,match_date,scorecard,match_id
0,Titans,Super Kings,Super Kings,5 wickets,Ahmedabad,"May 28, 2023",/series/indian-premier-league-2023-1345038/guj...,1370353
1,Titans,Mumbai,Titans,62 runs,Ahmedabad,"May 26, 2023",/series/indian-premier-league-2023-1345038/guj...,1370352
2,Super Giants,Mumbai,Mumbai,81 runs,Chennai,"May 24, 2023",/series/indian-premier-league-2023-1345038/luc...,1370351
3,Super Kings,Titans,Super Kings,15 runs,Chennai,"May 23, 2023",/series/indian-premier-league-2023-1345038/che...,1370350
4,RCB,Titans,Titans,6 wickets,Bengaluru,"May 21, 2023",/series/indian-premier-league-2023-1345038/roy...,1359544


In [6]:
# Turn every site-relative scorecard path into an absolute URL, then peek at the first three.
scorecard_links = [
    "https://www.espncricinfo.com" + relative_path
    for relative_path in match_results_df["scorecard"].tolist()
]
scorecard_links[:3]

['https://www.espncricinfo.com/series/indian-premier-league-2023-1345038/gujarat-titans-vs-chennai-super-kings-final-1370353/full-scorecard',
 'https://www.espncricinfo.com/series/indian-premier-league-2023-1345038/gujarat-titans-vs-mumbai-indians-qualifier-2-1370352/full-scorecard',
 'https://www.espncricinfo.com/series/indian-premier-league-2023-1345038/lucknow-super-giants-vs-mumbai-indians-eliminator-1370351/full-scorecard']

In [7]:
# Run the scraper on the link at position 72 of scorecard_links and preview its first frame.
batting_1, bowling_1, batting_2, bowling_2, point_table = get_tables(scorecard_links[72])
batting_1.head(7)

Fetch data for : PBKS vs KKR, Indian Premier League 2023, 2nd Match at Chandigarh, April 01, 2023 - Full Scorecard
With Match_id : 1359476


Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR,match_id
0,Prabhsimran Singh,c †Rahmanullah Gurbaz b Southee,23,12,10,2,2,191.66,1359476
1,Shikhar Dhawan (c),b Varun,40,29,75,6,0,137.93,1359476
2,Bhanuka Rajapaksa,c Singh b Yadav,50,32,47,5,2,156.25,1359476
3,Jitesh Sharma †,c Yadav b Southee,21,11,13,1,2,190.9,1359476
4,Sikandar Raza,c Rana b Narine,16,13,22,1,1,123.07,1359476
5,Sam Curran,not out,26,17,27,0,2,152.94,1359476
6,M Shahrukh Khan,not out,11,7,11,2,0,157.14,1359476


In [8]:
# Number of scorecard links built so far (one per row of match_results_df).
print("Total No of Matches :")
len(scorecard_links)

Total No of Matches :


1025

In [9]:
# List the matches whose winner is recorded as 'no result'.
match_results_df.query("winner == 'no result'")

Unnamed: 0,team_1,team_2,winner,margin,ground,match_date,scorecard,match_id
29,Super Giants,Super Kings,no result,-,Lucknow,"May 3, 2023",/series/indian-premier-league-2023-1345038/luc...,1359519
279,RCB,Royals,no result,-,Bengaluru,"Apr 30, 2019",/series/ipl-2019-1165643/royal-challengers-ban...,1178424
512,RCB,Daredevils,no result,-,Bengaluru,"May 17, 2015",/series/pepsi-indian-premier-league-2015-79112...,829813
538,RCB,Royals,no result,-,Bengaluru,"Apr 29, 2015",/series/pepsi-indian-premier-league-2015-79112...,829763
744,RCB,Super Kings,no result,-,Bengaluru,"Apr 25, 2012",/series/indian-premier-league-2012-520932/roya...,548340
783,Daredevils,Warriors,no result,-,Delhi,"May 21, 2011",/series/indian-premier-league-2011-466304/delh...,501265


In [10]:
# Keep only matches that produced a result, then rebuild the absolute scorecard URLs
# from the filtered frame.
has_result = match_results_df["winner"] != "no result"
match_results_df = match_results_df[has_result]
scorecard_links = [
    "https://www.espncricinfo.com" + relative_path
    for relative_path in match_results_df["scorecard"].tolist()
]

print("Total No of Matches :")
len(scorecard_links)

Total No of Matches :


1019

In [11]:
# Show the rows whose match_date string contains '2023'.
match_results_df[match_results_df.match_date.str.contains('2023')]

Unnamed: 0,team_1,team_2,winner,margin,ground,match_date,scorecard,match_id
0,Titans,Super Kings,Super Kings,5 wickets,Ahmedabad,"May 28, 2023",/series/indian-premier-league-2023-1345038/guj...,1370353
1,Titans,Mumbai,Titans,62 runs,Ahmedabad,"May 26, 2023",/series/indian-premier-league-2023-1345038/guj...,1370352
2,Super Giants,Mumbai,Mumbai,81 runs,Chennai,"May 24, 2023",/series/indian-premier-league-2023-1345038/luc...,1370351
3,Super Kings,Titans,Super Kings,15 runs,Chennai,"May 23, 2023",/series/indian-premier-league-2023-1345038/che...,1370350
4,RCB,Titans,Titans,6 wickets,Bengaluru,"May 21, 2023",/series/indian-premier-league-2023-1345038/roy...,1359544
...,...,...,...,...,...,...,...,...
69,RCB,Mumbai,RCB,8 wickets,Bengaluru,"Apr 2, 2023",/series/indian-premier-league-2023-1345038/roy...,1359479
70,Sunrisers,Royals,Royals,72 runs,Hyderabad,"Apr 2, 2023",/series/indian-premier-league-2023-1345038/sun...,1359478
71,Super Giants,Capitals,Super Giants,50 runs,Lucknow,"Apr 1, 2023",/series/indian-premier-league-2023-1345038/luc...,1359477
72,Punjab Kings,KKR,Punjab Kings,7 runs,Mohali,"Apr 1, 2023",/series/indian-premier-league-2023-1345038/pun...,1359476


In [12]:
# Fetch the scorecards behind the first 73 links in scorecard_links.
# Frames are collected per table and concatenated once at the end: calling
# pd.concat inside the loop re-copies the growing frames on every iteration.
collected = [[] for _ in range(5)]  # one bucket per frame returned by get_tables
for each_match in scorecard_links[:73]:
    for bucket, frame in zip(collected, get_tables(each_match)):
        bucket.append(frame)

# An empty bucket (no links fetched) still yields an empty DataFrame, as before.
batting_1, bowling_1, batting_2, bowling_2, point_table = (
    pd.concat(bucket) if bucket else pd.DataFrame() for bucket in collected
)

Fetch data for : GT vs CSK, Indian Premier League 2023, Final at Ahmedabad, May 28 - 29, 2023 - Full Scorecard
With Match_id : 1370353
Fetch data for : GT vs MI, Indian Premier League 2023, Qualifier 2 at Ahmedabad, May 26, 2023 - Full Scorecard
With Match_id : 1370352
Fetch data for : LSG vs MI, Indian Premier League 2023, Eliminator at Chennai, May 24, 2023 - Full Scorecard
With Match_id : 1370351
Fetch data for : CSK vs GT, Indian Premier League 2023, Qualifier 1 at Chennai, May 23, 2023 - Full Scorecard
With Match_id : 1370350
Fetch data for : RCB vs GT, Indian Premier League 2023, 70th Match at Bengaluru, May 21, 2023 - Full Scorecard
With Match_id : 1359544
Fetch data for : MI vs SRH, Indian Premier League 2023, 69th Match at Mumbai, May 21, 2023 - Full Scorecard
With Match_id : 1359543
Fetch data for : KKR vs LSG, Indian Premier League 2023, 68th Match at Kolkata, May 20, 2023 - Full Scorecard
With Match_id : 1359542
Fetch data for : DC vs CSK, Indian Premier League 2023, 67th M

In [13]:
# Display the stacked batting_1 frame; row labels restart for each match because
# the per-match frames were concatenated without resetting the index.
batting_1

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR,match_id
0,Wriddhiman Saha †,c †Dhoni b Chahar,54,39,63,5,1,138.46,1370353
1,Shubman Gill,st †Dhoni b Jadeja,39,20,34,7,0,195.00,1370353
2,Sai Sudharsan,lbw b Pathirana,96,47,58,8,6,204.25,1370353
3,Hardik Pandya (c),not out,21,12,32,0,2,175.00,1370353
4,Rashid Khan,c Gaikwad b Pathirana,0,2,2,0,0,0.00,1370353
...,...,...,...,...,...,...,...,...,...
6,Ravindra Jadeja,c Shankar b Joseph,1,2,-,0,0,50.00,1359475
7,MS Dhoni (c)†,not out,14,7,-,1,1,200.00,1359475
8,Mitchell Santner,not out,1,3,-,0,0,33.33,1359475
9,Extras,"(b 1, lb 6, nb 1)",8,,,,,,1359475


- 

In [14]:
# cumcount().add(1) numbers the rows 1..k within each match_id; check whether any match reaches a 14th row.
14 in batting_1.groupby('match_id').cumcount().add(1).tolist()

False

In [15]:
# Same check for batting_2: does any match have a 14th row?
14 in batting_2.groupby('match_id').cumcount().add(1).tolist()

False

In [16]:
# Number the rows 1..k within each match in scraped order and store that as batting_position.
# NOTE(review): every row is numbered, so non-player rows such as 'Extras' also receive a
# position; filter them out before using this column for analysis.
batting_1['batting_position'] = batting_1.groupby('match_id').cumcount().add(1)
batting_2['batting_position'] = batting_2.groupby('match_id').cumcount().add(1)

In [17]:
# Inspect the 'Extras' summary rows that were scraped alongside the player rows.
batting_1[batting_1['BATTING'] == 'Extras']

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR,match_id,batting_position
5,Extras,"(b 1, lb 1, w 2)",4,,,,,,1370353,6
5,Extras,"(lb 2, nb 2, w 6)",10,,,,,,1370352,6
9,Extras,"(b 8, lb 2, w 6)",16,,,,,,1370351,10
8,Extras,"(b 1, lb 2, nb 2)",5,,,,,,1370350,9
7,Extras,"(lb 2, w 5)",7,,,,,,1359544,8
...,...,...,...,...,...,...,...,...,...,...
9,Extras,"(nb 1, w 10)",11,,,,,,1359479,10
7,Extras,"(b 5, lb 1, w 2)",8,,,,,,1359478,8
8,Extras,"(b 1, lb 2, w 5)",8,,,,,,1359477,9
7,Extras,"(lb 1, nb 1, w 2)",4,,,,,,1359476,8


In [18]:
# Display the stacked bowling_1 frame (second table of every fetched scorecard).
bowling_1

Unnamed: 0,BOWLING,O,M,R,W,ECON,0s,4s,6s,WD,NB,match_id
0,Deepak Chahar,4,0,38,1,9.50,5,3,1,0,0,1370353
1,Tushar Deshpande,4,0,56,0,14.00,4,7,3,0,0,1370353
2,Maheesh Theekshana,4,0,36,0,9.00,8,3,2,0,0,1370353
3,Ravindra Jadeja,4,0,38,1,9.50,3,3,1,0,0,1370353
4,Matheesha Pathirana,4,0,44,2,11.00,7,4,2,2,0,1370353
...,...,...,...,...,...,...,...,...,...,...,...,...
1,Hardik Pandya,3,0,28,0,9.33,6,2,2,0,0,1359475
2,Josh Little,4,0,41,1,10.25,10,4,3,0,0,1359475
3,Rashid Khan,4,0,26,2,6.50,10,2,1,0,0,1359475
4,Alzarri Joseph,4,0,33,2,8.25,8,0,3,0,0,1359475


In [19]:
# First ten rows of the stacked points table (the copy scraped from the first link).
point_table.head(10)

Unnamed: 0,TEAM,M,W,L,PT,NRR,match_id
0,GT,14,10,4,20,0.809,1370353
1,CSK,14,8,5,17,0.652,1370353
2,LSG,14,8,5,17,0.284,1370353
3,MI,14,8,6,16,-0.044,1370353
4,RR,14,7,7,14,0.148,1370353
5,RCB,14,7,7,14,0.135,1370353
6,KKR,14,6,8,12,-0.239,1370353
7,PBKS,14,6,8,12,-0.304,1370353
8,DC,14,5,9,10,-0.808,1370353
9,SRH,14,4,10,8,-0.59,1370353


In [20]:
# Last ten rows of the stacked points table, for comparison with the first ten.
point_table.tail(10)

Unnamed: 0,TEAM,M,W,L,PT,NRR,match_id
0,GT,14,10,4,20,0.809,1359475
1,CSK,14,8,5,17,0.652,1359475
2,LSG,14,8,5,17,0.284,1359475
3,MI,14,8,6,16,-0.044,1359475
4,RR,14,7,7,14,0.148,1359475
5,RCB,14,7,7,14,0.135,1359475
6,KKR,14,6,8,12,-0.239,1359475
7,PBKS,14,6,8,12,-0.304,1359475
8,DC,14,5,9,10,-0.808,1359475
9,SRH,14,4,10,8,-0.59,1359475


**Note**
- **Above** we have a total of 5 datasets: batting_1, bowling_1, batting_2, bowling_2 and point_table.
- **batting_1** and **batting_2** can be merged, because the match_id column lets us partition them again later if required.
- The same applies to **bowling_1** and **bowling_2**.
- **point_table** contains the same data for each match: the overall points table of the series.
##### So in the steps below we will merge **batting_1** / **batting_2** and **bowling_1** / **bowling_2**.
##### For point_table, keep only the records of the last match.
###### Later the code will be refined, optimized, and made available as a .py file or as a data pipeline. (*The current version of the code is a base idea to collect data and start analytics work.*)

In [21]:
# Stack both batting frames and both bowling frames; match_id stays available for partitioning.
batting = pd.concat([batting_1, batting_2])
bowling = pd.concat([bowling_1, bowling_2])
# head(10) keeps the first ten rows of the stacked points table, i.e. the copy
# scraped from the first link that was fetched.
point_table = point_table.head(10)

In [22]:
# Sanity check: the merged frame's row count should equal the sum of its two inputs.
batting_1.shape, batting_2.shape, batting.shape

((726, 10), (739, 10), (1465, 10))

In [23]:
# Same row-count check for the merged bowling frame.
bowling_1.shape, bowling_2.shape, bowling.shape

((443, 12), (449, 12), (892, 12))

In [24]:
# Shape of the points table after trimming it to ten rows.
point_table.shape

(10, 7)

In [25]:
# Preview the merged batting frame before its columns are renamed.
batting.head()

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR,match_id,batting_position
0,Wriddhiman Saha †,c †Dhoni b Chahar,54,39,63,5,1,138.46,1370353,1
1,Shubman Gill,st †Dhoni b Jadeja,39,20,34,7,0,195.0,1370353,2
2,Sai Sudharsan,lbw b Pathirana,96,47,58,8,6,204.25,1370353,3
3,Hardik Pandya (c),not out,21,12,32,0,2,175.0,1370353,4
4,Rashid Khan,c Gaikwad b Pathirana,0,2,2,0,0,0.0,1370353,5


In [26]:
# Positional rename: the ten names are assigned in the frame's current column order,
# so this list must stay in sync with the scraped header order.
batting.columns = ['batsman', 'out', 'run', 'ball', 'minutes', '4s', '6s', 'sr', 'match_id', 'batting_position']
batting.head()

Unnamed: 0,batsman,out,run,ball,minutes,4s,6s,sr,match_id,batting_position
0,Wriddhiman Saha †,c †Dhoni b Chahar,54,39,63,5,1,138.46,1370353,1
1,Shubman Gill,st †Dhoni b Jadeja,39,20,34,7,0,195.0,1370353,2
2,Sai Sudharsan,lbw b Pathirana,96,47,58,8,6,204.25,1370353,3
3,Hardik Pandya (c),not out,21,12,32,0,2,175.0,1370353,4
4,Rashid Khan,c Gaikwad b Pathirana,0,2,2,0,0,0.0,1370353,5


In [27]:
# Preview the merged bowling frame before its columns are renamed.
bowling.head()

Unnamed: 0,BOWLING,O,M,R,W,ECON,0s,4s,6s,WD,NB,match_id
0,Deepak Chahar,4,0,38,1,9.5,5,3,1,0,0,1370353
1,Tushar Deshpande,4,0,56,0,14.0,4,7,3,0,0,1370353
2,Maheesh Theekshana,4,0,36,0,9.0,8,3,2,0,0,1370353
3,Ravindra Jadeja,4,0,38,1,9.5,3,3,1,0,0,1370353
4,Matheesha Pathirana,4,0,44,2,11.0,7,4,2,2,0,1370353


In [28]:
# Positional rename: the twelve names are assigned in the frame's current column order,
# so this list must stay in sync with the scraped header order.
bowling.columns = ['bowling', 'over', 'maiden', 'run', 'wickets', 'economy_rate', '0s', '4s', '6s', 'wide', 'no_ball', 'match_id']
bowling.head()

Unnamed: 0,bowling,over,maiden,run,wickets,economy_rate,0s,4s,6s,wide,no_ball,match_id
0,Deepak Chahar,4,0,38,1,9.5,5,3,1,0,0,1370353
1,Tushar Deshpande,4,0,56,0,14.0,4,7,3,0,0,1370353
2,Maheesh Theekshana,4,0,36,0,9.0,8,3,2,0,0,1370353
3,Ravindra Jadeja,4,0,38,1,9.5,3,3,1,0,0,1370353
4,Matheesha Pathirana,4,0,44,2,11.0,7,4,2,2,0,1370353


In [29]:
# Write the merged batting data to a CSV in the working directory, without the row index.
batting.to_csv("ipl_2023_batting.csv", index=False)
print("ipl_2023_batting.csv downloaded successfully !!!")

ipl_2023_batting.csv downloaded successfully !!!


In [30]:
# Write the merged bowling data to a CSV in the working directory, without the row index.
bowling.to_csv("ipl_2023_bowling.csv", index=False)
print("ipl_2023_bowling.csv downloaded successfully !!!")

ipl_2023_bowling.csv downloaded successfully !!!


In [31]:
# Write the ten-row points table to a CSV in the working directory, without the row index.
point_table.to_csv("ipl_2023_point_table.csv", index=False)
print("ipl_2023_point_table.csv downloaded successfully !!!")

ipl_2023_point_table.csv downloaded successfully !!!
