This notebook demonstrates a Python script that extracts Indian Premier League data from https://www.espncricinfo.com/ using web scraping techniques.
It leverages the requests library to fetch HTML content and BeautifulSoup to parse and extract the desired information.
At the end, the fetched data is saved as a CSV file.

Click [link](https://www.espncricinfo.com/records/trophy/team-match-results/indian-premier-league-117) to visit the source page (official ESPNcricinfo).

In [1]:
from bs4 import BeautifulSoup
import pandas as pd

import requests

In [2]:
# URL of the ESPNcricinfo full-scorecard page to scrape.
url = "https://www.espncricinfo.com/series/indian-premier-league-2023-1345038/gujarat-titans-vs-chennai-super-kings-final-1370353/full-scorecard"

try:
    # Send an HTTP GET request to the website. The timeout stops the cell
    # from hanging indefinitely when the server never answers.
    response = requests.get(url, timeout=30)

    # Turn 4xx/5xx replies into an exception so that an error page is never
    # parsed as if it were the scorecard.
    response.raise_for_status()

except OSError:
    # requests' connection, timeout and HTTP errors all derive from OSError.
    # Re-raise after the hint: carrying on would leave `soup` undefined and
    # the following cells would fail with an unrelated NameError.
    print("Check Your Internet!")
    raise

# Parse the HTML code using BeautifulSoup.
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
# Collect the header (column-name) row of every table on the page.
table_list = soup.select('table thead tr')
main_column = []

for table in table_list:
    # One list of <th> texts per header row.
    header_cells = table.select('th')
    main_column.append([cell.get_text() for cell in header_cells])

print(main_column)
# `table` is still bound to the last header row visited by the loop above.
len(table)

[['BATTING', '\xa0', 'R', 'B', 'M', '4s', '6s', 'SR'], ['BOWLING', 'O', 'M', 'R', 'W', 'ECON', '0s', '4s', '6s', 'WD', 'NB'], ['BATTING', '\xa0', 'R', 'B', 'M', '4s', '6s', 'SR'], ['BOWLING', 'O', 'M', 'R', 'W', 'ECON', '0s', '4s', '6s', 'WD', 'NB'], ['PLAYER NAME', 'R', 'B'], ['PLAYER NAME', 'O', 'M', 'R', 'W', 'Econ'], ['WKT', 'RUNS', 'PLAYERS'], ['TEAM', 'M', 'W', 'L', 'PT', 'NRR']]


6

In [4]:
# Collect the body rows (records) of every table on the page.
table_list = soup.select('table tbody')

main_table = []
for table in table_list:
    # Text of each <td> cell, one list per <tr> row.
    rows = ([cell.get_text() for cell in tr.select('td')] for tr in table.select('tr'))
    # Keep only the rows that have more than two cells.
    main_table.append([cells for cells in rows if len(cells) > 2])
len(main_table)

9

Basic check for the 1st table: the number of cells in a row should match the number of column names.

In [5]:
# 1st_table contains BATTING details of 1st team.
# Compare the cell count of its first row with the number of header names.
len(main_table[0][0]), len(main_column[0])

(8, 8)

In [6]:
# Build the 1st team's batting DataFrame from body table 0, using header row 0
# as the column names, and display it.
batting_1 = pd.DataFrame(main_table[0], columns = main_column[0])
batting_1

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR
0,Wriddhiman Saha †,c †Dhoni b Chahar,54,39.0,63.0,5.0,1.0,138.46
1,Shubman Gill,st †Dhoni b Jadeja,39,20.0,34.0,7.0,0.0,195.0
2,Sai Sudharsan,lbw b Pathirana,96,47.0,58.0,8.0,6.0,204.25
3,Hardik Pandya (c),not out,21,12.0,32.0,0.0,2.0,175.0
4,Rashid Khan,c Gaikwad b Pathirana,0,2.0,2.0,0.0,0.0,0.0
5,Extras,"(b 1, lb 1, w 2)",4,,,,,
6,TOTAL,20 Ov (RR: 10.70),214/4,,,,,


In [7]:
# 2nd_table contains BOWLING details of 1st team.
# NOTE(review): judging by the recorded output, these are the bowlers who
# bowled during the 1st team's batting (i.e. the opposing side) - confirm.
# Compare the cell count of its first row with the number of header names.
len(main_table[1][0]), len(main_column[1])

(11, 11)

In [8]:
# Build the bowling DataFrame from body table 1, using header row 1 as the
# column names, and display it.
bowling_1 = pd.DataFrame(main_table[1], columns = main_column[1])
bowling_1

Unnamed: 0,BOWLING,O,M,R,W,ECON,0s,4s,6s,WD,NB
0,Deepak Chahar,4,0,38,1,9.5,5,3,1,0,0
1,Tushar Deshpande,4,0,56,0,14.0,4,7,3,0,0
2,Maheesh Theekshana,4,0,36,0,9.0,8,3,2,0,0
3,Ravindra Jadeja,4,0,38,1,9.5,3,3,1,0,0
4,Matheesha Pathirana,4,0,44,2,11.0,7,4,2,2,0


In [9]:
# 3rd_table contains BATTING details of 2nd team.
len(main_table[2][0]), len(main_column[2])

(8, 8)

In [10]:
# Build the 2nd team's batting DataFrame from body table 2, using header row 2
# as the column names, and display it.
batting_2 = pd.DataFrame(main_table[2], columns = main_column[2])
batting_2

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR
0,Ruturaj Gaikwad,c Rashid Khan b Noor Ahmad,26,16.0,30.0,3.0,1.0,162.5
1,Devon Conway,c Sharma b Noor Ahmad,47,25.0,34.0,4.0,2.0,188.0
2,Shivam Dube,not out,32,21.0,49.0,0.0,2.0,152.38
3,Ajinkya Rahane,c Shankar b Sharma,27,13.0,20.0,2.0,2.0,207.69
4,Ambati Rayudu,c & b Sharma,19,8.0,8.0,1.0,2.0,237.5
5,MS Dhoni (c)†,c Miller b Sharma,0,1.0,1.0,0.0,0.0,0.0
6,Ravindra Jadeja,not out,15,6.0,14.0,1.0,1.0,250.0
7,Extras,"(lb 1, w 4)",5,,,,,
8,TOTAL,15 Ov (RR: 11.40),171/5,,,,,


In [11]:
# 4th_table contains BOWLING details of 2nd team.
# NOTE(review): judging by the recorded output, these are the bowlers who
# bowled during the 2nd team's batting (i.e. the opposing side) - confirm.
# Compare the cell count of its first row with the number of header names.
len(main_table[3][0]), len(main_column[3])

(11, 11)

In [12]:
# Build the bowling DataFrame from body table 3, using header row 3 as the
# column names, and display it.
bowling_2 = pd.DataFrame(main_table[3], columns = main_column[3])
bowling_2

Unnamed: 0,BOWLING,O,M,R,W,ECON,0s,4s,6s,WD,NB
0,Mohammed Shami,3,0,29,0,9.66,5,4,0,0,0
1,Hardik Pandya,1,0,14,0,14.0,1,1,1,1,0
2,Rashid Khan,3,0,44,0,14.66,2,4,3,0,0
3,Noor Ahmad,3,0,17,2,5.66,6,0,0,3,0
4,Josh Little,2,0,30,0,15.0,1,0,3,0,0
5,Mohit Sharma,3,0,36,3,12.0,4,2,3,0,0


In [13]:
# Body table 5 is checked against header row 4: body and header indices are
# offset here because the page yielded more <tbody> sections than header rows.
# NOTE(review): in the recorded run the widths differ (4 cells vs 3 names), so
# this pair cannot be passed to pd.DataFrame(..., columns=...) unchanged.
len(main_table[5][0]), len(main_column[4])

(4, 3)

In [14]:
# Show a sample row (index 2) of body table 5 next to header row 4.
main_table[5][2], main_column[4]

(['S Dube', 'not out', '32', '21'], ['PLAYER NAME', 'R', 'B'])

In [15]:
# Body table 6 is checked against header row 5 (the 'PLAYER NAME' table with
# O / M / R / W / Econ columns): compare first-row width with header width.
len(main_table[6][0]), len(main_column[5])

(6, 6)

In [16]:
# Show a sample row (index 2) of body table 6 next to header row 5.
main_table[6][2], main_column[5]

(['Rashid Khan', '3', '0', '44', '0', '14.66 '],
 ['PLAYER NAME', 'O', 'M', 'R', 'W', 'Econ'])

In [17]:
# Show a sample row (index 2) of body table 8 next to header row 7.
main_table[8][2], main_column[7]

(['LSG', '14', '8', '5', '17', '0.284'], ['TEAM', 'M', 'W', 'L', 'PT', 'NRR'])

In [18]:
# Build the points-table DataFrame from body table 8, using header row 7
# (TEAM, M, W, L, PT, NRR) as the column names, and display it.
point_table = pd.DataFrame(main_table[8], columns = main_column[7])
point_table

Unnamed: 0,TEAM,M,W,L,PT,NRR
0,GT,14,10,4,20,0.809
1,CSK,14,8,5,17,0.652
2,LSG,14,8,5,17,0.284
3,MI,14,8,6,16,-0.044
4,RR,14,7,7,14,0.148
5,RCB,14,7,7,14,0.135
6,KKR,14,6,8,12,-0.239
7,PBKS,14,6,8,12,-0.304
8,DC,14,5,9,10,-0.808
9,SRH,14,4,10,8,-0.59


The page at the above URL contains several tables; we used table -> thead -> tr -> th to get the column details and table -> tbody -> tr -> td to get the records.

In [19]:
# Preview the first rows of the 1st team's batting table.
batting_1.head()

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR
0,Wriddhiman Saha †,c †Dhoni b Chahar,54,39,63,5,1,138.46
1,Shubman Gill,st †Dhoni b Jadeja,39,20,34,7,0,195.0
2,Sai Sudharsan,lbw b Pathirana,96,47,58,8,6,204.25
3,Hardik Pandya (c),not out,21,12,32,0,2,175.0
4,Rashid Khan,c Gaikwad b Pathirana,0,2,2,0,0,0.0


In [20]:
# Preview the first rows of the bowling table built from body table 3.
bowling_2.head()

Unnamed: 0,BOWLING,O,M,R,W,ECON,0s,4s,6s,WD,NB
0,Mohammed Shami,3,0,29,0,9.66,5,4,0,0,0
1,Hardik Pandya,1,0,14,0,14.0,1,1,1,1,0
2,Rashid Khan,3,0,44,0,14.66,2,4,3,0,0
3,Noor Ahmad,3,0,17,2,5.66,6,0,0,3,0
4,Josh Little,2,0,30,0,15.0,1,0,3,0,0


In [21]:
# Preview the first rows of the points table.
point_table.head()

Unnamed: 0,TEAM,M,W,L,PT,NRR
0,GT,14,10,4,20,0.809
1,CSK,14,8,5,17,0.652
2,LSG,14,8,5,17,0.284
3,MI,14,8,6,16,-0.044
4,RR,14,7,7,14,0.148


There are some conflicts, such as:
- Super Giants and Supergiants are the same team; there are also some 'no result' records.
- The scorecard column has the link to each match, along with the place where the match was played and the match unique_id.

**Note:** In later steps all the conflicts will be transformed.

In [22]:
# Write the bowling table to a CSV file (without the index column) and confirm.
csv_name = "bowling_2_2023_final.csv"
bowling_2.to_csv(csv_name, index=False)
print(csv_name + " downloaded !!!")

bowling_2_2023_final.csv downloaded !!!
