# Importing necessary libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.by import By

# Loading our saved variables

In [4]:
import pickle
def save_variables(Listinput):
    pickle.dump(Listinput, open("cricket_variables.p", "wb"),protocol = -1)
    print("Data Saved")

def load_variables():
    Listoutput = pickle.load(open("cricket_variables.p","rb"))
    return Listoutput

In [40]:
variables = [tests, series_length, countries, series_list, number_of_tests_for_series, series_links, not_abandoned_match_links, abandoned_matches, final_match_list, Series_Wise_Match_List]
save_variables(variables)

Data Saved


In [5]:
tests, series_length, countries, series_list, number_of_tests_for_series, series_links, not_abandoned_match_links, abandoned_matches, final_match_list, Series_Wise_Match_List = load_variables()

# Making sure that we dont cross the limit for number of times we extract data from the site

In [5]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)

# Getting the html code of any page in a text file

Simply using this as a reference point to be able to know what code im working with for a particular site and where to find the specific information I want that Im looking for from that site

In [3]:
# The URL of the webpage you want to fetch
url = "https://www.espncricinfo.com/series/sri-lanka-tour-of-india-1982-83-61636/india-vs-sri-lanka-only-test-63320/full-scorecard"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Get the HTML content
    html_content = response.text

    # Optional: Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # The filename to save the HTML content
    file_name = "webpage_content.txt"

    # Write the formatted HTML into a text file
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(soup.prettify())  # Nicely formatted HTML with indentation

    print(f"HTML content has been saved to '{file_name}'.")

    # Automatically open the text file after saving
    os.startfile(file_name)

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


HTML content has been saved to 'webpage_content.txt'.


# List of series

Here I mean to extract the names and years of all series from the all series page

This block of code is simply built to extract the name and year from the series page

In [3]:
def test_series_names_list():
    # Fetch the webpage content
    url = 'https://www.espncricinfo.com/records/list-of-series-results-335431'
    res = session.get(url)
    
    # Parse the HTML content
    soup = BeautifulSoup(res.content, 'html.parser')
    
    # Find all anchor tags with the specified class and extract the text
    links = soup.find_all('a', class_='ds-inline-flex ds-items-start ds-leading-none')
    
    tests = []
    
    for link in links:
        title_text = link['title']  # Extract the title from the anchor tag
        
        # Extract the href and parse the year using regex
        href = link['href']
        year_match = re.search(r'(\d{4}-\d{2}|\d{4})', href)  # Match a year in the format YYYY or YYYY-YY
        
        if year_match:
            year = year_match.group(0)  # Extract the year from the href
            year_split = year.split('-')
            year1 = year_split[0]
            year1_copy = year1
            year2 = int(year_split[1])
            year1 = int(year1[2:])
            if year1+1 != year2:
                year = year1_copy  
            full_title = f"{title_text} {year}"  # Combine the title and the year
            tests.append(full_title)

    return tests

This block of code is just to print our outcome

In [6]:
tests = test_series_names_list()
series_length = len(tests)
print("Number of test series:",series_length)
print()
print("List of All Test Series:")
for i in tests:
    print(i)

Number of test series: 849

List of All Test Series:
England in Australia Test Series 1876-77
England in Australia Test Match 1878-79
Australia in England Test Match 1880
England in Australia Test Series 1881-82
Australia in England Test Match 1882
The Ashes (England in Australia) 1882-83
England in Australia Test Match 1882-83
The Ashes (Australia in England) 1884
The Ashes (England in Australia) 1884-85
The Ashes (Australia in England) 1886
The Ashes (England in Australia) 1886-87
The Ashes (England in Australia) 1887-88
The Ashes (Australia in England) 1888
England in South Africa Test Series 1888-89
The Ashes (Australia in England) 1890
England in South Africa Test Match 1891-92
The Ashes (England in Australia) 1891-92
The Ashes (Australia in England) 1893
The Ashes (England in Australia) 1894-95
England in South Africa Test Series 1895-96
The Ashes (Australia in England) 1896
The Ashes (England in Australia) 1897-98
England in South Africa Test Series 1898-99
The Ashes (Australia 

As you can see above, the number of series is 841\
We have gotten the same number of rows in the first sheet on excel

Here we are checking whether the series names we have extracted only contain the names of the countries we have listed\
This is done to make sure we arent extracting any domestic matches by mistake, or any other country that we haven't accounted for.

In [7]:
countries = ["England", "Australia", "New Zealand", "South Africa", "India", "Pakistan", "West Indies", 
             "Afghanistan", "Ireland", "Sri Lanka", "Bangladesh", "Zimbabwe", "ICC World XI"]

for i in tests:
    # Check if none of the countries are present in the string 'i'
    if not any(country in i for country in countries):
        print(i)


# Lit of Series with their Win Margins

Now along with the name and period of the series I also want to extract the win margin and the winners of each series\
The next block of code is simply a function for that, along with the output of the list where we have stored it

In [8]:
# Fetch the webpage content
url = 'https://www.espncricinfo.com/records/list-of-series-results-335431'
resp = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(resp.content, 'html.parser')

# Initialize lists to store the extracted data
series_names = []
seasons = []
winners = []
margins = []

# Find all the relevant table rows
rows = soup.find_all('tr', class_='')
rows_left =  soup.find_all('tr', class_='ds-bg-ui-fill-translucent')
for i in rows_left:
    rows.append(i)

for row in rows:
    # Extract series name
    series_name_tag = row.find('a', class_='ds-inline-flex ds-items-start ds-leading-none')
    if series_name_tag:
        series_names.append(series_name_tag['title'])

    # Extract season
    season_tag = row.find_all('td', class_='ds-min-w-max ds-text-right')[0]
    if season_tag:
        seasons.append(season_tag.text.strip())

    # Extract winner
    winner_tag = row.find_all('td', class_='ds-min-w-max ds-text-right')[1]
    if winner_tag:
        winners.append(winner_tag.text.strip())

    # Extract margin
    margin_tag = row.find_all('td', class_='ds-min-w-max ds-text-right')[2]
    if margin_tag:
        margins.append(margin_tag.text.strip())

seasons = seasons[1:]
winners = winners[1:]
margins = margins[1:]

# Print the results
# print("Series Names:", series_names)
# print("Seasons:", seasons)
# print("Winners:", winners)
# print("Margins:", margins)

series_list = []
for i in range(len(seasons)):
    series_list.append([])
    series_list[i].append(series_names[i])
    series_list[i].append(seasons[i])
    series_list[i].append(winners[i])
    series_list[i].append(margins[i])

for i in series_list:
    print(i)

['England in Australia Test Series', '1876/77', 'drawn', '1-1 (2)']
['England in Australia Test Match', '1878/79', 'Australia', '1-0 (1)']
['Australia in England Test Match', '1880', 'England', '1-0 (1)']
['England in Australia Test Series', '1881/82', 'Australia', '2-0 (4)']
['Australia in England Test Match', '1882', 'Australia', '1-0 (1)']
['The Ashes (England in Australia)', '1882/83', 'England', '2-1 (3)']
['England in Australia Test Match', '1882/83', 'Australia', '1-0 (1)']
['The Ashes (Australia in England)', '1884', 'England', '1-0 (3)']
['The Ashes (England in Australia)', '1884/85', 'England', '3-2 (5)']
['The Ashes (Australia in England)', '1886', 'England', '3-0 (3)']
['The Ashes (England in Australia)', '1886/87', 'England', '2-0 (2)']
['The Ashes (England in Australia)', '1887/88', 'England', '1-0 (1)']
['The Ashes (Australia in England)', '1888', 'England', '2-1 (3)']
['England in South Africa Test Series', '1888/89', 'England', '2-0 (2)']
['The Ashes (Australia in Engl

For particular series, like the triangular, quadrangular series, and milestone series like the centenary test where the match got drawn, they didnt write the number of matches in that series, and thus I have manually input the number of matches for those series in the following block

In [9]:
series_list[38][3] = '9'
series_list[206][3] = '1'
series_list[225][3] = '1'
series_list[229][3] = '1'
series_list[288][3] = '1'
series_list[357][3] = '1'
series_list[411][3] = '4'
series_list[463][3] = '3'

Code for simply extracting number of matches in each series into a list, along with the number of series and the total number of matches overall

In [10]:
number_of_tests_for_series = []
for i in series_list:
    margin = i[3]
    if '(' in margin:
        margin = (margin.split('(')[1]).split(')')[0]
    number_of_tests_for_series.append(margin)

Printing the output for number of matches in each series

In [11]:
print(number_of_tests_for_series)
print(len(number_of_tests_for_series))
number_of_tests = 0
for i in number_of_tests_for_series:
    if i!='-':
        number_of_tests += int(i)
print(number_of_tests)

['2', '1', '1', '4', '1', '3', '1', '3', '5', '3', '2', '1', '3', '2', '2', '1', '3', '3', '5', '3', '3', '5', '2', '5', '5', '5', '3', '5', '5', '5', '3', '5', '5', '5', '5', '5', '3', '3', '9', '3', '5', '5', '5', '3', '5', '5', '5', '5', '5', '3', '5', '5', '4', '4', '5', '5', '5', '3', '5', '2', '1', '5', '2', '3', '3', '5', '4', '5', '5', '3', '5', '3', '4', '5', '3', '1', '3', '5', '1', '5', '5', '4', '5', '5', '5', '4', '5', '4', '5', '2', '5', '5', '5', '2', '4', '5', '5', '2', '5', '5', '5', '5', '4', '5', '5', '2', '5', '5', '3', '5', '4', '5', '1', '3', '5', '5', '5', '5', '5', '5', '5', '2', '3', '5', '3', '5', '5', '5', '5', '5', '5', '5', '3', '5', '5', '5', '5', '3', '5', '5', '5', '3', '5', '3', '1', '1', '3', '5', '4', '3', '5', '3', '3', '5', '3', '5', '3', '5', '3', '3', '4', '4', '5', '5', '5', '3', '3', '3', '3', '3', '3', '5', '4', '7', '2', '5', '3', '3', '5', '5', '3', '5', '3', '3', '5', '3', '3', '3', '3', '5', '3', '3', '5', '6', '2', '2', '4', '6', '3', '4',

# List of Series Links
This code is to extract the links of all series from that initial page

In [12]:
def test_series_links_list():
    # Base URL for the website
    base_url = 'https://www.espncricinfo.com'
    
    # Fetch the webpage content
    url = 'https://www.espncricinfo.com/records/list-of-series-results-335431'
    response = requests.get(url)
    
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Initialize a list to store the full URLs
    urls = []
    
    # Find all anchor tags with the specified class and extract the href
    links = soup.find_all('a', class_='ds-inline-flex ds-items-start ds-leading-none')
    
    for link in links:
        href = link['href']  # Extract the href from the anchor tag
        full_url = base_url + href  # Construct the full URL by appending base URL
        if full_url.count("http") == 1:
            urls.append(full_url)  # Append the full URL to the list

    return urls

Printing the output along with the total number of series

In [13]:
series_links = test_series_links_list()
length_series_links = len(series_links)
print("Number of Test Series:",length_series_links)
print("URLs for all Test Series:")
for url in series_links:
    print(url)

Number of Test Series: 849
URLs for all Test Series:
https://www.espncricinfo.com/series/england-in-australia-test-series-1876-77-60260/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/england-in-australia-test-match-1878-79-60261/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/australia-in-england-test-match-1880-60262/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/england-in-australia-test-series-1881-82-60263/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/australia-in-england-test-match-1882-60264/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/the-ashes-1882-83-60265/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/england-in-australia-test-match-1882-83-60266/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/the-ashes-1884-60267/match-schedule-fixtures-and-results
https://www.espncricinfo.com/series/the-ashes-1884-85-

As you can see here too we have 841 series links

# List of match links



Getting the links for all the matches within each series link.\
We first get all the links of matches in that series and store it in a list.\
Then all the lists for all series go into a super list, so we have a list of lists.\
It looks like this:\
[\
[match 1,match2,match 3],\
[match 1, match 2, match 3, match 4],\
[match 1, match 2],\
...]

In [33]:
def scorecard_link_generator(series_url):
    base_url = 'https://www.espncricinfo.com'
    series_response = requests.get(series_url)
    series_soup = BeautifulSoup(series_response.content, 'html.parser')

    # Find all match scorecard links on the series page
    match_list = []
    scorecard_links = series_soup.find_all('a', class_='ds-no-tap-higlight')

    series_identifier = ""
    count = 0
    # Start from the last link and go backward
    for scorecard in reversed(scorecard_links):
        match_href = scorecard['href']  # Extract href for the scorecard
        full_match_url = base_url + match_href  # Construct the full match scorecard URL
        
        if count ==  0:
            series_identifier = match_href.split('/series/')[1][:20]
            match_list.insert(0,full_match_url)
            count += 1
            continue
      
        # Check if the link is relevant to the series
        match_identifier = match_href.split('/series/')[1][:20]
        
        if match_identifier == series_identifier:
            if "live-cricket-score" in full_match_url:
                full_match_url = full_match_url.split("/live-cricket-score")[0]+"/full-scorecard"
            # Insert at the beginning of the list for this series
            match_list.insert(0, full_match_url)

        else:
            # Stop if the link is not relevant to the series
            break

    return match_list

def final_all_test_links(series_links):
    # Now, for each series URL, find all the match scorecard links within the specified <div> classes
    match_links = []  # This will be the 2D list to store scorecard links for all series
    for series_url in series_links:
        match_list = scorecard_link_generator(series_url)
        match_links.append(match_list)

    return match_links

Printing the output as:\
Series 1 Name:\
Match 1 Link\
Match 2 Link...

Series 2 Name:\
Match 1 Link\
Match 2 Link...

Also finding the amount of time it took to print this(trust me it's a lot)

In [34]:
begin = time.time() 

match_links = final_all_test_links(series_links)

# Print or return the 2D list of match scorecard links
print("Match Scorecard Links for All Series:")
print()
for i in range(len(match_links)):
    print("Series Name:", tests[i])
    for link in match_links[i]:
        print(link)
    print()

end = time.time() 

Match Scorecard Links for All Series:

Series Name: England in Australia Test Series 1876-77
https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-1st-test-62396/full-scorecard
https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-2nd-test-62397/full-scorecard

Series Name: England in Australia Test Match 1878-79
https://www.espncricinfo.com/series/england-tour-of-australia-1878-79-61733/australia-vs-england-only-test-62398/full-scorecard

Series Name: Australia in England Test Match 1880
https://www.espncricinfo.com/series/australia-tour-of-england-1880-61336/england-vs-australia-only-test-62399/full-scorecard

Series Name: England in Australia Test Series 1881-82
https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-1st-test-62400/full-scorecard
https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-2nd-test-62401/f

#### I have no idea on god's green earth what this was supposed to do

In [35]:
match_links_final_list = match_links

Printing the total time it took to get all match links

In [36]:
# total time taken  
total_time_taken = end-begin
minutes = str(int(total_time_taken/60))
seconds = str(round(total_time_taken % 60,3))
total_time_taken = minutes+"m "+seconds+"s"
print(f"Total runtime of the program is {total_time_taken}") 

Total runtime of the program is 9m 6.822s


I basically wrote this code to go through our list of matches from first to last and find the maximum number of matches in a test series, and everytime it found a new series that was longer than our previous max, then to make that the new max and print it\
Only done to find how many max matches a series has ever had\
Did this cause when I tried printing all match links in a normal table I just made 5 columns and it gave some stupid error so I wanted to find out why

In [37]:
#print(match_links)
maxlen = 0
series = []
for i in match_links:
    length = len(i)
    if length>maxlen:
        maxlen = length
        series = i
        print(length,series)

print(maxlen)
print(series)

2 ['https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-1st-test-62396/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-2nd-test-62397/full-scorecard']
4 ['https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-1st-test-62400/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-2nd-test-62401/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-3rd-test-62402/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-4th-test-62403/full-scorecard']
5 ['https://www.espncricinfo.com/series/england-tour-of-australia-1884-85-61734/australia-vs-england-1st-test-62412/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1884-85-61734/aust

This is where I tried making that table lol\

In [38]:
# Column names
columns = ['Match 1 Link', 'Match 2 Link', 'Match 3 Link', 'Match 4 Link','Match 5 Link','Match 6 Link','Match 7 Link','Match 8 Link','Match 9 Link']

match_links_df = pd.DataFrame(match_links, columns=columns)
match_links_df.insert(0, "Series Name", tests, True)

# Displaying the DataFrame
display(match_links_df)

Unnamed: 0,Series Name,Match 1 Link,Match 2 Link,Match 3 Link,Match 4 Link,Match 5 Link,Match 6 Link,Match 7 Link,Match 8 Link,Match 9 Link
0,England in Australia Test Series 1876-77,https://www.espncricinfo.com/series/england-to...,https://www.espncricinfo.com/series/england-to...,,,,,,,
1,England in Australia Test Match 1878-79,https://www.espncricinfo.com/series/england-to...,,,,,,,,
2,Australia in England Test Match 1880,https://www.espncricinfo.com/series/australia-...,,,,,,,,
3,England in Australia Test Series 1881-82,https://www.espncricinfo.com/series/england-to...,https://www.espncricinfo.com/series/england-to...,https://www.espncricinfo.com/series/england-to...,https://www.espncricinfo.com/series/england-to...,,,,,
4,Australia in England Test Match 1882,https://www.espncricinfo.com/series/australia-...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
843,Crowe-Thorpe Trophy (England in New Zealand) 2...,https://www.espncricinfo.com/series/england-in...,https://www.espncricinfo.com/series/england-in...,https://www.espncricinfo.com/series/england-in...,,,,,,
844,Border-Gavaskar Trophy (India in Australia) 20...,https://www.espncricinfo.com/series/australia-...,https://www.espncricinfo.com/series/australia-...,https://www.espncricinfo.com/series/australia-...,https://www.espncricinfo.com/series/australia-...,https://www.espncricinfo.com/series/australia-...,,,,
845,Pakistan in South Africa Test Series 2024-25,https://www.espncricinfo.com/series/south-afri...,https://www.espncricinfo.com/series/south-afri...,,,,,,,
846,Afghanistan in Zimbabwe Test Series 2024-25,https://www.espncricinfo.com/series/afghanista...,https://www.espncricinfo.com/series/afghanista...,,,,,,,


Here I tried finding the total number of series and matches in our list and printed it\
Also, I wanted to find series where our number for that series was not matching the number we extracted above when just scraping names and margins

In [39]:
number_of_test_series = len(match_links)
number_of_test_matches = 0

for i in match_links:
    number_of_test_matches+=len(i)

print("Number of Test Series:",number_of_test_series)
print("Number of Test Matches:",number_of_test_matches)

for i in range(838):
    matches_in_series_links = match_links[i]
    length_of_series = len(matches_in_series_links)
    if length_of_series != int(number_of_tests_for_series[i]):
        print(tests[i])
        print(matches_in_series_links)
        print(length_of_series)
        print(number_of_tests_for_series[i])
        print(i)
        print()

Number of Test Series: 848
Number of Test Matches: 2597
The Ashes (Australia in England) 1890
['https://www.espncricinfo.com/series/australia-tour-of-england-1890-61339/england-vs-australia-1st-test-62428/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1890-61339/england-vs-australia-2nd-test-62429/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1890-61339/england-vs-australia-3rd-test-64135/full-scorecard']
3
2
14

The Ashes (Australia in England) 1938
['https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-1st-test-62649/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-2nd-test-62650/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-3rd-test-64136/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-4

# Removing Abandoned Matches

Now that I researched and found out there are some matches extra, turns out some of them were abandoned, so this code simply removes those\
This block simply takes a series url and outputs two lists, a list of played matches and a list of abandoned matches

Just to be clear, a match is abandoned if it says abandoned or cancelled.\
There were also matches that said postponed, but those matches were usually part of entire series that were cancelled so the series never showed up on our initial page.\
eg. https://www.espncricinfo.com/series/australia-in-bangladesh-2020-1219114/bangladesh-vs-australia-2nd-test-1219117/full-scorecard \
Try searching for this match on the series page\
Here's the series page\
https://www.espncricinfo.com/records/list-of-series-results-335431

In [143]:
def removing_abandoned_matches(series_url):
    base_url = 'https://www.espncricinfo.com'
    series_response = requests.get(series_url)
    series_soup = BeautifulSoup(series_response.content, 'html.parser')
    table = series_soup.find_all('div', class_="ds-grow ds-px-4 ds-border-r ds-border-line-default-translucent")
    match_listtt = []
    abandoned_matches = []
    for match in table: 
        abandoned_tag = match.find('span', class_='ds-text-tight-s ds-font-bold ds-uppercase ds-leading-5')
        if abandoned_tag != None:
            abandoned_tag = abandoned_tag.get_text().strip()
        full_match_url = "https://www.espncricinfo.com" + match.find('a', class_="ds-no-tap-higlight")['href']
        if abandoned_tag in ['ABANDONED','CANCELLED','POSTPONED']:
            abandoned_matches.append(full_match_url)
        elif abandoned_tag != None:
            if "live-cricket-score" in full_match_url:
                full_match_url = full_match_url.split("/live-cricket-score")[0] + "/full-scorecard"
            # Insert at the beginning of the list for this series
            match_listtt.insert(0, full_match_url)

    match_listtt.reverse()
    
    # Print or return the match_list to verify the results
    return match_listtt, abandoned_matches

In [59]:
series_url = "https://www.espncricinfo.com/series/the-ashes-1938-60332/match-schedule-fixtures-and-results"
match_listtt,abandoned_matches = removing_abandoned_matches(series_url)
print(len(match_listtt))
print(len(abandoned_matches))
print(match_listtt)
print(abandoned_matches)

4
1
['https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-5th-test-62652/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-4th-test-62651/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-2nd-test-62650/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-1st-test-62649/full-scorecard']
['https://www.espncricinfo.com/series/australia-tour-of-england-1938-61342/england-vs-australia-3rd-test-64136/full-scorecard']


This block takes all series links and runs the above function on every series link, and stores in two lists again:played and abandoned\
Except this time instead of only for that particular series we have a list for all matches ever, segregated into played and abandoned

In [144]:
def final_all_test_without_abandoned_links(series_linkss):
    # Now, for each series URL, find all the match scorecard links within the specified <div> classes
    match_links = []  # This will be the 2D list to store scorecard links for all series
    abandoned_match_list = []
    for series_url in series_linkss:
        match_list, abandoned_matches = removing_abandoned_matches(series_url)
        match_links.append(match_list)
        for match in abandoned_matches:
            abandoned_match_list.append(match)
    return abandoned_match_list, match_links

This block is to just run the abandoned wala code, and also to print the played matches, list of abandoned matches and the number of abandoned matches in that order

In [145]:
begin = time.time() 

abandoned_matches, not_abandoned_match_links = final_all_test_without_abandoned_links(series_links)

# Print or return the 2D list of match scorecard links
print("Match Scorecard Links for All Series:")
print()
for i in range(len(not_abandoned_match_links)):
    print("Series Name:", tests[i])
    for link in not_abandoned_match_links[i]:
        print(link)
    print()
print("List of abandoned matches:", abandoned_matches)
print("Number of abandoned matches =",len(abandoned_matches))

end = time.time() 

Match Scorecard Links for All Series:

Series Name: England in Australia Test Series 1876-77
https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-1st-test-62396/full-scorecard
https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-2nd-test-62397/full-scorecard

Series Name: England in Australia Test Match 1878-79
https://www.espncricinfo.com/series/england-tour-of-australia-1878-79-61733/australia-vs-england-only-test-62398/full-scorecard

Series Name: Australia in England Test Match 1880
https://www.espncricinfo.com/series/australia-tour-of-england-1880-61336/england-vs-australia-only-test-62399/full-scorecard

Series Name: England in Australia Test Series 1881-82
https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-1st-test-62400/full-scorecard
https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-2nd-test-62401/f

This is just for the time taken to run the above code

In [146]:
# total time taken 
total_time_taken = end-begin
minutes = str(int(total_time_taken/60))
seconds = str(round(total_time_taken % 60,3))
total_time_taken = minutes+"m "+seconds+"s"
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")  
print(f"Total runtime of the program is {total_time_taken}") 

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Total runtime of the program is 10m 34.004s


This code prints the number of played matches, the number of abandoned matches, adds them up and prints the total, and compares with the total we had before removing the abandoned matches

In [147]:
non_abandoned_matches = 0
for i in not_abandoned_match_links:
    non_abandoned_matches += len(i)
abandoned_matches_number = len(abandoned_matches)
print(non_abandoned_matches)
print(abandoned_matches_number)
print(non_abandoned_matches+abandoned_matches_number)
#print(number_of_test_matches)

2587
15
2602


Just to find duplicates in our non abandoned matches and print duplicates and non duplicates one after the other\
And then the number of matches we finally have(is block me thoda neeche scroll karna)

In [148]:
Series_Wise_Match_List = []
duplist = []
for i in not_abandoned_match_links:
    for j in i:
        if j not in Series_Wise_Match_List and "live-cricket-score" not in j:
            Series_Wise_Match_List.append(j)
        else:
            duplist.append((j,i))

for i in duplist:
    print(i[0])
    print(i[1])
    print()
print("-----------------------------------------")
for i in Series_Wise_Match_List:
    print(i)
    print()
print(len(Series_Wise_Match_List))

https://www.espncricinfo.com/series/triangular-tournament-1912-60296/australia-vs-south-africa-62387/full-scorecard
['https://www.espncricinfo.com/series/triangular-tournament-1912-60296/australia-vs-south-africa-62387/full-scorecard', 'https://www.espncricinfo.com/series/triangular-tournament-1912-60296/england-vs-south-africa-62388/full-scorecard', 'https://www.espncricinfo.com/series/triangular-tournament-1912-60296/england-vs-australia-62389/full-scorecard', 'https://www.espncricinfo.com/series/triangular-tournament-1912-60296/england-vs-south-africa-62390/full-scorecard', 'https://www.espncricinfo.com/series/triangular-tournament-1912-60296/australia-vs-south-africa-62391/full-scorecard', 'https://www.espncricinfo.com/series/triangular-tournament-1912-60296/england-vs-australia-62392/full-scorecard', 'https://www.espncricinfo.com/series/triangular-tournament-1912-60296/australia-vs-south-africa-62393/full-scorecard', 'https://www.espncricinfo.com/series/triangular-tournament-1912-

In [79]:
year_list = {}
for year in range(1877,2025):
    year_list[year] = []
    year1 = str(year)
    year2 = (int(year1)+1)%100
    year2 = str(year2)
    if int(year2)<10:
        year2 = "0"+year2
    year1 = "-"+year1
    full_year = year1+"-"+year2
    for match in Series_Wise_Match_List:
        if year1 in match:
            year_list[year].append(match)
updated_year_list = []

for key in year_list.keys():
    updated_year_list.append(year_list[key])
c = 0
years = list(year_list.keys())
for year in updated_year_list:
    print(years[c])
    c += 1
    print(len(year))
    print(year)
    print()
# for i in range(len(updated_year_list)):
#     if len(year)!=

1877
0
[]

1878
1
['https://www.espncricinfo.com/series/england-tour-of-australia-1878-79-61733/australia-vs-england-only-test-62398/full-scorecard']

1879
0
[]

1880
1
['https://www.espncricinfo.com/series/australia-tour-of-england-1880-61336/england-vs-australia-only-test-62399/full-scorecard']

1881
4
['https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-4th-test-62403/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-3rd-test-62402/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-2nd-test-62401/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-1st-test-62400/full-scorecard']

1882
5
['https://www.espncricinfo.com/series/australia-tour-of-england-1882-61352/england-vs-australia-only-test-62404/full-scorecard', 'https://www.espncricinfo.com/series

Finding the number of matches each country has played after removing abandoned and duplicates

In [149]:
country = ['afghanistan', 'australia', 'bangladesh', 'england', 'icc-world-xi', 'india', 
           'ireland', 'new-zealand', 'pakistan', 'south-africa', 'sri-lanka', 'west-indies', 'zimbabwe']
country_wise = []
for x in country:
    country_wise.append([])
for i in Series_Wise_Match_List:
    for j in range(len(country)):
        if country[j] in i.split('/')[-2]:
            country_wise[j].append(i)

In [150]:
total_count = 0
for i in range(len(country)):
    print(country[i]+": "+str(len(country_wise[i]))+" matches")
    total_count += len(country_wise[i])

print("Total:",total_count/2)

afghanistan: 11 matches
australia: 871 matches
bangladesh: 150 matches
england: 1083 matches
icc-world-xi: 1 matches
india: 588 matches
ireland: 9 matches
new-zealand: 478 matches
pakistan: 465 matches
south-africa: 472 matches
sri-lanka: 324 matches
west-indies: 584 matches
zimbabwe: 120 matches
Total: 2578.0


In [82]:
match = "https://www.espncricinfo.com/series/pakistan-tour-of-australia-1995-96-61485/australia-vs-pakistan-2nd-test-63698/full-scorecard"
c = 0
for i in Series_Wise_Match_List:
    c +=1
    if i == match:
        print(c,i)

1312 https://www.espncricinfo.com/series/pakistan-tour-of-australia-1995-96-61485/australia-vs-pakistan-2nd-test-63698/full-scorecard


In [83]:
for i in country_wise[8]:
    print(i)

https://www.espncricinfo.com/series/pakistan-tour-of-india-1952-53-61529/india-vs-pakistan-5th-test-62745/full-scorecard
https://www.espncricinfo.com/series/pakistan-tour-of-india-1952-53-61529/india-vs-pakistan-4th-test-62744/full-scorecard
https://www.espncricinfo.com/series/pakistan-tour-of-india-1952-53-61529/india-vs-pakistan-3rd-test-62743/full-scorecard
https://www.espncricinfo.com/series/pakistan-tour-of-india-1952-53-61529/india-vs-pakistan-2nd-test-62742/full-scorecard
https://www.espncricinfo.com/series/pakistan-tour-of-india-1952-53-61529/india-vs-pakistan-1st-test-62741/full-scorecard
https://www.espncricinfo.com/series/pakistan-tour-of-england-1954-61456/england-vs-pakistan-4th-test-62776/full-scorecard
https://www.espncricinfo.com/series/pakistan-tour-of-england-1954-61456/england-vs-pakistan-3rd-test-62775/full-scorecard
https://www.espncricinfo.com/series/pakistan-tour-of-england-1954-61456/england-vs-pakistan-2nd-test-62774/full-scorecard
https://www.espncricinfo.com/

Printing a list of all matches for each country

In [151]:
for i in range(len(country_wise)):
    print()
    print(country[i])
    for match in country_wise[i]:
        print(match)


afghanistan
https://www.espncricinfo.com/series/afg-in-india-2018-1133249/india-vs-afghanistan-only-test-1133983/full-scorecard
https://www.espncricinfo.com/series/afghanistan-v-ireland-2018-19-1168107/afghanistan-vs-ireland-only-test-1168120/full-scorecard
https://www.espncricinfo.com/series/afg-in-bangladesh-2019-1197133/bangladesh-vs-afghanistan-only-test-1197138/full-scorecard
https://www.espncricinfo.com/series/west-indies-in-india-2019-20-1186986/afghanistan-vs-west-indies-only-test-1193500/full-scorecard
https://www.espncricinfo.com/series/afghanistan-v-zimbabwe-2020-21-1252053/afghanistan-vs-zimbabwe-1st-test-1252056/full-scorecard
https://www.espncricinfo.com/series/afghanistan-v-zimbabwe-2020-21-1252053/afghanistan-vs-zimbabwe-2nd-test-1252057/full-scorecard
https://www.espncricinfo.com/series/afghanistan-in-bangladesh-2023-1377001/bangladesh-vs-afghanistan-only-test-1377011/full-scorecard
https://www.espncricinfo.com/series/sri-lanka-vs-afghanistan-2023-24-1416068/sri-lanka

In [85]:
numbers = []
for match in country_wise[2]:
    match_number = ((match.split('/')[-2]).split('-test')[0]).split('-')[-1]
    numbers.append(match_number)

for i in range(1,len(numbers)):
    if (numbers[i] == "6th" and numbers[i-1] == "5th") or (numbers[i] == "5th" and numbers[i-1] == "4th") or (numbers[i] == "4th" and numbers[i-1] == "3rd") or (numbers[i] == "3rd" and numbers[i-1] == "2nd") or (numbers[i] == "2nd" and numbers[i-1] == "1st") or (numbers[i] == "only") or (numbers[i] == "1st"):
        continue
    else:
        print(i,numbers[i-1],numbers[i])

1 only 2nd
9 1st 63947
10 63947 63946
11 63946 2nd
21 1st 3rd
22 3rd 2nd
46 1st 3rd
47 3rd 2nd
64 only 2nd
69 only 2nd
85 1st 3rd
86 3rd 2nd
91 only 2nd
98 only 2nd
115 only 2nd
119 only 2nd
124 only 2nd
138 only 2nd


In [86]:
print(country_wise[2][9])
print(country_wise[2][10])

https://www.espncricinfo.com/series/asian-test-championship-2001-02-60711/sri-lanka-vs-bangladesh-2nd-match-63947/full-scorecard
https://www.espncricinfo.com/series/asian-test-championship-2001-02-60711/pakistan-vs-bangladesh-1st-match-63946/full-scorecard


Getting a list of all the match and series ids, and trying to see if there are any duplicate match ids\
Found none

In [87]:
def ids(my_listt):
    Match_ID_list = []
    for i in range(len(my_listt)):
        match_id = my_listt[i].split("/")[-2].split('-')[-1]
        series_id = my_listt[i].split("/")[-3].split('-')[-1]
        Match_ID_list.append([])
        Match_ID_list[i].append(my_listt[i])
        Match_ID_list[i].append(match_id)
        Match_ID_list[i].append(series_id)
    return Match_ID_list
Match_ID_list = ids(country_wise[2])
for i in Match_ID_list:
    print()
    print(i[0])
    print(i[1])
    print(i[2])


https://www.espncricinfo.com/series/india-tour-of-bangladesh-2000-01-62353/bangladesh-vs-india-only-test-63898/full-scorecard
63898
62353

https://www.espncricinfo.com/series/bangladesh-tour-of-zimbabwe-2000-01-62182/zimbabwe-vs-bangladesh-2nd-test-63931/full-scorecard
63931
62182

https://www.espncricinfo.com/series/bangladesh-tour-of-zimbabwe-2000-01-62182/zimbabwe-vs-bangladesh-1st-test-63930/full-scorecard
63930
62182

https://www.espncricinfo.com/series/zimbabwe-tour-of-bangladesh-2001-02-62023/bangladesh-vs-zimbabwe-2nd-test-63957/full-scorecard
63957
62023

https://www.espncricinfo.com/series/zimbabwe-tour-of-bangladesh-2001-02-62023/bangladesh-vs-zimbabwe-1st-test-63956/full-scorecard
63956
62023

https://www.espncricinfo.com/series/bangladesh-tour-of-new-zealand-2001-02-62183/new-zealand-vs-bangladesh-2nd-test-63968/full-scorecard
63968
62183

https://www.espncricinfo.com/series/bangladesh-tour-of-new-zealand-2001-02-62183/new-zealand-vs-bangladesh-1st-test-63967/full-scoreca

Printing duplicate ids(as (match link, match id) for that match) and original ids(as (match link, match id) for that match) in that order

In [88]:
def finding_duplicates(Match_ID_copy):
    Match_ID_copy 
    match_ids = []
    id_duplicated = []
    for i in Match_ID_list:
        if i[1] in match_ids:
            id_duplicated.append((i[0],i[1]))
        else:
            match_ids.append((i[0],i[1]))
    
    for i in id_duplicated:
        print(i)
    
    print("------")
    for i in match_ids:
        print(i)
finding_duplicates(Match_ID_list)

------
('https://www.espncricinfo.com/series/india-tour-of-bangladesh-2000-01-62353/bangladesh-vs-india-only-test-63898/full-scorecard', '63898')
('https://www.espncricinfo.com/series/bangladesh-tour-of-zimbabwe-2000-01-62182/zimbabwe-vs-bangladesh-2nd-test-63931/full-scorecard', '63931')
('https://www.espncricinfo.com/series/bangladesh-tour-of-zimbabwe-2000-01-62182/zimbabwe-vs-bangladesh-1st-test-63930/full-scorecard', '63930')
('https://www.espncricinfo.com/series/zimbabwe-tour-of-bangladesh-2001-02-62023/bangladesh-vs-zimbabwe-2nd-test-63957/full-scorecard', '63957')
('https://www.espncricinfo.com/series/zimbabwe-tour-of-bangladesh-2001-02-62023/bangladesh-vs-zimbabwe-1st-test-63956/full-scorecard', '63956')
('https://www.espncricinfo.com/series/bangladesh-tour-of-new-zealand-2001-02-62183/new-zealand-vs-bangladesh-2nd-test-63968/full-scorecard', '63968')
('https://www.espncricinfo.com/series/bangladesh-tour-of-new-zealand-2001-02-62183/new-zealand-vs-bangladesh-1st-test-63967/full

Sorting the list of match ids from lowest to highest to see if there was any pattern, pattern stopped existing after a certain point

In [89]:
def sorted_match_id(Match_id):
    match_id_number = []
    for match in Match_id:
        match_id_number.append(int(match[1]))
    match_id_number.sort()
    print(match_id_number)
    print(len(match_id_number))

sorted_match_id(Match_ID_list)

[63898, 63930, 63931, 63946, 63947, 63956, 63957, 63967, 63968, 63972, 63973, 63995, 63996, 64007, 64008, 64018, 64019, 64028, 64029, 64036, 64037, 64043, 64044, 64045, 64052, 64053, 64069, 64070, 64089, 64090, 64103, 64104, 64111, 64112, 64121, 64122, 209929, 210366, 218813, 219612, 238167, 238168, 238171, 238172, 282691, 282692, 293478, 293479, 293480, 300429, 300430, 323947, 323948, 350345, 350346, 361758, 361759, 378750, 378751, 401071, 401072, 423786, 426401, 426402, 426423, 426424, 434256, 434257, 522245, 531986, 531987, 538072, 538073, 587469, 587470, 602472, 602473, 623576, 623577, 668949, 668951, 690347, 690349, 730295, 730297, 760781, 760783, 760785, 817213, 817215, 858493, 858495, 870729, 1019985, 1019987, 1029825, 1029827, 1041761, 1075502, 1075503, 1083444, 1083445, 1104284, 1104285, 1130744, 1130745, 1146717, 1146718, 1153311, 1153312, 1153639, 1153640, 1153847, 1153848, 1187016, 1187017, 1197138, 1213062, 1214666, 1244025, 1244026, 1255828, 1255829, 1267676, 1277100, 127

I thought the ids are all in a sequence of numbers, so I tried finding if any number was missing in that sequence.\
Also print what the fuck if the id was found more than once

In [None]:
for i in range(62396,64133):
    c = 0 
    for match in Match_ID_List:
        if str(i) in match:
            c += 1
    if c == 0:
        print("Unsuccessful:", i)
    elif c == 1:
        print("Successful:",i)
    else:
        print("What the fuck",i)

NameError: name 'match_ids' is not defined

In the numbers that the pattern did exist, I found that the numbers 63581 and 63582 were missing/showed some error, so I tried finding it in the original no abandon filter list

In [91]:
# @hidden_cell.
for i in match_links:
    for j in i:
        if "63851" in j or "63852" in j:
            print(j)
            print(i)
            print()

# Getting the scorecard for that match

This is code to get the scorecard for each test match, dont need to get into this

## Scorecard functions

### Scorecard function for a match given Match URL

In [42]:
def scorecard(url):
    response = requests.get(url)
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Initialize lists to store data for the match
    innings_list = []
    batsmen_list = pd.DataFrame()
    match_extras = pd.DataFrame()
    match_score = pd.DataFrame()
    did_not_bat = pd.DataFrame()
    fall_of_wickets = pd.DataFrame()  
    bowlers_info = pd.DataFrame()
    
    # Find all divs with class "ds-rounded-lg ds-mt-2" (which contain innings tables)
    innings_tables = soup.find_all('div', class_='ds-rounded-lg ds-mt-2')

    # Loop through each innings table div
    for innings_table in innings_tables:

        innings_number = innings_tables.index(innings_table) + 1

        # Find the div that contains the team name and innings number
        team_innings_div = innings_table.find('div', class_='ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover')
        batting_innings = team_innings_div.text.strip().replace('\xa0',' ')
            
        innings_list.append(batting_innings)
        
        batsmen_table = innings_table.find('table', class_='ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table')
    
        # Batsmen details extraction
        batting_info = batsmen_table.find_all('tr', class_='')
        batsmen_stats = batting_info[:len(batting_info)-2]

        for batter in batsmen_stats:
            batsman_stats = batter.find_all('td')
            name = batsman_stats[0].text.strip()
            dismissal = batsman_stats[1].text.strip()
            runs = batsman_stats[2].text.strip()
            balls = batsman_stats[3].text.strip()
            minutes = batsman_stats[4].text.strip()
            fours = batsman_stats[5].text.strip()
            sixes = batsman_stats[6].text.strip()
            strike_rate = batsman_stats[7].text.strip()
                    
            # Append the details to the list
            batsmen_stat = {'Innings Number': innings_number,
                'Innings Name': batting_innings,
                'Batsman': name,
                'Dismissal': dismissal,
                'Runs': runs,
                'Balls': balls,
                'Minutes': minutes,
                '4s': fours,
                '6s': sixes,
                'Strike Rate': strike_rate
            }
            batsmen_list = batsmen_list._append(batsmen_stat, ignore_index = True)
        
        #(b 8, lb 3, nb 10, w 5, pen 5)
        # # Extras extraction
        extras_row = batsmen_table.find('tr', class_='ds-text-tight-s')
        if extras_row:
            extras_breakdown = extras_row.find('td', class_='ds-min-w-max !ds-pl-[100px]').text.strip()
            extras_breakdown = extras_breakdown.split('(')[1].split(')')[0].split(', ')
            Byes = 0
            Leg_Byes = 0
            No_Balls = 0
            Wides = 0
            Penalty = 0
            for item in extras_breakdown:
                extra, value = item.split()  # Split into type and value
                value = int(value)  # Convert value to integer
                
                if extra == 'b':
                    Byes = value
                elif extra == 'lb':
                    Leg_Byes = value
                elif extra == 'nb':
                    No_Balls = value
                elif extra == 'w':
                    Wides = value
                elif extra == 'pen':
                    Penalty = value

            extras_total = extras_row.find('td', class_='ds-min-w-max ds-text-right').strong.text.strip()
            
            innings_extras = {'Innings Number': innings_number,
                'Innings Name': batting_innings,
                'Byes': Byes,
                'Leg Byes': Leg_Byes,
                'No Balls': No_Balls,
                'Wides': Wides,
                'Penalty': Penalty,
                'Total': extras_total
            }
            # Add to match extras list
            match_extras = match_extras._append(innings_extras, ignore_index = True)
        
        # Extract the total score, overs, and run rate
        total_row = batting_info[-2]  # The last 'tr' containing score info
        total_datapoints = total_row.find_all('td')
        if total_datapoints[0].text.strip() == "Total":
            score = total_datapoints[2].text.strip()
            other_info = total_datapoints[1].text.strip()
            overs = other_info.split(' Ov')[0]
            remaining = other_info.split('RR: ')[1].split(')')[0]
            if "Mins" not in remaining:
                run_rate = remaining
                mins = "-"
            else:
                run_rate = remaining.split(', ')[0]
                mins = remaining.split(', ')[1].split(" Mins")[0]                   
            # Add score, overs, and run rate to the match score list
            innings_score = {'Innings Number': innings_number,
                'Innings Name': batting_innings,
                'Score': score,
                'Overs': overs,
                'Run Rate': run_rate,
                "Minutes" : mins
            }
            match_score = match_score._append(innings_score, ignore_index = True)

        # Extract "Did Not Bat" information
        did_not_bat_row = batsmen_table.find('tr', class_='!ds-border-b-0')
        if did_not_bat_row:
            batsmen = did_not_bat_row.find_all('div', class_='ds-popper-wrapper ds-inline')
            for batsman in batsmen:
                batsman_name = (batsman.get_text(strip=True)).replace(",", "")
                did_not_bat = did_not_bat._append({'Innings Number': innings_number, 'Innings Name': batting_innings, 'Batsman':batsman_name}, ignore_index = True)

        # Extract Fall of Wickets information
        fall_of_wickets_row = batting_info[-1]
        wicket_spans = fall_of_wickets_row.find_all('span')
                
        for span in wicket_spans:
            wicket_data = span.get_text(strip=True)
            if '-' in wicket_data:
                # Extract wicket number and runs
                parts = wicket_data.split('-')
                wicket = parts[0].strip()  # Wicket number
                runs = parts[1].split('(')[0].strip()  # Runs at fall of wicket
                batsman = parts[1].split('(')[1].split(')')[0].strip()
                if " ov" in batsman:
                    final_batsman = batsman.split(", ")[0]
                    overs = batsman.split(", ")[1].split(' ov')[0]
                    fall_of_wickets = fall_of_wickets._append({'Innings Number': innings_number, 'Innings Name': batting_innings, 'Wicket': wicket, 'Runs' : runs, "Batsman" : final_batsman, "Overs" : overs}, ignore_index = True)          
                elif "retired not out" in batsman:
                    final_batsman = batsman.split(", ")[0]
                    overs = batsman.split(", ")[1].split(' ov')[0]
                    fall_of_wickets = fall_of_wickets._append({'Innings Number': innings_number, 'Innings Name': batting_innings, 'Wicket': wicket, 'Runs' : runs, "Batsman" : final_batsman, "Overs" : "-"}, ignore_index = True)
                else:
                    fall_of_wickets = fall_of_wickets._append({'Innings Number': innings_number, 'Innings Name': batting_innings, 'Wicket': wicket, 'Runs' : runs, "Batsman" : batsman, "Overs" : "-"}, ignore_index = True)

        #Extracting Bowling Info
        bowling_table = innings_table.find('table', class_='ds-w-full ds-table ds-table-md ds-table-auto')

        # Check if the bowling table is present
        if bowling_table:
            headers = []
            head_data = bowling_table.find_all('th')
            for bowler_header in head_data:
                headers.append(bowler_header.text.strip())
            headers = list(map(lambda x: "Bowler" if x == "Bowling" else x, headers))
            headers = list(map(lambda x: "Overs" if x == "O" else x, headers))
            headers = list(map(lambda x: "Maidens" if x == "M" else x, headers))
            headers = list(map(lambda x: "Runs" if x == "R" else x, headers))
            headers = list(map(lambda x: "Wickets" if x == "W" else x, headers))
            headers = list(map(lambda x: "Economy" if x == "ECON" else x, headers))
            headers = list(map(lambda x: "Wides" if x == "WD" else x, headers))
            headers = list(map(lambda x: "No Balls" if x == "NB" else x, headers))
            
            bowler_rows = bowling_table.find('tbody').find_all('tr',class_='')

            for row in bowler_rows:
                bowler_data = {}
                bowler_data['Innings Number'] = innings_number
                bowler_data['Innings Name'] = batting_innings
                bowler_row_data = row.find_all('td')

                for i in range(len(headers)):
                    bowler_data[headers[i]] = bowler_row_data[i].text.strip()        

                # Append the bowler's data to the list
                bowlers_info = bowlers_info._append(bowler_data, ignore_index = True)
    
    return innings_list, batsmen_list, match_extras, match_score, did_not_bat, fall_of_wickets, bowlers_info

### Printing the score

In [33]:
def printing_scorecard(innings_list, batsmen_list, match_extras, match_score, did_not_bat, fall_of_wickets, bowlers_info):
    for innings in innings_list:
        print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")  
        print(innings + ":")
        print()
    
        print("Batsmen:")
        display(batsmen_list[batsmen_list['Innings Name'] == innings].drop(columns=['Innings Number', 'Innings Name']).style.hide(axis="index").set_table_styles([
            {'selector': 'th', 'props': [('text-align', 'center')]},  # Center-align headers
            {'selector': 'td', 'props': [('text-align', 'center')]}   # Center-align data
        ]))
        
        if match_extras.empty == False:
            if match_extras[match_extras['Innings Name'] == innings].empty == False:
                print("Extras:")
                display(match_extras[match_extras['Innings Name'] == innings].drop(columns=['Innings Number', 'Innings Name']).style.hide(axis="index").set_table_styles([
                    {'selector': 'th', 'props': [('text-align', 'center')]},  # Center-align headers
                    {'selector': 'td', 'props': [('text-align', 'center')]}   # Center-align data
                ]))
        
        # Print Score Information
        print("Score:")
        display(match_score[match_score['Innings Name'] == innings].drop(columns=['Innings Number', 'Innings Name']).style.hide(axis="index").set_table_styles([
            {'selector': 'th', 'props': [('text-align', 'center')]},  # Center-align headers
            {'selector': 'td', 'props': [('text-align', 'center')]}   # Center-align data
        ]))
    
        # Print "Did Not Bat" information
        if did_not_bat.empty == False:
            if did_not_bat[did_not_bat['Innings Name'] == innings].empty == False:
                print("Did Not Bat:")
                display(did_not_bat[did_not_bat['Innings Name'] == innings].drop(columns=['Innings Number', 'Innings Name']).style.hide(axis="index").set_table_styles([
                {'selector': 'th', 'props': [('text-align', 'center')]},  # Center-align headers
                {'selector': 'td', 'props': [('text-align', 'center')]}   # Center-align data
            ]))
        
        if fall_of_wickets[fall_of_wickets['Innings Name'] == innings].empty == False:
            print("Fall of Wickets:")
            display(fall_of_wickets[fall_of_wickets['Innings Name'] == innings].drop(columns=['Innings Number', 'Innings Name']).style.hide(axis="index").set_table_styles([
            {'selector': 'th', 'props': [('text-align', 'center')]},  # Center-align headers
            {'selector': 'td', 'props': [('text-align', 'center')]}   # Center-align data
        ]))
        
        print("Bowlers:")
        display(bowlers_info[bowlers_info['Innings Name'] == innings].drop(columns=['Innings Number', 'Innings Name']).style.hide(axis="index").set_table_styles([
            {'selector': 'th', 'props': [('text-align', 'center')]},  # Center-align headers
            {'selector': 'td', 'props': [('text-align', 'center')]}   # Center-align data
        ]))


## Extracting DRS information

In [9]:
# Initialize WebDriver
driver = webdriver.Chrome()

# Navigate to the page
driver.get("https://www.espncricinfo.com/series/australia-in-sri-lanka-2024-25-1459900/sri-lanka-vs-australia-1st-test-1459906/full-scorecard")

# Click on the popup trigger
popup_trigger = driver.find_elements(By.XPATH, "//span[contains(@class, 'ds-inline-flex ds-items-center ds-cursor-pointer ds-border-b') and contains(., 'DRS')]")
drs_1 = popup_trigger[0]
drs_1.click()

# Wait for new content to load
time.sleep(3)  # Adjust as needed

# Get the full updated page HTML
page_html = driver.page_source

# Use BeautifulSoup to prettify the HTML
soup = BeautifulSoup(page_html, "html.parser")

drs_table = soup.find('table',class_="ds-w-full ds-table ds-table-sm ds-table-auto ds-overflow-scroll")

drs_head = drs_table.find('thead',class_="ds-bg-fill-content-alternate ds-text-left")

drs_headers = drs_head.find_all('th',class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max")

Header = []
for header in drs_headers:
    Header.append(header.get_text())

drs_body = drs_table.find("tbody",class_="")

drs_data = drs_body.find_all("tr",class_="")

DRS = pd.DataFrame(columns = Header)
for decision in drs_data:
    player = {}
    drs_decision = decision.find_all("td",class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max")
    final_decision = decision.find("td",class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-font-bold ds-text-alert-error")
    if final_decision is None:
        final_decision = decision.find("td",class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-font-bold ds-text-alert-typo-success")
    drs_decision.append(final_decision)
    
    for i in range(len(Header)):
        info = drs_decision[i]
        player[Header[i]] = info.get_text()

    player = pd.DataFrame([player])
    DRS = pd.concat([DRS,player],ignore_index = True)

display(DRS.style.hide(axis = "index"))

driver.quit()

Inns,Over,Decision,Review by,Batter,Bowler,Umpire,Original → DRS,Review result
AUS 1st,7.6,Wicket,Sri Lanka (Bowling),TM Head,KN Peiris,AT Holdstock,Not Out → Not Out,Unsuccessful
AUS 1st,19.6,Wicket,Sri Lanka (Bowling),M Labuschagne,KN Peiris,AT Holdstock,Not Out → Not Out,Unsuccessful (Umpire's call)
AUS 1st,99.5,Wicket,Sri Lanka (Bowling),SPD Smith,JDF Vandersay,AT Holdstock,Not Out → Out,Successful
AUS 1st,119.2,Wicket,Australia (Batting),JP Inglis,KN Peiris,AT Holdstock,Out → Not Out,Successful


### Actual DRS Code

In [25]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [28]:
# Initialize WebDriver
driver = webdriver.Chrome()

count = 0

# Navigate to the page
driver.get(url)

# Click on the popup trigger
popup = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-inline-flex ds-items-center ds-cursor-pointer ds-border-b') and contains(., 'DRS')]")
popup.click()

# Wait for new content to load
time.sleep(3)  # Adjust as needed

# Wait for the dropdown to be clickable
dropdown_button = WebDriverWait(driver, 3).until(
    EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '1st Innings')]//ancestor::div[contains(@class, 'ds-cursor-pointer')]"))
)
dropdown_button.click()

time.sleep(3)  # Adjust as needed

all_innings = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-grow') and contains(., 'All Innings')]")
all_innings.click()

#icon-expand_more-outlined ds-text-icon ds-ml-2
# Get the full updated page HTML
page_html = driver.page_source

# Use BeautifulSoup to prettify the HTML
soup = BeautifulSoup(page_html, "html.parser")

drs_table = soup.find('table',class_="ds-w-full ds-table ds-table-sm ds-table-auto ds-overflow-scroll")

drs_head = drs_table.find('thead',class_="ds-bg-fill-content-alternate ds-text-left")

drs_headers = drs_head.find_all('th',class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max")

Header = []
for header in drs_headers:
    Header.append(header.get_text())

drs_body = drs_table.find("tbody",class_="")

drs_data = drs_body.find_all("tr",class_="")

DRS = pd.DataFrame(columns = Header)
for decision in drs_data:
    player = {}
    drs_decision = decision.find_all("td",class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max")
    final_decision = decision.find("td",class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-font-bold ds-text-alert-error")
    if final_decision is None:
        final_decision = decision.find("td",class_="ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-font-bold ds-text-alert-typo-success")
    drs_decision.append(final_decision)
    
    for i in range(len(Header)):
        info = drs_decision[i]
        player[Header[i]] = info.get_text()

    player = pd.DataFrame([player])
    DRS = pd.concat([DRS,player],ignore_index = True)

display(DRS.style.hide(axis = "index"))

driver.quit()



Inns,Over,Decision,Review by,Batter,Bowler,Umpire,Original → DRS,Review result
AUS 1st,7.6,Wicket,Sri Lanka (Bowling),TM Head,KN Peiris,AT Holdstock,Not Out → Not Out,Unsuccessful
AUS 1st,19.6,Wicket,Sri Lanka (Bowling),M Labuschagne,KN Peiris,AT Holdstock,Not Out → Not Out,Unsuccessful (Umpire's call)
AUS 1st,99.5,Wicket,Sri Lanka (Bowling),SPD Smith,JDF Vandersay,AT Holdstock,Not Out → Out,Successful
AUS 1st,119.2,Wicket,Australia (Batting),JP Inglis,KN Peiris,AT Holdstock,Out → Not Out,Successful
SL 1st,1.6,Wicket,Sri Lanka (Batting),BOP Fernando,MP Kuhnemann,AT Holdstock,Out → Out,Unsuccessful
SL 1st,8.1,Wicket,Australia (Bowling),LD Chandimal,MP Kuhnemann,CB Gaffaney,Not Out → Not Out,Unsuccessful


<table class="ds-w-full ds-table ds-table-sm ds-table-auto ds-overflow-scroll">
        <thead class="ds-bg-fill-content-alternate ds-text-left">
         <tr class="">
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Inns
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Over
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Decision
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Review by
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Batter
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Bowler
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Umpire
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Original → DRS
          </th>
          <th class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Review result
          </th>
         </tr>
        </thead>
        <tbody class="">
         <tr class="">
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           AUS 1st
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           7.6
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Wicket
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Sri Lanka (Bowling)
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           TM Head
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           KN Peiris
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           AT Holdstock
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Not Out → Not Out
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-font-bold ds-text-alert-error">
           Unsuccessful
           <span class="ds-text-tight-s ds-font-regular ds-text-typo">
           </span>
          </td>
         </tr>
         <tr class="">
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           AUS 1st
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           19.6
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Wicket
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Sri Lanka (Bowling)
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           M Labuschagne
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           KN Peiris
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           AT Holdstock
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max">
           Not Out → Not Out
          </td>
          <td class="ds-w-0 ds-whitespace-nowrap ds-min-w-max ds-font-bold ds-text-alert-error">
           Unsuccessful
           <span class="ds-text-tight-s ds-font-regular ds-text-typo">
            (Umpire's call)
           </span>
          </td>
         </tr>
        </tbody>
       </table>


## Final Output

In [43]:
#match_url = input("Enter the match url: ")
#match_url = 'https://www.espncricinfo.com/series/australia-tour-of-south-africa-1935-36-61374/south-africa-vs-australia-2nd-test-62634/full-scorecard'
#match_url = "https://www.espncricinfo.com/series/south-africa-vs-sri-lanka-2024-25-1432203/south-africa-vs-sri-lanka-1st-test-1432209/full-scorecard"
#match_url = "https://www.espncricinfo.com/series/australia-in-sri-lanka-2024-25-1459900/sri-lanka-vs-australia-1st-test-1459906/full-scorecard"
#match_url = "https://www.espncricinfo.com/series/australia-vs-india-2024-25-1426547/australia-vs-india-5th-test-1426559/full-scorecard"
match_url = "https://www.espncricinfo.com/series/australia-in-new-zealand-2023-24-1388188/new-zealand-vs-australia-1st-test-1388226/full-scorecard"
#match_url = "https://www.espncricinfo.com/series/indian-premier-league-2024-1410320/kolkata-knight-riders-vs-sunrisers-hyderabad-final-1426312/full-scorecard"

begin = time.time() 
innings_list, batsmen_list, match_extras, match_score, did_not_bat, fall_of_wickets, bowlers_info = scorecard(match_url)

printing_scorecard(innings_list, batsmen_list, match_extras, match_score, did_not_bat, fall_of_wickets, bowlers_info)
end = time.time()
# total time taken 
total_time_taken = end-begin
minutes = str(int(total_time_taken/60))
seconds = str(round(total_time_taken % 60,3))
total_time_taken = minutes+"m "+seconds+"s"
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")  
print(f"Total runtime of the program is {total_time_taken}")             

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Australia 1st Innings:

Batsmen:


Batsman,Dismissal,Runs,Balls,Minutes,4s,6s,Strike Rate
Steven Smith,c †Blundell b Henry,31,71,106,4,0,43.66
Usman Khawaja,b Henry,33,118,181,3,1,27.96
Marnus Labuschagne,c Mitchell b Kuggeleijn,1,27,31,0,0,3.7
Cameron Green,not out,174,275,396,23,5,63.27
Travis Head,c †Blundell b O'Rourke,1,6,8,0,0,16.66
Mitchell Marsh,c †Blundell b Henry,40,39,59,6,1,102.56
Alex Carey †,c Williamson b Kuggeleijn,10,20,35,0,0,50.0
Mitchell Starc,c Latham b O'Rourke,9,33,47,1,0,27.27
Pat Cummins (c),lbw b Ravindra,16,24,30,0,2,66.66
Nathan Lyon,c †Blundell b Henry,5,19,24,1,0,26.31


Extras:


Byes,Leg Byes,No Balls,Wides,Penalty,Total
12,6,3,20,0,41


Score:


Score,Overs,Run Rate,Minutes
383,115.1,3.32,534


Fall of Wickets:


Wicket,Runs,Batsman,Overs
1,61,Steven Smith,24.1
2,65,Marnus Labuschagne,30.6
3,88,Usman Khawaja,40.2
4,89,Travis Head,41.3
5,156,Mitchell Marsh,54.2
6,176,Alex Carey,62.1
7,211,Mitchell Starc,71.2
8,244,Pat Cummins,78.5
9,267,Nathan Lyon,83.6
10,383,Josh Hazlewood,115.1


Bowlers:


Bowler,Overs,Maidens,Runs,Wickets,Economy,0s,4s,6s,Wides,No Balls
Tim Southee,27.0,4,92,0,3.4,125,13,2,0,2
Matt Henry,30.1,11,70,5,2.32,152,7,2,2,0
Will O’Rourke,27.0,10,87,2,3.22,136,11,2,3,0
Scott Kuggeleijn,20.0,1,75,2,3.75,86,8,1,3,1
Daryl Mitchell,4.0,0,17,0,4.25,15,2,0,0,0
Rachin Ravindra,7.0,1,24,1,3.42,31,1,2,0,0


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
New Zealand 1st Innings:

Batsmen:


Batsman,Dismissal,Runs,Balls,Minutes,4s,6s,Strike Rate
Tom Latham,b Starc,5,13,20,1,0,38.46
Will Young,c †Carey b Marsh,9,50,84,1,0,18.0
Kane Williamson,run out (Labuschagne),0,2,3,0,0,0.0
Rachin Ravindra,c Lyon b Hazlewood,0,3,3,0,0,0.0
Daryl Mitchell,c †Carey b Cummins,11,37,54,1,0,29.72
Tom Blundell †,c Head b Lyon,33,43,69,3,0,76.74
Glenn Phillips,c Starc b Hazlewood,71,70,109,13,0,101.42
Scott Kuggeleijn,c Green b Lyon,0,2,2,0,0,0.0
Matt Henry,c Labuschagne b Lyon,42,34,53,3,4,123.52
Tim Southee (c),c Head b Lyon,1,5,5,0,0,20.0


Extras:


Byes,Leg Byes,No Balls,Wides,Penalty,Total
1,3,2,1,0,7


Score:


Score,Overs,Run Rate,Minutes
179,43.1,4.14,208


Fall of Wickets:


Wicket,Runs,Batsman,Overs
1,12,Tom Latham,4.4
2,12,Kane Williamson,4.6
3,12,Rachin Ravindra,5.3
4,29,Daryl Mitchell,16.6
5,29,Will Young,17.1
6,113,Tom Blundell,31.3
7,113,Scott Kuggeleijn,31.5
8,161,Glenn Phillips,40.3
9,162,Tim Southee,41.4
10,179,Matt Henry,43.1


Bowlers:


Bowler,Overs,Maidens,Runs,Wickets,Economy,0s,4s,6s,Wides,No Balls
Mitchell Starc,9.0,4,34,1,3.77,41,6,0,1,0
Josh Hazlewood,12.0,0,55,2,4.58,47,7,1,0,0
Pat Cummins,10.0,2,33,1,3.3,44,4,0,0,0
Mitchell Marsh,4.0,0,10,1,2.5,21,1,0,0,2
Nathan Lyon,8.1,1,43,4,5.26,33,4,3,0,0


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Australia 2nd Innings:

Batsmen:


Batsman,Dismissal,Runs,Balls,Minutes,4s,6s,Strike Rate
Steven Smith,b Southee,0,3,1,0,0,0.0
Usman Khawaja,st †Blundell b Phillips,28,69,123,1,0,40.57
Marnus Labuschagne,c †Blundell b Southee,2,13,16,0,0,15.38
Nathan Lyon,c Young b Henry,41,46,51,6,0,89.13
Cameron Green,c Young b Phillips,34,80,135,3,1,42.5
Travis Head,c Kuggeleijn b Phillips,29,36,49,4,0,80.55
Mitchell Marsh,c Young b Phillips,0,1,1,0,0,0.0
Alex Carey †,c Southee b Phillips,3,9,18,0,0,33.33
Mitchell Starc,b Henry,12,27,42,2,0,44.44
Pat Cummins (c),c Latham b Henry,8,15,15,1,0,53.33


Extras:


Byes,Leg Byes,No Balls,Wides,Penalty,Total
5,1,0,0,0,6


Score:


Score,Overs,Run Rate,Minutes
164,51.1,3.2,235


Fall of Wickets:


Wicket,Runs,Batsman,Overs
1,0,Steven Smith,0.3
2,4,Marnus Labuschagne,4.4
3,53,Nathan Lyon,15.6
4,81,Usman Khawaja,26.5
5,127,Travis Head,38.2
6,127,Mitchell Marsh,38.3
7,139,Alex Carey,42.2
8,146,Cameron Green,44.2
9,159,Pat Cummins,47.5
10,164,Mitchell Starc,51.1


Bowlers:


Bowler,Overs,Maidens,Runs,Wickets,Economy,0s,4s,6s,Wides,No Balls
Tim Southee,11.1,2,46,2,4.11,49,8,0,0,0
Matt Henry,12.1,1,36,3,2.95,52,3,0,0,0
Scott Kuggeleijn,3.0,0,18,0,6.0,9,2,0,0,0
Will O’Rourke,7.5,4,11,0,1.4,41,1,0,0,0
Glenn Phillips,16.0,4,45,5,2.81,74,3,1,0,0
Rachin Ravindra,1.0,0,2,0,2.0,4,0,0,0,0


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
New Zealand 2nd Innings (T: 369 runs):

Batsmen:


Batsman,Dismissal,Runs,Balls,Minutes,4s,6s,Strike Rate
Tom Latham,c †Carey b Lyon,8,18,24,1,0,44.44
Will Young,c Smith b Head,15,52,95,2,0,28.84
Kane Williamson,c Smith b Lyon,9,19,30,2,0,47.36
Rachin Ravindra,c Green b Lyon,59,105,160,8,1,56.19
Daryl Mitchell,c & b Hazlewood,38,130,207,2,0,29.23
Tom Blundell †,c Head b Lyon,0,3,3,0,0,0.0
Glenn Phillips,lbw b Lyon,1,7,9,0,0,14.28
Scott Kuggeleijn,c †Carey b Green,26,28,33,4,0,92.85
Matt Henry,c Smith b Hazlewood,14,15,20,2,1,93.33
Tim Southee (c),c Starc b Lyon,7,7,6,0,1,100.0


Extras:


Byes,Leg Byes,No Balls,Wides,Penalty,Total
14,2,1,2,0,19


Score:


Score,Overs,Run Rate,Minutes
196,64.4,3.03,303


Fall of Wickets:


Wicket,Runs,Batsman,Overs
1,15,Tom Latham,5.2
2,35,Kane Williamson,11.1
3,59,Will Young,20.3
4,126,Rachin Ravindra,47.3
5,126,Tom Blundell,47.6
6,128,Glenn Phillips,49.4
7,164,Scott Kuggeleijn,56.3
8,187,Matt Henry,60.4
9,194,Tim Southee,61.5
10,196,Daryl Mitchell,64.4


Bowlers:


Bowler,Overs,Maidens,Runs,Wickets,Economy,0s,4s,6s,Wides,No Balls
Mitchell Starc,9.0,3,29,0,3.22,42,4,0,1,0
Josh Hazlewood,9.4,2,20,2,2.06,46,2,0,0,0
Nathan Lyon,27.0,8,65,6,2.4,132,7,2,0,0
Pat Cummins,12.0,1,40,0,3.33,51,5,0,0,1
Travis Head,4.0,1,10,1,2.5,19,0,1,0,0
Cameron Green,3.0,0,16,1,5.33,13,3,0,1,0


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Total runtime of the program is 0m 0.752s


In [44]:
print(innings_list)

['Australia 1st Innings', 'New Zealand 1st Innings', 'Australia 2nd Innings', 'New Zealand 2nd Innings (T: 369 runs)']


In [45]:
display(batsmen_list)
# batsmen_list.loc[batsmen_list['Innings Number'] == c]

Unnamed: 0,Innings Number,Innings Name,Batsman,Dismissal,Runs,Balls,Minutes,4s,6s,Strike Rate
0,1,Australia 1st Innings,Steven Smith,c †Blundell b Henry,31,71,106,4,0,43.66
1,1,Australia 1st Innings,Usman Khawaja,b Henry,33,118,181,3,1,27.96
2,1,Australia 1st Innings,Marnus Labuschagne,c Mitchell b Kuggeleijn,1,27,31,0,0,3.7
3,1,Australia 1st Innings,Cameron Green,not out,174,275,396,23,5,63.27
4,1,Australia 1st Innings,Travis Head,c †Blundell b O'Rourke,1,6,8,0,0,16.66
5,1,Australia 1st Innings,Mitchell Marsh,c †Blundell b Henry,40,39,59,6,1,102.56
6,1,Australia 1st Innings,Alex Carey †,c Williamson b Kuggeleijn,10,20,35,0,0,50.0
7,1,Australia 1st Innings,Mitchell Starc,c Latham b O'Rourke,9,33,47,1,0,27.27
8,1,Australia 1st Innings,Pat Cummins (c),lbw b Ravindra,16,24,30,0,2,66.66
9,1,Australia 1st Innings,Nathan Lyon,c †Blundell b Henry,5,19,24,1,0,26.31


In [46]:
display(match_extras)

Unnamed: 0,Innings Number,Innings Name,Byes,Leg Byes,No Balls,Wides,Penalty,Total
0,1,Australia 1st Innings,12,6,3,20,0,41
1,2,New Zealand 1st Innings,1,3,2,1,0,7
2,3,Australia 2nd Innings,5,1,0,0,0,6
3,4,New Zealand 2nd Innings (T: 369 runs),14,2,1,2,0,19


In [47]:
display(match_score)

Unnamed: 0,Innings Number,Innings Name,Score,Overs,Run Rate,Minutes
0,1,Australia 1st Innings,383,115.1,3.32,534
1,2,New Zealand 1st Innings,179,43.1,4.14,208
2,3,Australia 2nd Innings,164,51.1,3.2,235
3,4,New Zealand 2nd Innings (T: 369 runs),196,64.4,3.03,303


In [48]:
display(did_not_bat)

In [72]:
display(fall_of_wickets)

Unnamed: 0,Innings Number,Innings Name,Wicket,Runs,Batsman,Overs
0,1,Australia 1st Innings,1,61,Steven Smith,24.1
1,1,Australia 1st Innings,2,65,Marnus Labuschagne,30.6
2,1,Australia 1st Innings,3,88,Usman Khawaja,40.2
3,1,Australia 1st Innings,4,89,Travis Head,41.3
4,1,Australia 1st Innings,5,156,Mitchell Marsh,54.2
5,1,Australia 1st Innings,6,176,Alex Carey,62.1
6,1,Australia 1st Innings,7,211,Mitchell Starc,71.2
7,1,Australia 1st Innings,8,244,Pat Cummins,78.5
8,1,Australia 1st Innings,9,267,Nathan Lyon,83.6
9,1,Australia 1st Innings,10,383,Josh Hazlewood,115.1


In [76]:
display(bowlers_info)

Unnamed: 0,Innings Number,Innings Name,Bowler,Overs,Maidens,Runs,Wickets,Economy,0s,4s,6s,Wides,No Balls
0,1,Australia 1st Innings,Tim Southee,27.0,4,92,0,3.4,125,13,2,0,2
1,1,Australia 1st Innings,Matt Henry,30.1,11,70,5,2.32,152,7,2,2,0
2,1,Australia 1st Innings,Will O’Rourke,27.0,10,87,2,3.22,136,11,2,3,0
3,1,Australia 1st Innings,Scott Kuggeleijn,20.0,1,75,2,3.75,86,8,1,3,1
4,1,Australia 1st Innings,Daryl Mitchell,4.0,0,17,0,4.25,15,2,0,0,0
5,1,Australia 1st Innings,Rachin Ravindra,7.0,1,24,1,3.42,31,1,2,0,0
6,2,New Zealand 1st Innings,Mitchell Starc,9.0,4,34,1,3.77,41,6,0,1,0
7,2,New Zealand 1st Innings,Josh Hazlewood,12.0,0,55,2,4.58,47,7,1,0,0
8,2,New Zealand 1st Innings,Pat Cummins,10.0,2,33,1,3.3,44,4,0,0,0
9,2,New Zealand 1st Innings,Mitchell Marsh,4.0,0,10,1,2.5,21,1,0,0,2


## Playing Around

In [41]:
runs = batsmen_list.loc[batsmen_list['Batsman'] == 'Mushfiqur Rahim', 'Runs'].astype(int).tolist()
print(runs)
print(sum(runs))

[]
0


In [61]:
def batsman_runs_list(batsman,countryyy):
    country_index = country.index(countryyy)
    runs_list = []
    did_not_bat_list = []
    for match_url in country_wise[country_index]:
        print("Match:",match_url)
        innings_list, batsmen_list, match_extras, match_score, did_not_bat, fall_of_wickets, bowlers_info = scorecard(match_url)
        runs_match = batsmen_list.loc[batsmen_list['Batsman'] == batsman, 'Runs']
        if runs_match.empty:
            runs_match = 0
        else:
            runs_match = runs_match.tolist()
            if runs_match[0] == "-":
                runs_match = 0
            else:
                runs_match = int(runs_match[0])
        print("Runs in this match:",runs_match)
        if did_not_bat.empty:
            did_not_bat_match = []
        else:
            did_not_bat_match = did_not_bat.loc[did_not_bat['Batsman'] == batsman, 'Batsman'].tolist()
        if did_not_bat_match:
            did_not_bat_match = "Yes"
        else:
            did_not_bat_match = "No"
        print("Did He Bat:",did_not_bat_match)
        runs_list.append(runs_match)
        did_not_bat_list.append(did_not_bat_match)
        # print(runs_match)
        # print(did_not_bat_match)
    return runs_list, did_not_bat_list

In [63]:
runs_list,did_not_bat_list = batsman_runs_list("Steve Smith", 'australia')
sum_of_runs = 0
for i in runs_list:
    sum_of_runs += sum(i)
print("Final Total:",sum_of_runs)
print("List of runs overall:",runs_list)

Match: https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-2nd-test-62397/full-scorecard
Runs in this match: 0
Did He Bat: []
Match: https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-1st-test-62396/full-scorecard
Runs in this match: 0
Did He Bat: []
Match: https://www.espncricinfo.com/series/england-tour-of-australia-1878-79-61733/australia-vs-england-only-test-62398/full-scorecard
Runs in this match: 0
Did He Bat: []
Match: https://www.espncricinfo.com/series/australia-tour-of-england-1880-61336/england-vs-australia-only-test-62399/full-scorecard
Runs in this match: 0
Did He Bat: []
Match: https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-4th-test-62403/full-scorecard
Runs in this match: 0
Did He Bat: []
Match: https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-3rd-test-62402/full-scorecard
Runs in this

UnboundLocalError: cannot access local variable 'minutes' where it is not associated with a value

In [None]:
number_of_matches = 0
for i in range(len(runs_list)):
    if len(runs_list[i])>0:
        number_of_matches += 1
    elif len(did_not_bat_list[i]) >0:
        number_of_matches+=1

print(number_of_matches)

# Getting a list of international players

In [14]:
parent_url = "https://www.espncricinfo.com/cricketers"

driver = webdriver.Chrome()

# Navigate to the page
driver.get(parent_url)

page_html = driver.page_source
soup = BeautifulSoup(page_html,'html.parser')
#driver.quit()
# print(soup.prettify())
player_div = soup.find_all('div',class_='ds-flex ds-p-4 ds-flex-row ds-space-x-4 ds-border-line ds-border-b odd:ds-border-r last:ds-border-none')

for player in player_div:
    player_url = player.find('a',class_='ds-inline-flex ds-items-start ds-leading-none')['href']
    player_url = "https://www.espncricinfo.com"+player_url
    player_id = player_url.split('-')[-1]
    player_name = player.find('span',class_='ds-text-tight-l').text.strip()
    print(player_url,player_id,player_name)

https://www.espncricinfo.com/cricketers/abhishek-sharma-1070183 1070183 Abhishek Sharma
https://www.espncricinfo.com/cricketers/khaleel-ahmed-942645 942645 Khaleel Ahmed
https://www.espncricinfo.com/cricketers/akash-deep-1176959 1176959 Akash Deep
https://www.espncricinfo.com/cricketers/arshdeep-singh-1125976 1125976 Arshdeep Singh
https://www.espncricinfo.com/cricketers/ravichandran-ashwin-26421 26421 Ravichandran Ashwin
https://www.espncricinfo.com/cricketers/avesh-khan-694211 694211 Avesh Khan
https://www.espncricinfo.com/cricketers/yastika-bhatia-960715 960715 Yastika Bhatia
https://www.espncricinfo.com/cricketers/jasprit-bumrah-625383 625383 Jasprit Bumrah
https://www.espncricinfo.com/cricketers/uma-chetry-960695 960695 Uma  Chetry
https://www.espncricinfo.com/cricketers/harleen-deol-960845 960845 Harleen Deol
https://www.espncricinfo.com/cricketers/tushar-deshpande-822553 822553 Tushar Deshpande
https://www.espncricinfo.com/cricketers/shivam-dube-714451 714451 Shivam Dube
https:/

# Code to Excel Database

### Old Method

In [46]:
import os
import pandas as pd
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# SCOPES for Google Drive and Google Sheets access
SCOPES = ['https://www.googleapis.com/auth/drive.file', 'https://www.googleapis.com/auth/spreadsheets']

def authenticate_google():
    """Authenticate and return Google API credentials."""
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    return creds

def save_dataframes_to_excel(d1, d2, d3, file_path):
    """Save the dataframes to an Excel file with multiple sheets."""
    with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
        d1.to_excel(writer, sheet_name='Sheet1', index=False)
        d2.to_excel(writer, sheet_name='Sheet2', index=False)
        d3.to_excel(writer, sheet_name='Sheet3', index=False)
    print(f"Excel file saved at {file_path}")

def upload_file_to_drive(file_path, file_name):
    """Upload an Excel file to Google Drive and return the file ID."""
    creds = authenticate_google()
    service = build('drive', 'v3', credentials=creds)

    # Upload the Excel file to Google Drive
    file_metadata = {'name': file_name, 'mimeType': 'application/vnd.google-apps.spreadsheet'}
    media = MediaFileUpload(file_path, mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
    
    uploaded_file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    print(f"Uploaded file ID: {uploaded_file.get('id')}")
    return uploaded_file.get('id')

def get_google_sheet(sheet_id, sheet_name):
    """Access the Google Sheet and print data from the specified sheet."""
    creds = authenticate_google()
    service = build('sheets', 'v4', credentials=creds)
    sheet = service.spreadsheets()

    # Get the data from the specified sheet
    result = sheet.values().get(spreadsheetId=sheet_id, range=sheet_name).execute()
    values = result.get('values', [])
    
    if not values:
        print(f"No data found in sheet {sheet_name}")
    else:
        for row in values:
            print(row)

def upload_dataframes_and_access(d1, d2, d3, file_path, file_name, sheet_name):
    """Save dataframes to Excel, upload to Google Drive, and access the Google Sheet."""
    # Step 1: Save DataFrames to Excel file
    save_dataframes_to_excel(d1, d2, d3, file_path)
    
    # Step 2: Upload Excel file to Google Drive
    sheet_id = upload_file_to_drive(file_path, file_name)
    
    # Step 3: Access the Google Sheet
    get_google_sheet(sheet_id, sheet_name)

# Example DataFrames
d1 = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [25, 30]})
d2 = pd.DataFrame({'Product': ['Pen', 'Pencil'], 'Price': [10, 5]})
d3 = pd.DataFrame({'City': ['New York', 'London'], 'Population': [8000000, 9000000]})

# Example usage
upload_dataframes_and_access(d1, d2, d3, 'example_data.xlsx', 'UploadedData', 'Sheet1')


Excel file saved at example_data.xlsx


FileNotFoundError: [Errno 2] No such file or directory: 'credentials.json'

### New Method

In [None]:
pip install gspread pandas google-auth

In [14]:
import gspread
import pandas as pd
from google.oauth2.service_account import Credentials

# Step 1: Define your DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)

# Step 2: Set up Google Sheets API credentials
SCOPES = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
SERVICE_ACCOUNT_FILE = r"C:/Users/Nisarg/Downloads/credentials.json"  # Path to your JSON key file

creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = gspread.authorize(creds)

# Step 3: Create a new spreadsheet
spreadsheet_name = 'My New Spreadsheet'  # Name of the new spreadsheet
spreadsheet = client.create(spreadsheet_name)

# Step 4: Share the spreadsheet with your email (optional)
# Replace 'your-email@example.com' with your email address
spreadsheet.share('cricket@reliable-proton-449504-g2.iam.gserviceaccount.com', perm_type='user', role='writer')

# Step 5: Open the first sheet of the new spreadsheet
sheet = spreadsheet.sheet1

# Step 6: Clear existing data (optional)
sheet.clear()

# Step 7: Update the sheet with the DataFrame
sheet.update([df.columns.values.tolist()] + df.values.tolist())

print(f"New spreadsheet created: {spreadsheet_name}")
print(f"Spreadsheet URL: {spreadsheet.url}")

PermissionError: 

#### Putting it in simple excel files on computer

In [None]:
def save_dataframes_to_excel(dataframes, folder_path, base_filename):
    """Saves multiple DataFrames into separate sheets of an Excel workbook."""
    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    # Create an Excel writer
    file_path = os.path.join(folder_path, f"{base_filename}.xlsx")
    with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
        for sheet_name, df in dataframes.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Saved: {file_path}")

# Example DataFrames
df1 = pd.DataFrame({'Fruits': ['Appple', 'Banana', 'Mango',
                                       'Dragon Fruit', 'Musk melon', 'grapes'],
                            'Sales in kg': [20, 30, 15, 10, 50, 40]})
 
# create  data_frame2 by creating a dictionary 
# in which values are stored as list
df2 = pd.DataFrame({'Vegetables': ['tomato', 'Onion', 'ladies finger',
                                           'beans', 'bedroot', 'carrot'],
                            'Sales in kg': [200, 310, 115, 110, 55, 45]})
 
# create  data_frame3 by creating a dictionary 
# in which values are stored as list
df3 = pd.DataFrame({'Baked Items': ['Cakes', 'biscuits', 'muffins',
                                            'Rusk', 'puffs', 'cupcakes'],
                            'Sales in kg': [120, 130, 159, 310, 150, 140]})

# List of DataFrames with their corresponding sheet names
dataframes_list = [
    ({"Fruits": df1, "Vegetables": df2, "Baked Items": df3}, "Workbook1")
]

data_frame1.to_excel(writer, sheet_name="Fruits", index=False)
data_frame2.to_excel(writer, sheet_name="Vegetables", index=False)
data_frame3.to_excel(writer, sheet_name="Baked Items", index=False)

# Specify the folder where the workbooks will be saved
folder_name = "ExcelWorkbooks"
folder_path = os.path.join(os.getcwd(), folder_name)

# Loop through the list and save each set of DataFrames in a separate workbook
for dataframes, workbook_name in dataframes_list:
    save_dataframes_to_excel(dataframes, folder_path, workbook_name)


In [None]:
def load_sheet_from_excel(folder_path, workbook_name, sheet_name):
    """Loads a specific sheet from an Excel workbook."""
    # Construct the full file path
    file_path = os.path.join(folder_path, f"{workbook_name}.xlsx")
    
    # Check if the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    # Load the specified sheet into a DataFrame
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        return df
    except ValueError as e:
        raise ValueError(f"Error loading sheet '{sheet_name}': {e}")

# Specify the folder and workbook details
folder_name = "ExcelWorkbooks"
folder_path = os.path.join(os.getcwd(), folder_name)
workbook_name = "Workbook1"  # Change as necessary
sheet_name = "Vegetables"        # Change to the sheet you want to access

# Example usage
try:
    df = load_sheet_from_excel(folder_path, workbook_name, sheet_name)
    print("DataFrame Loaded Successfully:")
    print(df)
    print(folder_path)
except (FileNotFoundError, ValueError) as e:
    print(e)


In [None]:
pip install pandas openpyxl google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client

# SQL

## Testing Code

In [1]:
import sqlite3
import pandas as pd

# Define Database Name
DB_NAME = "cirkut.sqlite"

# Connect to SQLite Database
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

# 🔹 CREATE TABLES
cursor.execute('''
CREATE TABLE IF NOT EXISTS matches (
    match_id INTEGER PRIMARY KEY,
    team1 TEXT,
    team2 TEXT,
    venue TEXT,
    date TEXT,
    result TEXT
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS innings_list (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    innings_number INTEGER,
    team TEXT,
    total_runs INTEGER,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS batsmen_list (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    batsman TEXT,
    runs INTEGER,
    balls INTEGER,
    strike_rate REAL,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS match_extras (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    team TEXT,
    wides INTEGER,
    no_balls INTEGER,
    byes INTEGER,
    leg_byes INTEGER,
    penalty_runs INTEGER,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS match_score (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    team TEXT,
    total_runs INTEGER,
    wickets INTEGER,
    overs FLOAT,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS did_not_bat (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    team TEXT,
    player TEXT,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS fall_of_wickets (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    team TEXT,
    wicket_number INTEGER,
    batsman TEXT,
    score_at_fall INTEGER,
    over TEXT,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS bowlers_info (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    bowler TEXT,
    overs FLOAT,
    maidens INTEGER,
    runs_given INTEGER,
    wickets INTEGER,
    economy REAL,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
''')

conn.commit()

# 🔹 FUNCTION TO INSERT DATAFRAMES INTO SQL
def insert_data(table_name, df, conn):
    """Inserts a Pandas DataFrame into an SQLite table."""
    df.to_sql(table_name, conn, if_exists='append', index=False)

✅ Data successfully stored in SQL database!

🔎 Fetching all matches:


match_id,team1,team2,venue,date,result
1,India,Pakistan,Delhi,2024-01-15,India won
2,Australia,England,Melbourne,2024-01-20,Australia won



🔎 Fetching all batsmen who scored more than 100:


id,match_id,batsman,runs,balls,strike_rate
1,1,Virat Kohli,120,150,80.0
3,2,Steve Smith,140,180,77.8


🔌 Database connection closed.


In [None]:
# 🔹 SAMPLE DATAFRAMES
df_matches = pd.DataFrame({
    'match_id': [1, 2],
    'team1': ['India', 'Australia'],
    'team2': ['Pakistan', 'England'],
    'venue': ['Delhi', 'Melbourne'],
    'date': ['2024-01-15', '2024-01-20'],
    'result': ['India won', 'Australia won']
})

df_innings = pd.DataFrame({
    'match_id': [1, 1, 2, 2],
    'innings_number': [1, 2, 1, 2],
    'team': ['India', 'Pakistan', 'Australia', 'England'],
    'total_runs': [350, 275, 420, 310]
})

df_batsmen = pd.DataFrame({
    'match_id': [1, 1, 2, 2],
    'batsman': ['Virat Kohli', 'Babar Azam', 'Steve Smith', 'Joe Root'],
    'runs': [120, 85, 140, 95],
    'balls': [150, 110, 180, 130],
    'strike_rate': [80.0, 77.3, 77.8, 73.1]
})

df_match_extras = pd.DataFrame({
    'match_id': [1, 1, 2, 2],
    'team': ['India', 'Pakistan', 'Australia', 'England'],
    'wides': [3, 5, 2, 4],
    'no_balls': [1, 0, 2, 1],
    'byes': [2, 3, 1, 4],
    'leg_byes': [5, 6, 3, 2],
    'penalty_runs': [0, 0, 0, 0]
})

df_match_score = pd.DataFrame({
    'match_id': [1, 2],
    'team': ['India', 'Australia'],
    'total_runs': [350, 420],
    'wickets': [8, 9],
    'overs': [50.0, 49.5]
})

df_fall_of_wickets = pd.DataFrame({
    'match_id': [1, 1, 2, 2],
    'team': ['India', 'Pakistan', 'Australia', 'England'],
    'wicket_number': [1, 2, 1, 2],
    'batsman': ['Rohit Sharma', 'Babar Azam', 'David Warner', 'Ben Stokes'],
    'score_at_fall': [25, 80, 50, 120],
    'over': ['5.2', '12.3', '10.1', '15.4']
})

df_bowlers = pd.DataFrame({
    'match_id': [1, 1, 2, 2],
    'bowler': ['Shaheen Afridi', 'Jasprit Bumrah', 'Pat Cummins', 'James Anderson'],
    'overs': [10, 10, 9.5, 10],
    'maidens': [2, 1, 3, 2],
    'runs_given': [45, 50, 38, 42],
    'wickets': [2, 3, 4, 3],
    'economy': [4.5, 5.0, 3.8, 4.2]
})

# 🔹 STORE DATA IN SQL
insert_data("matches", df_matches, conn)
insert_data("innings_list", df_innings, conn)
insert_data("batsmen_list", df_batsmen, conn)
insert_data("match_extras", df_match_extras, conn)
insert_data("match_score", df_match_score, conn)
insert_data("fall_of_wickets", df_fall_of_wickets, conn)
insert_data("bowlers_info", df_bowlers)

print("✅ Data successfully stored in SQL database!")

# Close Connection
conn.close()
print("🔌 Database connection closed.")

In [53]:
# Define Database Name
DB_NAME = "cirkut.sqlite"

# Connect to SQLite Database
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

df_matches = pd.DataFrame({
    'match_id': [3, 4],
    'team1': ['India', 'Australia'],
    'team2': ['Pakistan', 'England'],
    'venue': ['Delhi', 'Melbourne'],
    'date': ['2024-01-15', '2024-01-20'],
    'result': ['India won', 'Australia won']
})

df_bowlers = pd.DataFrame({
    'match_id': [3, 3, 4, 4],
    'bowler': ['Afridi', 'Bumrah', 'Cummins', 'Anderson'],
    'overs': [8, 7, 9, 1],
    'maidens': [2, 1, 3, 2],
    'runs_given': [45, 50, 38, 42],
    'wickets': [2, 3, 4, 3],
    'economy': [4.5, 5.0, 3.8, 4.2]
})

insert_data("bowlers_info", df_bowlers)

print("✅ Data successfully stored in SQL database!")

# Close Connection
conn.close()
print("🔌 Database connection closed.")

🔌 Database connection closed.


In [54]:
# Define Database Name
DB_NAME = "cirkut.sqlite"

# Connect to SQLite Database
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

# 🔹 QUERY EXAMPLES
print("\n🔎 Fetching all matches:")
df_results = pd.read_sql("SELECT * FROM matches", conn)
display(df_results.style.hide(axis = "index"))

print("\n🔎 Fetching all batsmen who scored more than 100:")
df_centuries = pd.read_sql("SELECT * FROM batsmen_list WHERE runs >= 100", conn)
display(df_centuries.style.hide(axis = "index"))

print("All bowlers:")
bowl = pd.read_sql("SELECT * FROM bowlers_info", conn)
display(bowl.style.hide(axis = "index"))


🔎 Fetching all matches:


match_id,team1,team2,venue,date,result
1,India,Pakistan,Delhi,2024-01-15,India won
2,Australia,England,Melbourne,2024-01-20,Australia won



🔎 Fetching all batsmen who scored more than 100:


id,match_id,batsman,runs,balls,strike_rate
1,1,Virat Kohli,120,150,80.0
3,2,Steve Smith,140,180,77.8


All bowlers:


id,match_id,bowler,overs,maidens,runs_given,wickets,economy
1,1,Shaheen Afridi,10.0,2,45,2,4.5
2,1,Jasprit Bumrah,10.0,1,50,3,5.0
3,2,Pat Cummins,9.5,3,38,4,3.8
4,2,James Anderson,10.0,2,42,3,4.2
5,3,Afridi,8.0,2,45,2,4.5
6,3,Bumrah,7.0,1,50,3,5.0
7,4,Cummins,9.0,3,38,4,3.8
8,4,Anderson,1.0,2,42,3,4.2


🔌 Database connection closed.


## Where the Actual Code Starts

In [55]:
pip install sqlitecloud

Collecting sqlitecloudNote: you may need to restart the kernel to use updated packages.

  Downloading sqlitecloud-0.0.83-py3-none-any.whl.metadata (8.9 kB)
Downloading sqlitecloud-0.0.83-py3-none-any.whl (48 kB)
   ---------------------------------------- 0.0/48.6 kB ? eta -:--:--
   -------- ------------------------------- 10.2/48.6 kB ? eta -:--:--
   ---------------------------------------- 48.6/48.6 kB 833.9 kB/s eta 0:00:00
Installing collected packages: sqlitecloud
Successfully installed sqlitecloud-0.0.83


In [2]:
import sqlitecloud
import pandas as pd

### Open connection

In [3]:
def connect_to_cloud(cloud_url):
    conn = sqlitecloud.connect(cloud_url)
    print("✅ Successfully connected to SQL database!")
    return conn

In [9]:
#cloud_url = "sqlitecloud://ctn9jxmfnk.g6.sqlite.cloud:8860/chinook.sqlite?apikey=3N2b0OKLtR6a9a5w5dnldrptWvbyBazKLsj0zVOGTpk"
cloud_url = "sqlitecloud://ctn9jxmfnk.g6.sqlite.cloud:8860/cirkut.sqlite?apikey=3N2b0OKLtR6a9a5w5dnldrptWvbyBazKLsj0zVOGTpk"
conn = connect_to_cloud(cloud_url)

✅ Successfully connected to SQL database!


### Access table from cloud

In [22]:
def sql_command(conn,command):
    cursor = conn.execute(command)
    result = cursor.fetchall()
    if cursor.description:
        description = [description[0] for description in cursor.description]
        # Convert the result to a pandas DataFrame
        result_df = pd.DataFrame(result, columns=description)
        print("🔎 Fetching Data:")
        return result_df
    else:
        print("🔎 Data Processed Successfully :")
        return(pd.DataFrame())

In [85]:
command = 'SELECT * FROM matches;'
result_df = sql_command(conn,command)
# Display the result as a DataFrame
display(result_df.style.hide(axis="index"))

match_id,team1,team2,venue,date,result
1,India,Pakistan,Delhi,2024-01-15,India won
2,Australia,England,Melbourne,2024-01-20,Australia won
5,Ind,Pak,Del,2024-01-15,India wo
7,Aust,Eng,Melb,2024-01-20,Australia wo


In [23]:
command = '''
CREATE TABLE IF NOT EXISTS bow_info (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    match_id INTEGER,
    bowler TEXT,
    overs FLOAT,
    maidens INTEGER,
    runs_given INTEGER,
    wickets INTEGER,
    economy REAL,
    FOREIGN KEY (match_id) REFERENCES matches (match_id)
)
'''
result_df = sql_command(conn,command)
# Display the result as a DataFrame
display(result_df.style.hide(axis="index"))

🔎 Uploading Data:


### Upload table values onto cloud

In [5]:
def upload_to_cloud(conn,data,destination_table):
    cursor = conn.cursor()

    # Generate column definitions dynamically based on DataFrame
    columns = ", ".join([f"{col} TEXT" for col in data.columns])  # Change to INTEGER, REAL, etc., as needed

    # Create the table if it doesn't exist
    create_table_query = f"CREATE TABLE IF NOT EXISTS {destination_table} ({columns});"
    cursor.execute(create_table_query)

    # Insert DataFrame rows into the table
    for _, row in data.iterrows():
        placeholders = ", ".join(["?" for _ in row])
        insert_query = f"INSERT INTO {destination_table} VALUES ({placeholders});"
        cursor.execute(insert_query, tuple(row))

    # Commit changes and close connection
    conn.commit()

    print(f"Data successfully uploaded to {destination_table} in SQLite Cloud.")

In [6]:
# Sample DataFrame (Replace this with your actual DataFrame)
data = pd.DataFrame({
    'match_id': [5, 7],
    'team1': ['Ind', 'Aust'],
    'team2': ['Pak', 'Eng'],
    'venue': ['Del', 'Melb'],
    'date': ['2024-01-15', '2024-01-20'],
    'result': ['India wo', 'Australia wo']
})
df = pd.DataFrame(data)

# Table name (Modify this as needed)
TABLE_NAME = "matches"

upload_to_cloud(conn,df,TABLE_NAME)

SQLiteCloudIntegrityError: UNIQUE constraint failed: matches.match_id

### Disconnect

In [7]:
# Close Connection
def close_connection(conn):
    conn.close()
    print("🔌 Database connection closed.")

In [8]:
close_connection(conn)

🔌 Database connection closed.


# Match Links by Year



#### Function to generate year ka links

In [49]:
def year_link_generator(current_year):
    year_link_list = []
    for year in range(1877,current_year+1):
        try:
            year_link = "https://www.espncricinfo.com/records/year/team-match-results/"+str(year)+"-"+str(year)+"/test-matches-1"
            #print(year_link)
            year_link_list.append(year_link)
        except:
            print("No tests played in",year)
    return year_link_list

#### Checking if it's working by printing output

In [68]:
year_links = year_link_generator(2025)
print(year_links)

['https://www.espncricinfo.com/records/year/team-match-results/1877-1877/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1878-1878/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1879-1879/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1880-1880/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1881-1881/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1882-1882/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1883-1883/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1884-1884/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1885-1885/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1886-1886/test-matches-1', 'https://www.espncricinfo.com/records/year/team-match-results/1887-1887/test-matches-1', 'https://www.espncri

#### Function to get matches of that year

In [32]:
def yearly_match_link_generator(year_link):
    match_links = []
    response = requests.get(year_link)
    soup = BeautifulSoup(response.content,'html.parser')
    #print(soup.prettify())
    tables = soup.find_all('div',class_='ds-overflow-x-auto ds-scrollbar-hide')
    for table in tables:
        matches = table.find_all('a', class_="ds-inline-flex ds-items-start ds-leading-none")
        for match in matches:
            if "Test # " in match['title']:
                match_link = "https://www.espncricinfo.com" + match['href']
                match_links.insert(0,match_link)
                #print(match_link)
    return match_links

#### Checking if it works

In [70]:
year1 = year_links[0]
print(yearly_match_link_generator(year1))

['https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-1st-test-62396/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-2nd-test-62397/full-scorecard']


#### Function to put all match links into a dictionary

In [33]:
def final_match_link_generator():
    match_links = {}
    year_links = year_link_generator(2025)
    for year in year_links:
        matches = yearly_match_link_generator(year)
        year = (year.split('/')[-2]).split('-')[-1]
        match_links[year] = matches
    return match_links

In [34]:
match_links = final_match_link_generator()
print(match_links)

{'1877': ['https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-1st-test-62396/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-2nd-test-62397/full-scorecard'], '1878': [], '1879': ['https://www.espncricinfo.com/series/england-tour-of-australia-1878-79-61733/australia-vs-england-only-test-62398/full-scorecard'], '1880': ['https://www.espncricinfo.com/series/australia-tour-of-england-1880-61336/england-vs-australia-only-test-62399/full-scorecard'], '1881': ['https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-1st-test-62400/full-scorecard'], '1882': ['https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-2nd-test-62401/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-3rd-test-62402/full-scorecard', 'https://www.espncricinfo.com

#### Getting a list of all year keys

In [35]:
year_list = list(match_links.keys())
print(year_list)

['1877', '1878', '1879', '1880', '1881', '1882', '1883', '1884', '1885', '1886', '1887', '1888', '1889', '1890', '1891', '1892', '1893', '1894', '1895', '1896', '1897', '1898', '1899', '1900', '1901', '1902', '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',

#### Checking which years dont have any matches

In [74]:
for year in year_list:
    if len(match_links[year]) == 0:
        print(year)

1878
1891
1900
1915
1916
1917
1918
1919
1940
1941
1942
1943
1944
1945


#### Getting all match links into one final list

In [36]:
final_match_list = []
for year in year_list:
    for match in match_links[year]:
        final_match_list.append(match)

print(final_match_list)
print(len(final_match_list))

['https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-1st-test-62396/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1876-77-61716/australia-vs-england-2nd-test-62397/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1878-79-61733/australia-vs-england-only-test-62398/full-scorecard', 'https://www.espncricinfo.com/series/australia-tour-of-england-1880-61336/england-vs-australia-only-test-62399/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-1st-test-62400/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-2nd-test-62401/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-england-3rd-test-62402/full-scorecard', 'https://www.espncricinfo.com/series/england-tour-of-australia-1881-82-61738/australia-vs-

#### Checking if the yearly list and series wise list has the same set of matches

In [37]:
for match in final_match_list:
    if match not in Series_Wise_Match_List:
        print(match)

https://www.espncricinfo.com/series/pakistan-tour-of-england-2010-426350/australia-vs-pakistan-1st-test-426394/full-scorecard
https://www.espncricinfo.com/series/icc-world-test-championship-2021-2023-1268315/australia-vs-india-final-1358412/full-scorecard


In [38]:
for match in Series_Wise_Match_List:
    if match not in final_match_list:
        print(match)

https://www.espncricinfo.com/series/australia-tour-of-england-and-ireland-2010-426337/australia-vs-pakistan-1st-test-426394/full-scorecard


In [39]:
print(len(final_match_list))
print(len(Series_Wise_Match_List))

2579
2578
