In [14]:
from bs4 import BeautifulSoup
import requests
from itertools import chain
import pandas as pd
from tqdm import tqdm_notebook

In [15]:
tournament_ids = {
    "1987": 617,
    "1992": 665,
    "1996":722,
    "1999":787,
    "2003":865,
    "2007":2403,
    "2011":4857,
    "2015":6537,
    "2019":12357
}

In [49]:
stat_type = [
    {"key":"batting", "value": "most_runs_career", "filename": "most_runs_season_batting"},
    {"key": "bowling", "value": "most_wickets_career", "filename": "most_wickets_season_bowling"},
    {"key": "keeping", "value":"most_dismissals_career", "filename": "most_dismissals_season_keeping"},
    {"key": "fielding", "value": "most_catches_career", "filename": "most_catches_season"},
    {"key": "averages", "value": "batting", "filename": "batting_season_stats"},
    {"key": "averages", "value": "bowling", "filename": "bowling_season_stats"}
]

In [50]:
def create_link(stat_type, t_id):
    return f"https://stats.espncricinfo.com/ci/engine/records/{stat_type['key']}/{stat_type['value']}.html?id={tournament_ids[t_id]};type=tournament"

In [51]:
create_link(stat_type[0], "1987")

'https://stats.espncricinfo.com/ci/engine/records/batting/most_runs_career.html?id=617;type=tournament'

In [57]:
country_mapping = {
    "CAN": "Canada",
    "ENG": "England",
    "INDIA": "India",
    "SL": "SriLanka",
    "AUS":"Australia",
    "SA": "SouthAfrica",
    "PAK": "Pakistan",
    "BDESH": "Bangladesh",
    "WI": "WestIndies",
    "NZ": "NewZealand",
    "ZIM": "Zimbabwe",
    "KENYA": "Kenya",
    "IRE": "Ireland",
    "SCOT": "Scotland",
    "NL": "Netherlands",
    "UAE": "UnitedArabEmirates",
    "NAM": "Namibia",
    "AFG": "Afghanistan",
    "BMUDA": "Bermuda"
}

In [53]:
def scrape_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find_all("table", {"class":"engineTable"})[0]
    columns = [col.text.strip() for col in soup.find("tr", {"class": "head"}).find_all("th")]
    all_rows = []
    for row in table.find_all("tr")[1:]:
        elements = [el.text for el in row.find_all("td")]
        all_rows.append(elements)
    df = pd.DataFrame(all_rows, columns=columns)
    df["country"] = df["Player"].apply(lambda x: country_mapping[x.split("(")[1].split(")")[0]])
    df["Player"] = df["Player"].apply(lambda x: x.split("(")[0].strip(" "))
    return df

In [59]:
def scrape_by_stat_type(stat_type):
    all_df = []
    for year in tqdm_notebook(tournament_ids.keys()):
        link = create_link(stat_type, year)
        df = scrape_page(link)
        df["year"] = year
        all_df.append(df)
    stat_df = pd.concat(all_df)
    stat_df.to_csv(f"./worldcup_data/{stat_type['filename']}.csv", index=False)

In [60]:
def scrape_all_stats():
    for stat in tqdm_notebook(stat_type):
        scrape_by_stat_type(stat)

In [61]:
scrape_all_stats()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until



















