Scraping Cricket Data from the Internet

In [None]:
import json
import os

from tqdm import tqdm
from bs4 import BeautifulSoup
import requests


class SearchBox:
    """Thin client for the howstat.com player search form.

    Posts a player name to ``URL`` and extracts the result links from the
    returned HTML page.
    """

    URL = "http://www.howstat.com/cricket/Statistics/Players/PlayerMenu.asp"

    def __init__(self, timeout: float = 30.0) -> None:
        """Create a search client.

        Args:
            timeout: Value passed to ``requests`` as the ``timeout`` argument
                of every search request. Without a timeout a stalled
                connection would block the caller indefinitely.
        """
        self.timeout = timeout

    def search(self, name: str) -> list:
        """Search for ``name`` and return the links found in the result table.

        Args:
            name: Text submitted in the ``txtPlayer`` form field.

        Returns:
            The ``href`` values of every ``<a class="LinkNormal">`` inside the
            first ``<table class="TableLined">`` of the response, in document
            order. An entry is ``None`` if an anchor has no ``href``
            attribute. Returns an empty list when the page has no such table.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                exceeds ``self.timeout``.
        """
        response = requests.post(
            self.URL,
            data={"txtPlayer": name, "txtAction": "FindPlayer"},
            timeout=self.timeout,
        )
        soup = BeautifulSoup(response.text, "html.parser")

        tbl = soup.find("table", {"class": "TableLined"})
        if tbl is None:
            # No result table on the page: report no matches.
            return []

        results = tbl.find_all("a", {"class": "LinkNormal"})
        return [r.get("href") for r in results]


# Load the collection of player names to look up.
with open("players.json", "r") as fh:
    PLAYERS = json.load(fh)

engine = SearchBox()

# Buckets keyed by player name, split by how many search results came back.
match_zero = {}
match_one = {}
match_many = {}

progress = tqdm(PLAYERS)
for player in progress:
    # Show the player currently being searched in the progress bar.
    progress.set_description(player)

    links = engine.search(player)
    hit_count = len(links)

    if hit_count == 1:
        # Exactly one result: keep the integer that follows the first "="
        # in the link.
        match_one[player] = int(links[0].split("=")[1])
    elif hit_count == 0:
        match_zero[player] = {}
    else:
        # Two or more results: keep all links for later disambiguation.
        match_many[player] = links

# Summary of how the players were distributed across the buckets.
print()
for label, bucket in (
    ("ZERO MATCHES:", match_zero),
    ("ONE MATCH:", match_one),
    ("MANY MATCHES:", match_many),
):
    print(label, len(bucket))
print()

os.makedirs("results", exist_ok=True)

# Persist each bucket to its own JSON file.
for path, bucket in (
    ("results/zero.json", match_zero),
    ("results/one.json", match_one),
    ("results/many.json", match_many),
):
    with open(path, "w") as fh:
        json.dump(bucket, fh, indent=2)