# Scraping the performances of the top 100 test match run scorers from Cricinfo's Statsguru

I've been meaning to do this for a while as a way to practise my web scraping and D3 skills. There's no public API for Statsguru, and it's not easy to grab a list of every player. The easiest way to get one is to scrape the summary table for batting and bowling, then grab the player ID for each player in the top 100 from the links in that table. We can then use each ID to build the URL of that player's full performance list, scrape the data from it, and put everything in a single file. Easy!

First, let's import `BeautifulSoup` and `requests` for scraping, `re` for pulling the player IDs out of the link URLs, and `pandas` for data munging.

In [44]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [110]:
# Statsguru batting summary page, ordered by runs (see the query string).
summary_url = "http://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;orderby=runs;template=results;type=batting"

# A timeout stops the cell hanging forever if the server never answers, and
# raise_for_status() reports an HTTP error here instead of letting an error
# page fall through to a confusing IndexError on the table lookup below.
summary_r = requests.get(summary_url, timeout=30)
summary_r.raise_for_status()
summary_soup = BeautifulSoup(summary_r.content, 'html5lib')

# Index 2 selects the "engineTable" whose rows carry the player links.
summary_table = summary_soup.find_all("table", class_="engineTable")[2].tbody

In [85]:
# Captures the ID segment of a player link such as ".../player/35320.html".
# Raw string: the original non-raw pattern relied on invalid escape sequences.
player_id_pattern = re.compile(r'player/(.+?)\.html')

player_list = []

for row in summary_table.find_all('tr'):
    row_cells = row.find_all('td')
    link = row_cells[0].find('a') if row_cells else None
    if link is None:
        # Row without a player link in its first cell: nothing to record.
        continue

    id_match = player_id_pattern.search(link.get('href', ''))
    if id_match is None:
        # The link does not point at a player page.
        continue

    player_list.append({'name': link.contents[0], 'id': id_match.group(1)})

player_list

[{'id': '35320', 'name': 'SR Tendulkar'},
 {'id': '7133', 'name': 'RT Ponting'},
 {'id': '45789', 'name': 'JH Kallis'},
 {'id': '28114', 'name': 'R Dravid'},
 {'id': '50710', 'name': 'KC Sangakkara'},
 {'id': '52337', 'name': 'BC Lara'},
 {'id': '51469', 'name': 'S Chanderpaul'},
 {'id': '49289', 'name': 'DPMD Jayawardene'},
 {'id': '4174', 'name': 'AR Border'},
 {'id': '8192', 'name': 'SR Waugh'},
 {'id': '28794', 'name': 'SM Gavaskar'},
 {'id': '11728', 'name': 'AN Cook'},
 {'id': '47270', 'name': 'GC Smith'},
 {'id': '43652', 'name': 'Younis Khan'},
 {'id': '13399', 'name': 'GA Gooch'},
 {'id': '40879', 'name': 'Javed Miandad'},
 {'id': '40570', 'name': 'Inzamam-ul-Haq'},
 {'id': '30750', 'name': 'VVS Laxman'},
 {'id': '4578', 'name': 'MJ Clarke'},
 {'id': '5616', 'name': 'ML Hayden'},
 {'id': '35263', 'name': 'V Sehwag'},
 {'id': '52812', 'name': 'IVA Richards'},
 {'id': '20372', 'name': 'AJ Stewart'},
 {'id': '13418', 'name': 'DI Gower'},
 {'id': '19296', 'name': 'KP Pietersen'},


In [86]:
# Work with the first player in the scraped list as the example.
player_id = player_list[0]['id']

# Innings-by-innings view (view=innings) for this player's batting record.
innings_list_url = "http://stats.espncricinfo.com/ci/engine/player/" + player_id + ".html?class=1;orderby=runs;template=results;type=batting;view=innings"

# Time out rather than hang, and fail loudly on an HTTP error instead of
# trying to index into the tables of an error page.
player_r = requests.get(innings_list_url, timeout=30)
player_r.raise_for_status()
player_soup = BeautifulSoup(player_r.content, 'html5lib')

# Index 3 selects the "engineTable" that holds one row per innings.
player_table = player_soup.find_all("table", class_="engineTable")[3].tbody
player_table

<tbody>
 <tr class="data1">
  <td class="padAst">15</td>
  <td>28</td>
  <td>24</td>
  <td>2</td>
  <td>0</td>
  <td>62.50</td>
  <td>6</td>
  <td>bowled</td>
  <td>2</td>
  <td></td>
  <td class="left" nowrap="nowrap">v <a class="data-link" href="/ci/content/team/7.html">Pakistan</a></td>
  <td class="left" nowrap="nowrap"><a class="data-link" href="/ci/content/ground/58956.html">Karachi</a></td>
  <td nowrap="nowrap"><b>15 Nov 1989</b></td>
  <td style="white-space: nowrap;"><a href="/ci/engine/match/63513.html" title="view the scorecard for this row">Test # 1127</a></td>
 </tr>
 <tr class="data1">
  <td class="padAst">DNB</td>
  <td nowrap="nowrap">-</td>
  <td nowrap="nowrap">-</td>
  <td nowrap="nowrap">-</td>
  <td nowrap="nowrap">-</td>
  <td class="padDp2" nowrap="nowrap">-</td>
  <td nowrap="nowrap">-</td>
  <td nowrap="nowrap">-</td>
  <td>4</td>
  <td></td>
  <td class="left" nowrap="nowrap">v <a class="data-link" href="/ci/content/team/7.html">Pakistan</a></td>
  <td class=

In [108]:
def _to_int(text):
    """Return *text* as an int, or None when it is not a plain number.

    Any ``*`` in the text is dropped first (scores can carry one), so
    ``'59*'`` becomes ``59`` while ``'-'``, ``'DNB'`` or ``None`` give None.
    """
    cleaned = (text or '').replace('*', '').strip()
    try:
        return int(cleaned)
    except ValueError:
        return None


innings_list = []
cumulative_runs = 0

# NOTE(review): the running total assumes the rows arrive in chronological
# order -- confirm the page is not re-sorted by the orderby=runs parameter.
for row in player_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) < 13:
        # Not a full innings row; the fields below index up to cells[12].
        continue

    runs = _to_int(cells[0].string)
    if runs is None:
        # 'DNB', 'TDNB' or any other non-numeric marker: no score to record.
        continue

    cumulative_runs += runs

    innings = {}
    innings['cum_runs'] = cumulative_runs
    innings['runs'] = runs
    # These cells can hold '-' instead of a number; store None in that case
    # rather than aborting the whole scrape with a ValueError.
    innings['mins'] = _to_int(cells[1].string)
    innings['balls'] = _to_int(cells[2].string)
    innings['fours'] = _to_int(cells[3].string)
    innings['sixes'] = _to_int(cells[4].string)
    innings['strike_rate'] = cells[5].string
    innings['position'] = cells[6].string
    innings['how_out'] = cells[7].string
    innings['innings'] = cells[8].string
    # cells[9] is an empty spacer column in the table, so it is skipped.
    innings['opponent'] = cells[10].find('a').contents[0]
    innings['venue'] = cells[11].find('a').contents[0]
    innings['date'] = cells[12].string

    innings_list.append(innings)

innings_list

[{'balls': '24',
  'cum_runs': 15,
  'date': '15 Nov 1989',
  'fours': '2',
  'how_out': 'bowled',
  'innings': '2',
  'mins': '28',
  'opponent': 'Pakistan',
  'position': '6',
  'runs': '15',
  'sixes': '0',
  'strike_rate': '62.50',
  'venue': 'Karachi'},
 {'balls': '172',
  'cum_runs': 74,
  'date': '23 Nov 1989',
  'fours': '4',
  'how_out': 'lbw',
  'innings': '1',
  'mins': '254',
  'opponent': 'Pakistan',
  'position': '6',
  'runs': '59',
  'sixes': '0',
  'strike_rate': '34.30',
  'venue': 'Faisalabad'},
 {'balls': '16',
  'cum_runs': 82,
  'date': '23 Nov 1989',
  'fours': '1',
  'how_out': 'run out',
  'innings': '3',
  'mins': '24',
  'opponent': 'Pakistan',
  'position': '6',
  'runs': '8',
  'sixes': '0',
  'strike_rate': '50.00',
  'venue': 'Faisalabad'},
 {'balls': '90',
  'cum_runs': 123,
  'date': '1 Dec 1989',
  'fours': '5',
  'how_out': 'bowled',
  'innings': '1',
  'mins': '124',
  'opponent': 'Pakistan',
  'position': '7',
  'runs': '41',
  'sixes': '0',
  'stri

In [118]:
# Full text of the first cell in the first summary row; the team appears in
# parentheses after the player's name.
name_team = summary_table.find_all('tr')[0].find_all('td')[0].get_text()
# Raw string so that \( and \) reach the regex engine as escaped parentheses
# instead of being invalid Python string escapes.
re.search(r'\((.+?)\)', name_team).group(1)

'India'