To obtain match-level data, I first scraped search results from ESPNCricinfo using the Python libraries `requests` and `BeautifulSoup`. The resulting JSON files were organized by match type and contained ESPNCricinfo's unique match ID numbers.

In [None]:
import requests
import math
import time
import json
from bs4 import BeautifulSoup, UnicodeDammit

# Number of hits per search page used to turn the hit count into a page count.
RESULTS_PER_PAGE = 20

# Search terms whose output file name differs from the term itself.
FILE_NAME_OVERRIDES = {'list': 'list-a', 'first': 'first-class'}

# For every search term: page through the ESPNCricinfo search results, collect
# the match id embedded in each result link, and dump the ids to
# matches-<name>.json.
for match_type in ['list', 'first', 'odi', 'test', 't20i', 't20']:
    results = []

    # The first page is fetched only to read the last pagination label, which
    # is treated as the total number of hits (assumption carried over from the
    # page layout this script was written against).
    r = requests.get('http://search.espncricinfo.com/ci/content/match/search.html?all=1;page=0;search=' + match_type)
    soup = BeautifulSoup(r.text, "html.parser")
    last_match = int(soup.find_all('span', attrs={'class':'PaginationNmbrs'})[-1].text)
    last_page = int(math.ceil(last_match / float(RESULTS_PER_PAGE)))

    for i in range(last_page):
        time.sleep(1)  # one-second pause before every page request
        results_page = requests.get("http://search.espncricinfo.com/ci/content/match/search.html?search={0};all=1;page={1}".format(match_type, i))
        soupy = BeautifulSoup(results_page.text, "html.parser")
        for link in soupy.find_all('a', {'class' : 'srchPlyrNmTxt'}):
            try:
                href = UnicodeDammit(link['href']).unicode_markup
                # The id is the fifth '/'-separated segment with its file
                # extension removed (presumably hrefs look like
                # /ci/engine/match/<id>.html -- verify against a live page).
                match_id = href.split("/")[4].split('.')[0]
            except (KeyError, IndexError):
                # Anchor without an href, or an href too short to hold an id:
                # skip this link instead of aborting the whole scrape.
                continue
            print(match_id)
            results.append(match_id)

    file_name = FILE_NAME_OVERRIDES.get(match_type, match_type)

    with open("matches-{0}.json".format(file_name), "w") as f:
        json.dump(results, f)

Next, I combined the match-type-specific files into a single JSON file. I used a purpose-built Python library (https://github.com/dwillis/python-espncricinfo) to retrieve data on the matches. The script ensured that each match ID was retrieved only once. If a match was not yet completed, did not have any JSON data, or was missing a scorecard, it was excluded from the results, which were written to a comma-separated values (CSV) file.

In [None]:
import time
import csv
import json
from espncricinfo.match import Match
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError

# Column order of final_output.csv; every row written below follows it.
# NOTE: "referree_id" is spelled this way in the emitted header and is kept
# unchanged so existing consumers of the CSV keep working.
headers = ["team1", "team1_id", "team2", "team2_id", "win_toss", "bat_or_bowl", "outcome", "win_game", "date", "day_n_night", "ground", "rain", "duckworth_lewis", "match_id", "type_of_match", "match_type_id", "home_team_id", "umpire_1_id", "umpire_1_name", "umpire_1_country", "umpire_2_id", "umpire_2_name", "umpire_2_country", "tv_umpire_id", "tv_umpire_name", "tv_umpire_country", "referree_id", "referee_name", "referee_country", "url"]

# Stand-in for an official slot that a match does not fill. csv.writer emits
# None as an empty cell.
NO_OFFICIAL = {'object_id': None, 'known_as': None, 'team_name': None}


def _officials_with_role(officials, role):
    """Return the entries of *officials* whose 'player_type_name' equals *role*."""
    return [o for o in officials if o['player_type_name'] == role]


with open('all_matches.json') as matches_file:
    matches = json.load(matches_file)
matches = list(set(matches)) # dedupe

# Match ids to skip (e.g. rows already written by an earlier run). Nothing in
# this script populates it, so it starts empty; fill it here to resume a
# partial run.
already_loaded = set()

bad_matches = []

##################################START PROCESSING DATA#########################################

# The file must be opened for writing; newline="" is what the csv module
# documents for files handed to csv.writer.
with open("final_output.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    for match in matches:
        if match in already_loaded:
            continue
        print(match)
        time.sleep(0.5)  # pause between matches (presumably to space out requests)
        try:
            m = Match(int(match))
            match_info = m.match_json()

            # Skip matches that have not been played or have no result yet.
            if match_info['match_status'] == 'forthcoming':
                continue
            if m.result == '':
                continue

            duckworth_lewis = 1 if m.rain_rule == 'D/L method' else 0

            # A match without a named second team is skipped silently (it is
            # not recorded in bad_matches).
            try:
                team_2_name = m.team_2['team_name']
            except KeyError:
                continue

            if m.match_class == '':
                # No match class on record: fall back to the series description.
                series_text = m.comms_json['props']['pageProps']['data']['content']['about']['series']['text']
                if 'T20' in series_text:
                    type_of_match = 'T20'
                elif 'ODI' in series_text:
                    type_of_match = 'ODI'
                else:
                    type_of_match = None
            else:
                type_of_match = m.match_class

            # Same silent skip for a missing first team.
            try:
                team_1_name = m.team_1['team_name']
            except KeyError:
                continue

            if match_info['international_class_card'] != "":
                match_type_id = match_info['international_class_id']
            else:
                match_type_id = match_info['general_class_id']

            # Derive every official from this match's own list so nothing is
            # left unbound (or carried over from the previous match) when the
            # list is empty.
            officials = m.officials
            umpires = _officials_with_role(officials, 'umpire')
            tv_ump = _officials_with_role(officials, 'tv umpire')
            match_ref = _officials_with_role(officials, 'referee')

            ump_1 = umpires[0] if umpires else NO_OFFICIAL
            # The second umpire is recorded only when exactly two are listed.
            ump_2 = umpires[1] if len(umpires) == 2 else NO_OFFICIAL
            tvu = tv_ump[0] if tv_ump else NO_OFFICIAL
            ref = match_ref[0] if match_ref else NO_OFFICIAL

            writer.writerow([
                team_1_name, m.team_1_id, team_2_name, m.team_2_id,
                m.toss_winner_team_id, m.toss_decision, m.result,
                match_info['winner_team_id'], m.date, m.lighting,
                m.ground_name,
                None,  # "rain" column is always left empty
                duckworth_lewis, match, type_of_match, match_type_id,
                match_info['home_team_id'],
                ump_1['object_id'], ump_1['known_as'], ump_1['team_name'],
                ump_2['object_id'], ump_2['known_as'], ump_2['team_name'],
                tvu['object_id'], tvu['known_as'], tvu['team_name'],
                ref['object_id'], ref['known_as'], ref['team_name'],
                m.match_url,
            ])
        except (json.JSONDecodeError, NoScorecardError, MatchNotFoundError, KeyError):
            # Unparseable or incomplete match data: remember the id and move on.
            bad_matches.append(match)
            continue

##################################FINISHED#########################################
print("The following matches could not be parsed: ")
print(bad_matches)
print("DONE.")

Similarly, for ICC team rankings I used a Python scraper to obtain monthly rankings from archived pages on the ICC site. More recent rankings were obtained from a JSON API published by the ICC.

In [None]:
import csv
import requests
from BeautifulSoup import BeautifulSoup

# Upper-case English month name -> calendar month number (1 for JANUARY
# through 12 for DECEMBER). Numbers come from each name's position in the list.
MONTH_MAPPING = {
    name: number
    for number, name in enumerate(
        [
            'JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE',
            'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER'
        ],
        1
    )
}

# Years requested for each match format. Both formats stop at the same final
# year; range() excludes its end, hence the "+ 1".
_FINAL_RANKING_YEAR = 2013

FORMAT_RANGES = {
    'test': range(1952, _FINAL_RANKING_YEAR + 1),
    'odi': range(1981, _FINAL_RANKING_YEAR + 1)
}

def parser(format):
    """Write all rankings for *format* to ../data/rankings_<format>.csv.

    A header row is written first, then the rows returned by
    fetch_rankings() for each year in FORMAT_RANGES[format], in year order.

    format -- a key of FORMAT_RANGES ('test' or 'odi').

    NOTE(review): the file is opened in binary mode, which is what Python 2's
    csv module expects; under Python 3 csv.writer needs a text-mode file.
    """
    out_path = "../data/rankings_%s.csv" % format
    with open(out_path, "wb") as csvfile:
        out = csv.writer(csvfile)
        out.writerow(['format', 'year', 'month', 'rank', 'country', 'rating'])
        for year in FORMAT_RANGES[format]:
            out.writerows(fetch_rankings(year, format))

def fetch_rankings(year, format):
    """Scrape one year of rankings for *format* from the archived ICC page.

    Month links and ranking tables found on the page are paired up in
    document order; each data row of a table becomes one result row.

    Returns a list of [format, year, month, rank, country, rating] lists,
    with month, rank and rating as ints.
    Raises KeyError if a month link's text is not a key of MONTH_MAPPING, and
    ValueError if a row does not hold exactly three cells or its rank/rating
    is not an integer.
    """
    url = "http://web.archive.org/web/20130320093711/http://www.icc-cricket.com/match_zone/%s_ranking.php?year=%s" % (format, year)
    page = BeautifulSoup(requests.get(url).text)

    # Month links are located by their inline style attribute.
    month_numbers = [
        MONTH_MAPPING[link.text]
        for link in page.findAll('a', attrs={'style':'color:#666666;'})
    ]
    ranking_tables = page.findAll('table', attrs={'class':'dataBox topMargin'})

    rows = []
    for month, table in zip(month_numbers, ranking_tables):
        # The first and last <tr> are dropped (presumably header and footer
        # rows -- confirm against the archived markup).
        for tr in table.findAll('tr')[1:-1]:
            cells = [td.text for td in tr.findAll('td')]
            rank, country, rating = cells
            rows.append([format, year, month, int(rank), country, int(rating)])
    return rows

def fetch_current_rankings(format):
    """
    Return the 'rankings' value of the first entry in the ICC rankings API
    response whose 'matchType' equals format upper-cased.

    format is one of: 'test', 'odi' or 't20i'

    Raises IndexError when no entry matches.
    """
    response = requests.get("http://www.icc-cricket.com/api/getRankings")
    entries = response.json()
    matching = [
        entry['rankings']
        for entry in entries
        if entry['matchType'] == format.upper()
    ]
    return matching[0]