In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression

In [2]:
def download(first_page, last_page, timeout=30):
    """Scrape batting-innings result pages from the ESPNcricinfo stats engine.

    Parameters
    ----------
    first_page : int
        First result page to fetch (inclusive).
    last_page : int
        Stop page, exclusive as in ``range``: ``download(1, 3)`` fetches
        pages 1 and 2.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        The rows of every fetched page stacked vertically, with an extra
        ``Page`` column holding the page number each row came from. Each
        page keeps its own row labels, so index values repeat across pages.

    Raises
    ------
    ValueError
        If the page range is empty (``first_page >= last_page``).
    requests.HTTPError
        If the server answers with an error status.
    """
    if first_page >= last_page:
        raise ValueError(
            f'Empty page range: first_page={first_page}, last_page={last_page} '
            f'(last_page is exclusive)'
        )

    dfs = []
    for i in range(first_page, last_page):
        url = f'http://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;orderby=start;page={i};template=results' \
              f';type=batting;view=innings;wrappertype=print'
        # A timeout keeps a stalled connection from hanging the notebook forever.
        page = requests.get(url, timeout=timeout)
        # Fail loudly on an error response instead of trying to parse an error page.
        page.raise_for_status()
        # The third table with class "engineTable" is the one parsed
        # (presumably the results grid -- confirm if the page layout changes).
        html_table = BeautifulSoup(page.text, 'html.parser').find_all('table', class_='engineTable')[2]
        # read_html expects a file-like object; passing literal HTML is deprecated.
        df = pd.read_html(StringIO(str(html_table)), header=0)[0]
        dfs.append(df.assign(Page=i))
    return pd.concat(dfs, axis=0)

In [3]:
# Fetch result pages 1 and 2 (the upper bound is exclusive) and preview the raw table.
dl = download(first_page=1, last_page=3)
dl

Unnamed: 0,Player,Runs,Mins,BF,4s,6s,SR,Inns,Unnamed: 8,Opposition,Ground,Start Date,Page
0,C Bannerman (AUS),165*,285,-,18,0,-,1,,Test v England,Melbourne,15 Mar 1877,1
1,NFD Thomson (AUS),1,-,-,0,0,-,1,,Test v England,Melbourne,15 Mar 1877,1
2,TP Horan (AUS),12,-,-,-,0,-,1,,Test v England,Melbourne,15 Mar 1877,1
3,DW Gregory (AUS),1,-,-,0,0,-,1,,Test v England,Melbourne,15 Mar 1877,1
4,BB Cooper (AUS),15,-,-,-,0,-,1,,Test v England,Melbourne,15 Mar 1877,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,T Emmett (ENG),0,1,1,0,0,0.00,1,,Test v Australia,Melbourne,2 Jan 1879,2
46,CA Absolom (ENG),52,-,-,5,0,-,1,,Test v Australia,Melbourne,2 Jan 1879,2
47,L Hone (ENG),7,-,-,-,0,-,1,,Test v Australia,Melbourne,2 Jan 1879,2
48,SS Schultz (ENG),0*,-,-,0,0,-,1,,Test v Australia,Melbourne,2 Jan 1879,2


In [4]:
# Turn the raw scrape into a tidy frame of prefixed columns:
#   mt_* match attributes, in_* innings attributes, pl_* player attributes,
#   pf_* per-innings performance figures.
# Only the prefixed columns survive the final `.filter(like='_')`.
# NOTE(review): only the literal 'DNB' marker is handled in Runs; any other
# non-numeric marker would make the int cast fail -- confirm against the data.
# NOTE(review): innings are grouped by (start date, team code, innings no.),
# which assumes a team never starts two matches on the same date.
df = (
    dl
    # The scrape keeps page-local row labels, so labels repeat across pages.
    # Make them unique first so the index-aligned pl_inning_no assignment
    # below cannot fail or misalign when sorting reorders rows.
    .reset_index(drop=True)
    .assign(
        mt_start_date = lambda x: x['Start Date'].astype('datetime64[ns]'),
        mt_ground = lambda x: x['Ground'],
        # expand=False makes single-group extracts return a Series, not a one-column frame.
        mt_format = lambda x: x['Opposition'].str.extract(r'^(.*?)\sv', expand=False),
        mt_team_code = lambda x: x['Player'].str.extract(r'(?<=\()(.+?)(?=\))', expand=False),
        mt_opposition = lambda x: x['Opposition'].str.extract(r'v\s(.*)', expand=False),
        in_no = lambda x: x['Inns'].astype('int'),
        pl_name = lambda x: x['Player'].str.extract(r'^(.*?)\s\(', expand=False),
        # Strip the not-out marker literally; regex=False is explicit because the
        # default for `regex` differs between pandas versions.
        pf_runs = lambda x: np.where(x['Runs'] == 'DNB', 0, x['Runs'].str.replace('*', '', regex=False)).astype('int'),
        # '-' means the figure was not recorded -> NaN; did-not-bat rows count as 0.
        pf_balls = lambda x: np.where(x['Runs'] == 'DNB', 0, np.where(x['BF'] == '-', np.nan, x['BF'])).astype('float'),
        pf_mins = lambda x: np.where(x['Runs'] == 'DNB', 0, np.where(x['Mins'] == '-', np.nan, x['Mins'])).astype('float'),
        pf_fours = lambda x: np.where(x['Runs'] == 'DNB', 0, np.where(x['4s'] == '-', np.nan, x['4s'])).astype('float'),
        pf_sixes = lambda x: np.where(x['Runs'] == 'DNB', 0, np.where(x['6s'] == '-', np.nan, x['6s'])).astype('float'),
        # A trailing '*' on the score marks a not-out innings.
        pf_no = lambda x: np.where(x['Runs'].str.contains('*', regex=False), True, False).astype('bool'),
        pf_dnb = lambda x: np.where(x['Runs'] == 'DNB', True, False).astype('bool'),
        # True for every row whose score equals the maximum of its team innings (ties all flagged).
        pf_highest_batsman = lambda x: x.groupby(['mt_start_date', 'mt_team_code', 'in_no'])['pf_runs'].transform('max') == x['pf_runs'],
        # Position within the team innings, taken from row order in the scrape.
        pl_batting_order = lambda x: x.groupby(['mt_start_date', 'mt_team_code', 'in_no']).cumcount() + 1,
        # Running count of a player's innings in chronological order.
        pl_inning_no = lambda x: x.sort_values(['mt_start_date', 'in_no']).groupby(['pl_name']).cumcount() + 1
    )
    .filter(like = '_')
    .rename_axis('in_id')
)

df

Unnamed: 0_level_0,mt_start_date,mt_ground,mt_format,mt_team_code,mt_opposition,in_no,pl_name,pf_runs,pf_balls,pf_mins,pf_fours,pf_sixes,pf_no,pf_dnb,pf_highest_batsman,pl_batting_order,pl_inning_no
in_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1877-03-15,Melbourne,Test,AUS,England,1,C Bannerman,165,,285.0,18.0,0.0,True,False,True,1,1
1,1877-03-15,Melbourne,Test,AUS,England,1,NFD Thomson,1,,,0.0,0.0,False,False,False,2,1
2,1877-03-15,Melbourne,Test,AUS,England,1,TP Horan,12,,,,0.0,False,False,False,3,1
3,1877-03-15,Melbourne,Test,AUS,England,1,DW Gregory,1,,,0.0,0.0,False,False,False,4,1
4,1877-03-15,Melbourne,Test,AUS,England,1,BB Cooper,15,,,,0.0,False,False,False,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1879-01-02,Melbourne,Test,ENG,Australia,1,T Emmett,0,1.0,1.0,0.0,0.0,False,False,False,8,5
96,1879-01-02,Melbourne,Test,ENG,Australia,1,CA Absolom,52,,,5.0,0.0,False,False,True,9,1
97,1879-01-02,Melbourne,Test,ENG,Australia,1,L Hone,7,,,,0.0,False,False,False,10,1
98,1879-01-02,Melbourne,Test,ENG,Australia,1,SS Schultz,0,,,0.0,0.0,True,False,False,11,1


# Logistic Regression

In [5]:
# Logistic-regression classifier fitted with the L-BFGS solver.
model = LogisticRegression(solver='lbfgs')

In [6]:
# Fit "was this the top score of the innings?" from batting position alone.
model.fit(
    df['pl_batting_order'].to_numpy().reshape(-1, 1),   # single feature as an (n, 1) column
    df['pf_highest_batsman'].to_numpy().astype('int'),  # boolean target encoded as 0/1
)

LogisticRegression()

In [7]:
# Attach the fitted probability of the positive class (column 1 of
# predict_proba, i.e. label 1 = top scorer) as a new column. `assign` builds
# a new frame rather than mutating the one displayed by an earlier cell.
df = df.assign(
    pr_highest_batsman=model.predict_proba(
        df['pl_batting_order'].values.reshape(-1, 1)
    )[:, 1]
)
df

Unnamed: 0_level_0,mt_start_date,mt_ground,mt_format,mt_team_code,mt_opposition,in_no,pl_name,pf_runs,pf_balls,pf_mins,pf_fours,pf_sixes,pf_no,pf_dnb,pf_highest_batsman,pl_batting_order,pl_inning_no,pr_highest_batsman
in_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1877-03-15,Melbourne,Test,AUS,England,1,C Bannerman,165,,285.0,18.0,0.0,True,False,True,1,1,0.229282
1,1877-03-15,Melbourne,Test,AUS,England,1,NFD Thomson,1,,,0.0,0.0,False,False,False,2,1,0.188657
2,1877-03-15,Melbourne,Test,AUS,England,1,TP Horan,12,,,,0.0,False,False,False,3,1,0.153793
3,1877-03-15,Melbourne,Test,AUS,England,1,DW Gregory,1,,,0.0,0.0,False,False,False,4,1,0.124385
4,1877-03-15,Melbourne,Test,AUS,England,1,BB Cooper,15,,,,0.0,False,False,False,5,1,0.099936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1879-01-02,Melbourne,Test,ENG,Australia,1,T Emmett,0,1.0,1.0,0.0,0.0,False,False,False,8,5,0.050349
96,1879-01-02,Melbourne,Test,ENG,Australia,1,CA Absolom,52,,,5.0,0.0,False,False,True,9,1,0.039791
97,1879-01-02,Melbourne,Test,ENG,Australia,1,L Hone,7,,,,0.0,False,False,False,10,1,0.031374
98,1879-01-02,Melbourne,Test,ENG,Australia,1,SS Schultz,0,,,0.0,0.0,True,False,False,11,1,0.024691


In [8]:
# Mean accuracy on the same rows the model was fitted on (in-sample, no
# hold-out), so this is not an estimate of out-of-sample performance.
# Labels are cast to 0/1 exactly as they were when fitting.
model.score(
    df['pl_batting_order'].values.reshape(-1, 1),
    df['pf_highest_batsman'].values.astype('int'),
)

0.9

# RNN