# Survival Analysis - test cricket

### Setup and loading data

In [1]:
import lifelines
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline


Load data from https://cricsheet.org/

In [2]:
# Collect the match CSVs. Only names that END in '.csv' are kept (a substring
# test would also accept e.g. 'x.csv.bak'), and the list is sorted because
# os.listdir returns entries in arbitrary order while match ids are later
# assigned by position in this list.
files = sorted(f for f in os.listdir('test_csv') if f.endswith('.csv'))

In [None]:
# Column names applied to every row; the files are read without a header line.
cric_cols = ['type','innings','ball','team','bat_1','bat_2','bowl','runs','byes','wicket','player_out']

# Read each match into its own frame and concatenate once at the end.
# Growing a DataFrame inside the loop copies all previous rows on every
# iteration, and DataFrame.append no longer exists in pandas 2.x.
match_frames = []
for i, file in enumerate(files):
    # on_bad_lines='warn' skips malformed rows with a warning; it replaces the
    # removed error_bad_lines=False (whose default was also to warn).
    df = pd.read_csv(os.path.join('test_csv', file), on_bad_lines='warn', names=cric_cols)
    df['match_id'] = i  # position of the file in `files` identifies the match
    match_frames.append(df)

if match_frames:
    # Row labels are deliberately NOT reset: each match keeps its own
    # 0..n-1 row numbering from its file.
    cric_df = pd.concat(match_frames, sort=False)
else:
    # No input files: keep an empty frame with the expected columns so that
    # later column lookups do not fail with a KeyError.
    cric_df = pd.DataFrame(columns=cric_cols + ['match_id'])
    

For now, drop match metadata (note this could be used to generate features).

In [None]:

# Keep only the rows whose type is 'ball'. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
cric_df = cric_df[cric_df['type'] == 'ball'].copy()
cric_df.head()



### Calculating innings-level stats

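Was the player out? A ball counts as a dismissal only when `player_out` matches `bat_1`; dismissals of the other batter (presumably run-outs at the non-striker's end) are ignored for now, which may be the better choice anyway.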

In [None]:
# True on rows where the dismissed player is the batter named in bat_1.
cric_df['is_out'] = cric_df['player_out'].eq(cric_df['bat_1'])


What is the first ball faced by the player? (Used to find their place in the batting order.)

In [None]:
# Copy the row labels into an ordinary column so they can be aggregated
# like any other field.
cric_df['first_ball'] = cric_df.index.values

Aggregate to innings-level granularity

In [None]:
# One output row per (team, match, innings, batter in bat_1).
innings_keys = ['team', 'match_id', 'innings', 'bat_1']
innings_stats = {
    'runs': 'sum',        # total of the runs column
    'is_out': 'sum',      # count of rows flagged is_out
    'first_ball': 'min',  # smallest first_ball value in the group
}
innings_totals = cric_df.groupby(innings_keys).agg(innings_stats)


What number did the player bat? Calculated by ranking each batter's `first_ball` within their team's innings.

In [None]:
# Columns that identify one team's innings in one match.
batting_side_keys = ['team', 'match_id', 'innings']

# Turn the grouping keys back into ordinary columns. This is done only while
# the grouped MultiIndex is still present, so re-running the cell does not
# stack an extra 'index' column onto the frame.
if isinstance(innings_totals.index, pd.MultiIndex):
    innings_totals = innings_totals.reset_index()
innings_totals['innings'] = pd.to_numeric(innings_totals['innings'])

# Batting position = rank of the batter's first_ball within the innings.
# Only first_ball is ranked; ranking every column just to keep one is wasted
# work and can trip over non-numeric columns.
innings_totals['bat_number'] = innings_totals.groupby(batting_side_keys)['first_ball'].rank()

innings_totals.head()

### Simple histograms

In [None]:
# Histogram of the per-innings run totals, labelled so the figure can be
# read on its own.
ax = innings_totals['runs'].hist()
ax.set_title("Runs per batter innings")
ax.set_xlabel("Runs")
ax.set_ylabel("Number of innings");


In [None]:
# Histogram of the summed is_out flag per innings, labelled so the figure
# can be read on its own.
ax = innings_totals['is_out'].hist()
ax.set_title("is_out per batter innings")
ax.set_xlabel("is_out (summed dismissal flag)")
ax.set_ylabel("Number of innings");

### KMF model

In [None]:
# Kaplan-Meier fit over every innings: runs are passed as the duration and
# is_out as the event indicator.
kmf = lifelines.KaplanMeierFitter()
kmf.fit(
    durations=innings_totals['runs'],
    event_observed=innings_totals['is_out'],
    label="All players Kaplan Meier",
)

Plotting the output

In [None]:
# Only the last expression of a cell is rendered automatically, so the
# survival table and the median are handed to display() explicitly
# (display is provided by the IPython kernel).
display(kmf.survival_function_)
# NOTE(review): newer lifelines releases appear to expose this value as
# `median_survival_time_` - confirm against the installed version.
display(kmf.median_)
kmf.plot()

Comparing survival curve for different innings

In [None]:

# Kaplan-Meier curves for innings 1-4, overlaid on one set of axes.
fig, ax = plt.subplots()

for i in range(1, 5):
    data = innings_totals[innings_totals["innings"] == i]
    if data.empty:
        # No rows for this innings number, so there is nothing to fit.
        continue
    # A fresh fitter per curve: refitting the shared `kmf` would silently
    # replace the all-players fit that other cells read from it.
    innings_kmf = lifelines.KaplanMeierFitter()
    innings_kmf.fit(data['runs'], event_observed=data['is_out'], label="Innings: {}".format(i))
    innings_kmf.plot(ax=ax, ci_force_lines=True)

ax.set_ylim(0, 1)
ax.set_xlabel("Runs scored")
ax.set_ylabel("Share of batters not yet out")
ax.set_title("Survival for different innings");

And for different batting numbers

In [None]:
# Kaplan-Meier curves for batting positions 1-4, overlaid on one set of axes.
fig, ax = plt.subplots()

for i in range(1, 5):
    data = innings_totals[innings_totals["bat_number"] == i]
    if data.empty:
        # No rows at this batting position, so there is nothing to fit.
        continue
    # A fresh fitter per curve: refitting the shared `kmf` would silently
    # replace the all-players fit that other cells read from it.
    position_kmf = lifelines.KaplanMeierFitter()
    position_kmf.fit(data['runs'], event_observed=data['is_out'], label="Batter: {}".format(i))
    position_kmf.plot(ax=ax, ci_force_lines=True)

ax.set_ylim(0, 1)
ax.set_xlabel("Runs scored")
ax.set_ylabel("Share of batters not yet out")
ax.set_title("Survival for batsmen 1-4 ");

In [None]:
# Kaplan-Meier curves for batting positions 5-8, overlaid on one set of axes.
fig, ax = plt.subplots()

for i in range(5, 9):
    data = innings_totals[innings_totals["bat_number"] == i]
    if data.empty:
        # No rows at this batting position, so there is nothing to fit.
        continue
    # A fresh fitter per curve: refitting the shared `kmf` would silently
    # replace the all-players fit that other cells read from it.
    position_kmf = lifelines.KaplanMeierFitter()
    position_kmf.fit(data['runs'], event_observed=data['is_out'], label="Bat Number: {}".format(i))
    position_kmf.plot(ax=ax, ci_force_lines=True)

ax.set_ylim(0, 1)
ax.set_xlabel("Runs scored")
ax.set_ylabel("Share of batters not yet out")
ax.set_title("Survival for batsmen 5-8 ");

In [None]:
# Kaplan-Meier curves for batting positions 9-11, overlaid on one set of axes.
fig, ax = plt.subplots()

for i in range(9, 12):
    data = innings_totals[innings_totals["bat_number"] == i]
    if data.empty:
        # No rows at this batting position, so there is nothing to fit.
        continue
    # A fresh fitter per curve: refitting the shared `kmf` would silently
    # replace the all-players fit that other cells read from it.
    position_kmf = lifelines.KaplanMeierFitter()
    position_kmf.fit(data['runs'], event_observed=data['is_out'], label="Bat Number: {}".format(i))
    position_kmf.plot(ax=ax, ci_force_lines=True)

ax.set_ylim(0, 1)
ax.set_xlabel("Runs scored")
ax.set_ylabel("Share of batters not yet out")
ax.set_title("Survival for batsmen 9-11 ");

### CPH model

Train model

In [None]:
# Cox proportional-hazards fit with innings and bat_number as covariates,
# runs as the duration and is_out as the event flag.
cph = lifelines.CoxPHFitter()

# get_dummies one-hot encodes any non-numeric column before fitting.
cox_data = pd.get_dummies(innings_totals[['innings', 'bat_number', 'runs', 'is_out']])

cph.fit(cox_data, duration_col='runs', event_col='is_out')


Evaluate outcomes

In [None]:
# Plot the fitted Cox model.
# NOTE(review): in lifelines this presumably draws the estimated coefficients
# with their confidence intervals - confirm against the installed version.
cph.plot()

In [None]:
# Print the summary of the fitted Cox model held in `cph`.
cph.print_summary()

In [None]:
# Covariate columns only (duration and event columns removed).
X = cox_data.drop(['runs', 'is_out'], axis=1)

# Hand-picked scenarios to predict for. The second scenario was labelled
# "innings 2" while its data said innings 1; the values now match the labels.
d = {'innings': [1, 2, 4], 'bat_number': [1, 3, 10]}

# Row labels are generated from the values so they cannot drift apart again.
case_labels = [
    'innings {} - number {}'.format(innings_no, bat_no)
    for innings_no, bat_no in zip(d['innings'], d['bat_number'])
]
test_cases = pd.DataFrame(data=d, index=case_labels)
test_cases

In [None]:
# Predict for specific outcomes / scores

In [None]:
# Survival curves predicted by the Cox model for the rows of test_cases.
predicted_curves = cph.predict_survival_function(test_cases)
predicted_curves.plot()