# Understand the data
https://thecrickettest-peterbarraud.notebooks.azure.com/j/notebooks/understanding-batters-data.ipynb

# base data
Load the base data from the **ESPNCricinfo** `CSV`.
On the way in we do some simple *heading renaming*
and apply explicit *dtype settings*.
We also replace every `-` placeholder with `NaN`.

In [117]:
import pandas as pd
import numpy as np
# Names assigned positionally to the CSV columns; the file's own header row is
# skipped (skiprows=1) and replaced by these.
col_names = ['playername','runswithasterisk','runs','mins','batted','notout','ballsfaced','fours','sixes', 'strikerate','innings',
             'opposition','ground','date','country','fifties','hundreds', 'runbuckets','numoversbowled','oversbowled','maidens','runsconceded',
             'wickets','fourwickets', 'fivewickets','tenwickets','wicketbuckets','economyrate']

# Subset of the columns above that is actually loaded into the frame.
use_cols = ['playername', 'runs','mins', 'notout','ballsfaced','fours','sixes', 'innings','opposition','ground','date','country',
           'numoversbowled', 'maidens','runsconceded', 'wickets']

# Text columns use the builtin `object`: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# Numeric columns are floats rather than ints because '-' is read as NaN below.
# NOTE(review): float16 represents integers exactly only up to 2048 - confirm
# that no loaded column can exceed that.
col_types = {'playername': object, 'runs': np.float16, 'mins': np.float16, 'notout': np.float16, 'ballsfaced': np.float16, 'fours': np.float16,
             'sixes': np.float16, 'innings': np.float16, 'opposition': object, 'ground': object, 'date': object, 'country': object,
             'numoversbowled': object, 'maidens': np.float16, 'runsconceded': np.float16, 'wickets': np.float16}


# '-' is the file's placeholder for "no value"; it is loaded as NaN.
df = pd.read_csv('./data/Test Player Innings Stats - All Teams.csv', names=col_names, skiprows=1, usecols=use_cols, dtype=col_types, na_values='-')


# date column

In [118]:
# parse the date strings (loaded as object) into pandas datetimes
df['date'] = pd.to_datetime(df['date'])

# opposition column
The file stores the opposition as `v <Country name>`. That is a little odd, so let's remove the leading `v `.


In [119]:
# Remove the literal "v " prefix from the opposition name.
# An anchored regex is used instead of str.lstrip("v "): lstrip treats its
# argument as a *set of characters* and would also eat the beginning of any
# team name that itself starts with 'v' or a space.
df['opposition'] = df['opposition'].str.replace(r'^v ', '', regex=True)

# Home game
It is useful to know whether each row was a home or an away game, so we add a boolean `homegame` column.
The rule is simple: look up the country of the ground (`ground_country`), then
if `df.country == ground_country`, `homegame` is `True`;
otherwise `homegame` is `False`.

In [120]:
# Maps each ground name (as it appears in the `ground` column) to the country
# whose team is treated as playing at home there.
# Corrected entries: 'Rajkot' is in India (was 'Pakistan') and 'Harare' is in
# Zimbabwe (was 'Sri Lanka').
# NOTE(review): the UAE grounds (Abu Dhabi, Dubai (DSC), Sharjah) are mapped to
# Pakistan - presumably as Pakistan's adopted home venues; confirm this is intended.
ground_countries = {'The Oval': 'England', 'Auckland': 'New Zealand', "Lord's": 'England', 'Kingston': 'West Indies', 'Leeds': 'England', 'Birmingham': 'England',
                    'Sydney': 'Australia', 'Nottingham': 'England', 'Abu Dhabi': 'Pakistan', 'Cape Town': 'South Africa', 'Manchester': 'England',
                    'Melbourne': 'Australia', 'Durban': 'South Africa', 'Brisbane': 'Australia', 'Christchurch': 'New Zealand', 'Adelaide': 'Australia',
                    'Chennai': 'India', 'Port of Spain': 'West Indies', 'Karachi': 'Pakistan', 'Cardiff': 'England', 'Johannesburg': 'South Africa',
                    'Multan': 'Pakistan', 'Kolkata': 'India', 'Mumbai': 'India', "St George's": 'West Indies', 'Delhi': 'India', 'Napier': 'New Zealand',
                    'Ahmedabad': 'India', 'Mumbai (BS)': 'India', "St John's": 'West Indies', 'Lahore': 'Pakistan', 'Chattogram': 'Bangladesh', 'Kanpur': 'India',
                    'Georgetown': 'West Indies', 'Southampton': 'England', 'Dhaka': 'Bangladesh', 'Wellington': 'New Zealand', 'Chester-le-Street': 'England',
                    'Perth': 'Australia', 'Hyderabad (Sind)': 'Pakistan', 'Bridgetown': 'West Indies', 'Faisalabad': 'Pakistan', 'Colombo (PSS)': 'Sri Lanka',
                    'Galle': 'Sri Lanka', 'Mohali': 'India', 'Nagpur': 'India', 'North Sound': 'West Indies', 'Centurion': 'South Africa',
                    'Port Elizabeth': 'South Africa', 'Rajkot': 'India', 'Colombo (SSC)': 'Sri Lanka', 'Pallekele': 'Sri Lanka', 'Gros Islet': 'West Indies',
                    'Dunedin': 'New Zealand', 'Bulawayo': 'Zimbabwe', 'Kandy': 'Sri Lanka', 'Harare': 'Zimbabwe', 'Dubai (DSC)': 'Pakistan', 'Bengaluru': 'India',
                    'Sharjah': 'Pakistan', 'Visakhapatnam': 'India', 'Hamilton': 'New Zealand', 'Sheffield': 'England', 'Peshawar': 'Pakistan', 'Hobart': 'Australia',
                    'Canberra': 'Australia', 'Ranchi': 'India', 'Cairns': 'Australia', 'Rawalpindi': 'Pakistan', 'Fatullah': 'Bangladesh', 'Roseau': 'West Indies',
                    'Dharamsala': 'India', 'Darwin': 'Australia', 'Pune': 'India', 'Moratuwa': 'Sri Lanka', 'Colombo (RPS)': 'Sri Lanka',
                    'Hyderabad (Deccan)': 'India', 'East London': 'South Africa', 'Potchefstroom': 'South Africa', 'Bloemfontein': 'South Africa',
                    'Basseterre': 'West Indies', 'Sheikhupura': 'Pakistan', 'Khulna': 'Bangladesh', 'Kingstown': 'West Indies', 'Providence': 'West Indies',
                    'Colombo (CCC)': 'Sri Lanka', 'Indore': 'India', 'Cuttack': 'India', 'Jalandhar': 'India', 'Lucknow': 'India', 'Jaipur': 'India',
                    'Sialkot': 'Pakistan', 'Chandigarh': 'India', 'Bahawalpur': 'Pakistan', 'Dublin (Malahide)': 'Ireland', 'Gujranwala': 'Pakistan',
                    'Bogra': 'Bangladesh', 'Sylhet': 'Bangladesh', 'Dehradun': 'India'}

# Look up the country of every ground in one vectorised pass instead of a
# Python-level lambda per row.
ground_country = df['ground'].map(ground_countries)

# Series.map yields NaN for grounds missing from the dict; keep failing loudly
# (as a direct dict lookup would) rather than silently flagging them as away games.
unmapped_grounds = df.loc[ground_country.isna(), 'ground'].unique()
if len(unmapped_grounds) > 0:
    raise KeyError('no country mapping for ground(s): {}'.format(list(unmapped_grounds)))

# True when the player's country matches the country of the ground.
df['homegame'] = ground_country == df['country']

# Batter data

## only batter rows

In [121]:
# keep only the rows that record a batting innings, i.e. where runs is not NaN
dfb = df[df['runs'].notna()]

## only batter columns

In [122]:
# Keep only the batting-related columns.
# .copy() makes dfb an independent frame, so the column assignments made to it
# afterwards do not raise SettingWithCopyWarning.
dfb = dfb[[
    'playername', 'runs', 'mins', 'notout', 'ballsfaced', 'fours', 'sixes',
    'innings', 'opposition', 'date', 'country', 'homegame',
]].copy()

In [123]:
## Clean up batter col dtypes

In [124]:
# notout is assumed to be NaN-free here, so it can become a plain boolean
# (a NaN would silently convert to True)
dfb['notout'] = dfb['notout'].astype(np.bool_)
# runs and innings are assumed to be NaN-free as well, so compact integer
# dtypes can replace the floats they were loaded as
dfb['runs'] = dfb['runs'].astype(np.int16)
dfb['innings'] = dfb['innings'].astype(np.int8)

# Save batter pickle

In [125]:
# persist the cleaned batter frame, dtypes included, as a pickle file
dfb.to_pickle(path='./data/batters.pkl')