<a target="_blank" href="https://colab.research.google.com/github/nascarsayan/diy-python/blob/master/cricketData.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

### ODI Batting analysis

- We read the data from the csv filepath `datasets/cricketData/Batting/ODI.csv`
- If the file is not present:
    - we create the necessary directories
    - we download the file from the url to the filepath


In [21]:
import os
fpath = os.path.join(
    "datasets",
    "cricketData",
    "Batting",
    "ODI.csv"
)

# if fpath does not exist download it 
# and save it to the above path

url = "https://github.com/nascarsayan/diy-python/raw/master/datasets/cricketData/Batting/ODI.csv"
if not os.path.exists(fpath):

    # Create the nessary directories if they don't exist
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    
    # This is how we can make an HTTP GET request
    # and save the response to a file
    import urllib.request
    urllib.request.urlretrieve(url, fpath)

- `with` is used to automatically close the file when the block is exited
- `open` is a built-in function that opens a file and returns a file object
- `readlines` is a method that reads all the lines of a file into memory and returns a list of strings

In [22]:
with open(fpath) as f:
    content = f.readlines()
print(content[:30])

[',Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13\n', '0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20,\n', '1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15,\n', '2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,\n', '3,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34,\n', '4,DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28,\n', '5,Inzamam-ul-Haq (Asia/PAK),1991-2007,378,350,53,11739,137*,39.52,15812,74.24,10,83,20,\n', '6,V Kohli (INDIA),2008-2019,242,233,39,11609,183,59.84,12445,93.28,43,55,13,\n', '7,JH Kallis (Afr/ICC/SA),1996-2014,328,314,53,11579,139,44.36,15885,72.89,17,86,17,\n', '8,SC Ganguly (Asia/INDIA),1992-2007,311,300,23,11363,183,41.02,15416,73.7,22,72,16,\n', '9,R Dravid (Asia/ICC/INDIA),1996-2011,344,318,40,10889,153,39.16,15284,71.24,12,83,13,\n', '10,MS Dhoni (Asia/INDIA),2004-201

In [23]:
import csv
import json
with open(fpath) as f:
    reader = csv.DictReader(f)
    data = [row for row in reader]
print(json.dumps(data[:1], indent=2))


[
  {
    "": "0",
    "Player": "SR Tendulkar (INDIA)",
    "Span": "1989-2012",
    "Mat": "463",
    "Inns": "452",
    "NO": "41",
    "Runs": "18426",
    "HS": "200*",
    "Ave": "44.83",
    "BF": "21367",
    "SR": "86.23",
    "100": "49",
    "50": "96",
    "0": "20",
    "Unnamed: 13": ""
  }
]


In [24]:
# Schema of the data
"""
{
    "": "0",
    "Player": "SR Tendulkar (INDIA)",
    "Span": "1989-2012",
    "Mat": "463",
    "Inns": "452",
    "NO": "41",
    "Runs": "18426",
    "HS": "200*",
    "Ave": "44.83",
    "BF": "21367",
    "SR": "86.23",
    "100": "49",
    "50": "96",
    "0": "20",
    "Unnamed: 13": ""
}
"""

'\n{\n    "": "0",\n    "Player": "SR Tendulkar (INDIA)",\n    "Span": "1989-2012",\n    "Mat": "463",\n    "Inns": "452",\n    "NO": "41",\n    "Runs": "18426",\n    "HS": "200*",\n    "Ave": "44.83",\n    "BF": "21367",\n    "SR": "86.23",\n    "100": "49",\n    "50": "96",\n    "0": "20",\n    "Unnamed: 13": ""\n}\n'

In [None]:
# Cleanup the data


In [25]:
# Count the number of players
players = set()
for row in data:
    players.add(row["Player"])
print(len(players))

2499


In [26]:
# Convert to dataframe
import pandas as pd
df = pd.DataFrame(data)

In [28]:
df[:3]

Unnamed: 0,Unnamed: 1,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13
0,0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20,
1,1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15,
2,2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,


In [29]:
# Get the player with the highest number of runs
df["Runs"] = df["Runs"].astype(int)
df["Runs"].idxmax()

ValueError: invalid literal for int() with base 10: '-'