In [None]:
%load_ext sql
%sql sqlite://///home/mikew/extracover/male_t20.db

In [None]:
# Set the SQL magic's displaylimit option to 20 (presumably caps the number of
# result rows rendered when a query result is displayed — confirm against the
# installed sql extension's documentation).
%config SqlMagic.displaylimit=20

## Run distribution by phase of innings

In [None]:
# Count deliveries by runs scored off the bat, split by phase of the innings.
# The query labels each row of `balls` with a phase based on its over number:
# overs 0-5 -> '0 powerplay', overs 6-17 -> '1 mid-innings', anything else ->
# '2 last two'. (The numeric prefixes make the labels sort in innings order.)
# Only rows with innings < 2, an empty extra_type, and batter_runs equal to
# 0, 1, 2, 3, 4 or 6 are counted.
qry = """
WITH phased AS (
    SELECT
        *
    ,   CASE
        WHEN over BETWEEN 0 AND 5 THEN '0 powerplay'
        WHEN over BETWEEN 6 AND 17 THEN '1 mid-innings'
        ELSE '2 last two'
        END as phase
    FROM balls
)
SELECT phase, batter_runs, COUNT(*) AS count
FROM phased
WHERE innings < 2
AND extra_type = ''
AND batter_runs in (0,1,2,3,4,6)
GROUP BY phase, batter_runs
"""
# Run the query through the sql magic and convert the result to a DataFrame.
result = %sql {{qry}}
df=result.DataFrame()
# Reshape to one row per batter_runs value and one column per phase; a
# (runs, phase) combination with no rows in the query result shows as NaN.
df.pivot(columns="phase", index="batter_runs", values="count")

## Average runs by over

In [None]:
# Average total runs (batter runs plus extras) per over number, first innings
# only (innings = 0).
# Step 1 (over_runs): one row per (match_id, over) with the summed runs, cast
# to FLOAT so the later division is not integer division. The HAVING clause
# keeps only groups whose highest ball value is 5.
# NOTE(review): this presumably selects completed overs, assuming `ball` is
# numbered 0-5 within an over — confirm against the schema.
# Step 2: average those per-over totals across matches for each over number.
sql= """
WITH over_runs AS (
SELECT
	over
,	CAST(SUM(batter_runs) + SUM(extra_runs) AS FLOAT) AS runs
FROM balls
WHERE innings = 0
GROUP BY match_id, over
HAVING MAX(ball) = 5
)
SELECT
	over
,	SUM(runs) / COUNT(*) AS avg_runs
FROM over_runs
GROUP BY over
ORDER BY over
"""

# Execute via the sql magic, convert to a DataFrame and draw one bar per over.
result=%sql {{sql}}
df=result.DataFrame()
df.plot(x="over", y="avg_runs", kind="bar")

## Batsman "Aggression"

First, what's the average *batter only* runs per ball in each over?

In [None]:
# Average batter-only runs per ball for each over number, first innings only
# (innings = 0). This is the baseline that the per-batter cells in this
# section compare against, via the `df` variable bound at the end of the cell.
# Step 1 (over_runs): summed batter_runs per (match_id, over), keeping only
# groups whose highest ball value is 5.
# NOTE(review): presumably "completed overs", assuming `ball` runs 0-5 —
# confirm against the schema.
# Step 2: mean of those totals per over number, divided by a fixed 6 to get a
# per-ball figure (extra deliveries in an over do not change the divisor).
sql= """
WITH over_runs AS (
SELECT
	over
,	CAST(SUM(batter_runs) AS FLOAT) AS runs
FROM balls
WHERE innings = 0
GROUP BY match_id, over
HAVING MAX(ball) = 5
)
SELECT
	over
,	SUM(runs) / COUNT(*) / 6 AS avg_runs
FROM over_runs
GROUP BY over
ORDER BY over
"""

# Execute via the sql magic, convert to a DataFrame and draw one bar per over.
result=%sql {{sql}}
df=result.DataFrame()
df.plot(x="over", y="avg_runs", kind="bar")

Now, take a batsman - #137 CH Gayle has the most balls faced (and is, or was, notoriously aggressive)

In [None]:
sql = """SELECT
	over
,	COUNT(*)
,	SUM(CAST(batter_runs AS FLOAT)) / COUNT(*) AS avg_runs
FROM balls
WHERE batter = 137
GROUP BY over
ORDER BY over"""

result=%sql {{sql}}
df_gayle=result.DataFrame()
gayle = df_gayle["avg_runs"] - df["avg_runs"]
gayle.plot(kind="bar")
gayle.sum()

...or Virat Kohli (coincidentally #138 at time of computing)

In [None]:
sql = """
SELECT
	over
,	COUNT(*)
,	SUM(CAST(batter_runs AS FLOAT)) / COUNT(*) AS avg_runs
FROM balls
WHERE batter = 138
GROUP BY over
ORDER BY over"""

result=%sql {{sql}}
df_kohli=result.DataFrame()
kohli = df_kohli["avg_runs"] - df["avg_runs"]
kohli.plot(kind="bar")
kohli.sum()

Can we (usefully) fit some kind of curve (straight line) to that?

Or perhaps just smooth the by-over values somehow, perhaps by some weighted average of adjacent (one? more?) values?

Or even just apply the average of the differences to the overall runs-per-ball curve, bumping the probabilities for each score as necessary?

Also, how much do those figures vary over time, or match-by-match, I wonder? Are there discernible trends?

## Average First Innings Totals by City/Venue

In [None]:
# First-innings totals summarised by city and venue.
# Step 1 (first_inns_tots): one row per match with the first-innings total
# (batter_runs + extra_runs summed over innings = 0), joined to `matches` on
# its rowid to pick up city and venue. Only matches whose first innings has at
# least 120 rows in `balls` are included.
# NOTE(review): the 120-row threshold presumably approximates "a full 20
# overs"; the count includes every delivery row, so confirm this is the
# intended definition.
# Step 2: per (city, venue), the number of such innings and the average,
# minimum and maximum total, keeping venues with at least 20 innings and
# listing the most-used venues first.
sql = """
with first_inns_tots as (
	select b.match_id, m.city, m.venue, sum(b.batter_runs + b.extra_runs) as inns_tot
	from balls b join matches m on m.rowid = b.match_id
	where b.innings = 0
	and match_id in (
		select match_id from balls where innings = 0 group by match_id having count(*) >= 120
	)
	group by match_id
)
select city, venue, count(*), avg(inns_tot), min(inns_tot), max(inns_tot)
from first_inns_tots group by city, venue having count(*) >= 20 order by count(*) desc
"""
# The magic call is the cell's last expression, so its result is displayed.
%sql {{sql}}

## Scoring shot frequencies by batsman & over

For players who have faced enough deliveries to give meaningful distributions (first attempt: 1000 balls faced)


In [None]:
sql = """
WITH in_batsmen AS (
	SELECT
	  seq
	, batter AS batter_id
	, striker_name AS batter_name
	FROM all_balls
)
, qualifying_batters AS (
	SELECT
		batter_id
	,	batter_name
	FROM
		in_batsmen
	GROUP BY batter_id, batter_name
	HAVING Count(*) > 1000
)
, legit_balls AS (
	SELECT
	  over
	, striker_name
	, batter_id
	, batter_runs AS runs
	FROM all_balls
	JOIN qualifying_batters ON batter_id = batter
	WHERE LENGTH(extra_type) = 0
), batter_dists AS (
	SELECT
		striker_name
	,	batter_id
	, 	over
	,	(SUM(CASE WHEN runs = 0 THEN 1 END) + 0.0) / SUM(Count(*)) OVER (PARTITION BY striker_name, over) AS "0"
	,	(SUM(CASE WHEN runs = 1 THEN 1 END) + 0.0) / SUM(Count(*)) OVER (PARTITION BY striker_name, over) AS "1"
	,	(SUM(CASE WHEN runs = 2 THEN 1 END) + 0.0) / SUM(Count(*)) OVER (PARTITION BY striker_name, over) AS "2"
	,	(SUM(CASE WHEN runs = 3 THEN 1 END) + 0.0) / SUM(Count(*)) OVER (PARTITION BY striker_name, over) AS "3"
	,	(SUM(CASE WHEN runs = 4 THEN 1 END) + 0.0) / SUM(Count(*)) OVER (PARTITION BY striker_name, over) AS "4"
	,	(SUM(CASE WHEN runs = 5 THEN 1 END) + 0.0) / SUM(Count(*)) OVER (PARTITION BY striker_name, over) AS "5"
	,	(SUM(CASE WHEN runs = 6 THEN 1 END) + 0.0) / SUM(Count(*)) OVER (PARTITION BY striker_name, over) AS "6"
	FROM
		legit_balls
	GROUP BY striker_name, over, batter_id
)
SELECT *
FROM batter_dists
WHERE batter_id = 137
ORDER BY striker_name, batter_id, over
"""

result = %sql {{sql}}
df = result.DataFrame()


In [None]:
# Grouped bar chart: one cluster per over, one bar per selected runs column
# (the "3" and "5" columns are not plotted).
df.plot.bar(x="over", y=["0", "1", "2", "4", "6"], figsize=(15, 5))

