In [1]:
import pandas as pd
import glob
import json

# Glob pattern matching the match JSON files (one file per game).
# NOTE(review): absolute, machine-specific path - move to a configurable data directory before sharing.
DATA_GLOB = '/Users/ollie/Documents/Job Stuff/Hawkeye Project/gloucestershire_male_json/*.json'


def find_match_files(pattern):
    """Return the paths matching ``pattern`` in sorted order.

    glob.glob() returns its matches in arbitrary order; sorting keeps the
    sequential game ids assigned in load_games() the same on every run.
    """
    return sorted(glob.glob(pattern))


def flatten_deliveries(match):
    """Return one flat dict per delivery of a parsed match document.

    Each dict holds the batting team, over number, batter, bowler,
    non-striker, the batter/extras/total runs, the raw ``extras`` dict
    ({} when absent) and the raw ``wickets`` list ([] when absent, stored
    under the key ``wicket``).
    """
    rows = []
    for inning in match["innings"]:                              #loop through the innings
        team = inning["team"]                                    #team named on this innings entry
        # an innings entry without an "overs" key contributes no rows instead of raising
        # KeyError (presumably a forfeited innings - TODO confirm against the data format)
        for over in inning.get("overs", []):
            over_number = over["over"]
            for delivery in over["deliveries"]:
                runs = delivery["runs"]
                rows.append({
                    "team": team,
                    "over": over_number,
                    "batter": delivery["batter"],
                    "bowler": delivery["bowler"],
                    "non_striker": delivery["non_striker"],
                    "runs_batter": runs["batter"],
                    "runs_extras": runs.get("extras", 0),
                    "runs_total": runs["total"],
                    "extras": delivery.get("extras", {}),
                    "wicket": delivery.get("wickets", []),
                })
    return rows


def load_games(paths):
    """Parse every match file in ``paths`` into a list of game records.

    Games are numbered from 1 in the order the paths are given. Each record
    has the keys ``game`` (the id), ``result`` (the match's info.outcome
    dict) and ``innings`` (the flat delivery rows from flatten_deliveries).
    """
    loaded = []
    for game_id, path in enumerate(paths, start=1):
        with open(path, encoding="utf-8") as f:                  #explicit encoding: do not rely on the platform default
            match = json.load(f)
        loaded.append({
            "game": game_id,
            "result": match["info"]["outcome"],
            "innings": flatten_deliveries(match),
        })
    return loaded


data_files = find_match_files(DATA_GLOB)                         #all match files, in a stable order
games = load_games(data_files)                                   #one record per match

df = pd.DataFrame(games)

df.head(5)


Unnamed: 0,game,result,innings
0,1,"{'by': {'wickets': 4}, 'winner': 'Gloucestersh...","[{'team': 'Somerset', 'over': 0, 'batter': 'CH..."
1,2,"{'winner': 'Gloucestershire', 'by': {'wickets'...","[{'team': 'Glamorgan', 'over': 0, 'batter': 'C..."
2,3,"{'winner': 'Glamorgan', 'by': {'runs': 18}, 'm...","[{'team': 'Glamorgan', 'over': 0, 'batter': 'D..."
3,4,"{'winner': 'Middlesex', 'by': {'runs': 33}}","[{'team': 'Middlesex', 'over': 0, 'batter': 'P..."
4,5,{'result': 'draw'},"[{'team': 'Gloucestershire', 'over': 0, 'batte..."


In [2]:
from pandas import json_normalize

# Flatten to one record per delivery, tagging each with its game id and match result.
# Fresh dicts are built so the delivery records stored inside `games` are not modified.
normalized_data = [
    {**delivery, 'game': game['game'], 'result': game['result']}
    for game in games
    for delivery in game['innings']                 # game['innings'] is the flat list of delivery dicts
]

# Create a DataFrame; nested dicts ('extras', 'result') become dotted columns such as result.winner
normalized_df = json_normalize(normalized_data)

# Display the first rows of the normalized DataFrame
normalized_df.head()

Unnamed: 0,team,over,batter,bowler,non_striker,runs_batter,runs_extras,runs_total,wicket,game,...,result.winner,extras.wides,extras.legbyes,extras.noballs,extras.byes,result.by.runs,result.method,result.result,result.by.innings,extras.penalty
0,Somerset,0,CH Gayle,MD Taylor,J Allenby,0,0,0,[],1,...,Gloucestershire,,,,,,,,,
1,Somerset,0,CH Gayle,MD Taylor,J Allenby,0,0,0,[],1,...,Gloucestershire,,,,,,,,,
2,Somerset,0,CH Gayle,MD Taylor,J Allenby,0,0,0,[],1,...,Gloucestershire,,,,,,,,,
3,Somerset,0,CH Gayle,MD Taylor,J Allenby,0,0,0,[],1,...,Gloucestershire,,,,,,,,,
4,Somerset,0,CH Gayle,MD Taylor,J Allenby,0,0,0,[],1,...,Gloucestershire,,,,,,,,,


In [3]:
# Number the deliveries 1, 2, 3, ... within each (game, team) group, in row order.
# .assign() returns a new frame, so normalized_df is not modified by this cell or by
# later column assignments on df.
# NOTE(review): the group is (game, team), so if a team has more than one innings in a
# game the count continues across them - confirm that is intended.
df = normalized_df.assign(
    deliveries=lambda frame: frame.groupby(['game', 'team']).cumcount() + 1
)


In [4]:
# Show the dtype of every column in df
df.dtypes

team                  object
over                   int64
batter                object
bowler                object
non_striker           object
runs_batter            int64
runs_extras            int64
runs_total             int64
wicket                object
game                   int64
result.by.wickets    float64
result.winner         object
extras.wides         float64
extras.legbyes       float64
extras.noballs       float64
extras.byes          float64
result.by.runs       float64
result.method         object
result.result         object
result.by.innings    float64
extras.penalty       float64
deliveries             int64
dtype: object

In [5]:
# Give every distinct team name a 1-based integer id, numbered in order of first appearance
team_names = df['team'].unique()
team_mapping = dict(zip(team_names, range(1, len(team_names) + 1)))

# Store the id alongside each row, then remove the name column it replaces
df['team_id'] = df['team'].map(team_mapping)
df = df.drop(columns=['team'])

In [6]:
# The three columns that hold player names; they share a single id space
role_columns = ["batter", "bowler", "non_striker"]

# Distinct player names across all three columns, in order of first appearance
unique_players = pd.concat([df[col] for col in role_columns]).unique()

# 1-based integer id for every player
player_mapping = dict(zip(unique_players, range(1, len(unique_players) + 1)))

# Add batter_id, bowler_id and non_striker_id, looked up from the shared mapping
for col in role_columns:
    df[f"{col}_id"] = df[col].map(player_mapping)

# The name columns are now replaced by their ids
df.drop(columns=role_columns, inplace=True)
df.head()

Unnamed: 0,over,runs_batter,runs_extras,runs_total,wicket,game,result.by.wickets,result.winner,extras.wides,extras.legbyes,...,result.by.runs,result.method,result.result,result.by.innings,extras.penalty,deliveries,team_id,batter_id,bowler_id,non_striker_id
0,0,0,0,0,[],1,4.0,Gloucestershire,,,...,,,,,,1,1,1,87,2
1,0,0,0,0,[],1,4.0,Gloucestershire,,,...,,,,,,2,1,1,87,2
2,0,0,0,0,[],1,4.0,Gloucestershire,,,...,,,,,,3,1,1,87,2
3,0,0,0,0,[],1,4.0,Gloucestershire,,,...,,,,,,4,1,1,87,2
4,0,0,0,0,[],1,4.0,Gloucestershire,,,...,,,,,,5,1,1,87,2


In [7]:
# 1 when the delivery's wicket list is non-empty, otherwise 0
df['is_wicket'] = df['wicket'].apply(lambda dismissals: int(bool(dismissals)))

# Running total of wickets within each (game, team_id) group, in row order
# NOTE(review): the total is per (game, team_id), so it carries on across a team's
# innings when it has more than one in a game - confirm that is intended.
df['cumulative_wickets'] = df['is_wicket'].groupby([df['game'], df['team_id']]).cumsum()

# Dismissal kinds treated as "aggressive"
aggressive_types = ['caught', 'stumped']


def has_aggressive_dismissal(dismissals):
    """Return 1 if any dismissal's 'kind' is in aggressive_types, else 0."""
    for dismissal in dismissals:
        if dismissal['kind'] in aggressive_types:
            return 1
    return 0


df['aggressive_dismissal_flag'] = df['wicket'].apply(has_aggressive_dismissal)

# The raw wicket lists are no longer needed once the flags exist
df.drop(columns=['wicket'], inplace=True)
df

Unnamed: 0,over,runs_batter,runs_extras,runs_total,game,result.by.wickets,result.winner,extras.wides,extras.legbyes,extras.noballs,...,result.by.innings,extras.penalty,deliveries,team_id,batter_id,bowler_id,non_striker_id,is_wicket,cumulative_wickets,aggressive_dismissal_flag
0,0,0,0,0,1,4.0,Gloucestershire,,,,...,,,1,1,1,87,2,0,0,0
1,0,0,0,0,1,4.0,Gloucestershire,,,,...,,,2,1,1,87,2,0,0,0
2,0,0,0,0,1,4.0,Gloucestershire,,,,...,,,3,1,1,87,2,0,0,0
3,0,0,0,0,1,4.0,Gloucestershire,,,,...,,,4,1,1,87,2,0,0,0
4,0,0,0,0,1,4.0,Gloucestershire,,,,...,,,5,1,1,87,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320209,47,0,0,0,370,3.0,Gloucestershire,,,,...,,,738,2,47,280,48,0,17,0
320210,47,4,0,4,370,3.0,Gloucestershire,,,,...,,,739,2,47,280,48,0,17,0
320211,47,0,0,0,370,3.0,Gloucestershire,,,,...,,,740,2,47,280,48,0,17,0
320212,48,0,0,0,370,3.0,Gloucestershire,,,,...,,,741,2,48,770,47,0,17,0


In [8]:
# Fill any NaN values with 0 (applies to every column, including the text ones)
df = df.fillna(0)

# Correlate the numeric columns only. df still holds text columns (e.g. result.winner);
# calling corr() on the whole frame emits a deprecation warning on pandas 1.5 and raises
# on pandas 2.x, whereas selecting the numeric columns first works on every version.
df.select_dtypes(include='number').corr().style.background_gradient(cmap='coolwarm')

  df.corr().style.background_gradient(cmap='coolwarm')


Unnamed: 0,over,runs_batter,runs_extras,runs_total,game,result.by.wickets,extras.wides,extras.legbyes,extras.noballs,extras.byes,result.by.runs,result.by.innings,extras.penalty,deliveries,team_id,batter_id,bowler_id,non_striker_id,is_wicket,cumulative_wickets,aggressive_dismissal_flag
over,1.0,-0.047596,-0.01633,-0.050814,-0.0106,-0.077596,-0.036214,-0.009425,-0.002923,0.007076,-0.000861,0.090976,0.000219,0.633605,0.040831,0.044588,-0.046543,0.039206,-0.017414,0.335192,-0.015426
runs_batter,-0.047596,1.0,-0.052917,0.967818,-0.00207,0.006209,-0.032717,-0.040646,-0.007229,-0.026344,-0.020596,-0.033356,-0.001574,-0.068959,0.007443,-0.002276,-0.001874,0.001147,-0.077572,-0.054825,-0.061548
runs_extras,-0.01633,-0.052917,1.0,0.200086,-0.003255,0.004956,0.364387,0.554532,0.505599,0.545354,-0.007747,-0.006985,0.070204,-0.015678,-0.00061,-0.001211,0.011434,-0.000592,-0.019149,-0.011045,-0.014789
runs_total,-0.050814,0.967818,0.200086,1.0,-0.002852,0.007341,0.059727,0.099865,0.120321,0.111585,-0.02216,-0.034488,0.016148,-0.07161,0.007149,-0.002538,0.001042,0.000976,-0.080935,-0.056575,-0.064115
game,-0.0106,-0.00207,-0.003255,-0.002852,1.0,0.006168,-0.005931,9.9e-05,0.000132,-0.002039,-0.124062,-0.050382,-0.000969,-0.002531,-0.004796,0.089032,0.058144,0.087258,-0.002062,0.023799,0.001359
result.by.wickets,-0.077596,0.006209,0.004956,0.007341,0.006168,1.0,0.00439,0.003863,0.002993,-0.000299,-0.315019,-0.195861,-0.00248,-0.066012,-0.029776,0.00061,-0.004977,0.006816,0.003842,-0.007836,0.002173
extras.wides,-0.036214,-0.032717,0.364387,0.059727,-0.005931,0.00439,1.0,-0.004891,-0.004734,-0.00317,-0.010209,-0.011628,-0.000271,-0.042404,0.001566,0.000145,0.005957,0.001012,-0.007576,-0.030952,-0.005125
extras.legbyes,-0.009425,-0.040646,0.554532,0.099865,9.9e-05,0.003863,-0.004891,1.0,-0.002113,-0.003938,-0.000676,-0.004362,-0.000337,-0.010883,0.002137,0.002715,0.003242,-0.000377,-0.011853,-0.007171,-0.009201
extras.noballs,-0.002923,-0.007229,0.505599,0.120321,0.000132,0.002993,-0.004734,-0.002113,1.0,0.012652,-0.008428,-0.003363,-0.000326,-0.000245,-0.002217,-0.001947,0.009563,-0.002524,-0.010944,-0.001051,-0.008905
extras.byes,0.007076,-0.026344,0.545354,0.111585,-0.002039,-0.000299,-0.00317,-0.003938,0.012652,1.0,0.001021,0.002839,-0.000218,0.011511,-0.002687,-0.003607,0.004983,0.00094,-0.007682,0.009004,-0.005963
