In [254]:
#importing Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#Preprocessing

## Dataset

In [None]:
deli = pd.read_csv("/content/deliveries.csv")

In [None]:
deli.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,


In [None]:
matches = pd.read_csv("/content/matches.csv")
matches.head(1)

Unnamed: 0,season,team1,team2,date,match_number,venue,city,toss_winner,toss_decision,player_of_match,umpire1,umpire2,reserve_umpire,match_referee,winner,winner_runs,winner_wickets,match_type
0,2023/24,England,New Zealand,2023/10/05,1,Narendra Modi Stadium,Ahmedabad,New Zealand,field,R Ravindra,HDPK Dharmasena,Nitin Menon,Sharfuddoula,J Srinath,New Zealand,,9.0,Group


In [None]:
points = pd.read_csv("/content/points_table.csv")
points

Unnamed: 0,Ranking,Team,Matches,Won,Lost,Tie,No Results,Points,Net Run Rate,Series Form,Next Match,For,Against
0,1,South Africa,7,6,1,0,0,12,2.29,LWWWW,"vs IND, AFG",2355/340.2,1588/343.0
1,2,India,6,6,0,0,0,12,1.405,WWWWW,"vs SL, SA, NED",1430/246.2,1320/300.0
2,3,Australia,6,4,2,0,0,8,0.97,LWWWW,"vs ENG, AFG, BAN",1745/285.2,1499/291.2
3,4,New Zealand,7,4,3,0,0,8,0.484,WWLLL,"vs PAK, SL",1964/329.1,1908/348.0
4,5,Pakistan,7,3,4,0,0,6,-0.024,LLLLW,"vs NZ, ENG",1884/330.5,1869/326.5
5,6,Afghanistan,6,3,3,0,0,6,-0.718,LWLWW,"vs NED, AUS, SA",1379/294.2,1457/269.4
6,7,Sri Lanka,6,2,4,0,0,4,-0.275,LLWWL,"vs IND, BAN, NZ",1543/274.0,1648/279.0
7,8,Netherlands,6,2,4,0,0,4,-1.277,LWLLW,"vs AFG, ENG, IND",1254/293.0,1619/291.2
8,9,Bangladesh,7,1,6,0,0,2,-1.446,LLLLL,"vs SL, AUS",1465/334.4,1845/316.5
9,10,England,6,1,5,0,0,2,-1.652,WLLLL,"vs AUS, NED, PAK",1316/300.0,1582/262.0


In [None]:
deli.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [None]:
matches.columns

Index(['season', 'team1', 'team2', 'date', 'match_number', 'venue', 'city',
       'toss_winner', 'toss_decision', 'player_of_match', 'umpire1', 'umpire2',
       'reserve_umpire', 'match_referee', 'winner', 'winner_runs',
       'winner_wickets', 'match_type'],
      dtype='object')

## Missing value handling

In [None]:
#renaming columns in matches
matches.rename(columns={"match_number":"match_id", "start_date":"date"}, inplace=True)

In [None]:
deli.isna().sum()

match_id                      0
season                        0
start_date                    0
venue                         0
innings                       0
ball                          0
batting_team                  0
bowling_team                  0
striker                       0
non_striker                   0
bowler                        0
runs_off_bat                  0
extras                        0
wides                     16949
noballs                   17338
byes                      17354
legbyes                   17263
penalty                   17379
wicket_type               16889
player_dismissed          16889
other_wicket_type         17380
other_player_dismissed    17380
dtype: int64

In [None]:
#creating a wicket column to get boolean of wicket fall
deli["wicket"] = deli["player_dismissed"].apply(lambda x: 1 if isinstance(x, str) else 0)

In [None]:
deli["wicket"]

0        0
1        0
2        0
3        0
4        0
        ..
17375    0
17376    0
17377    0
17378    0
17379    1
Name: wicket, Length: 17380, dtype: int64

In [None]:
#filling na values
deli[["wides", "noballs", "byes", "legbyes", "penalty"]] = deli[["wides", "noballs", "byes", "legbyes", "penalty"]].fillna(0)

In [None]:
deli.wicket_type.fillna("no wicket", inplace=True)

In [None]:
deli.player_dismissed.fillna("no wicket", inplace=True)

In [None]:
deli[["other_wicket_type", "other_player_dismissed"]] = deli[["other_wicket_type", "other_player_dismissed"]].fillna("no dismissal")

In [None]:
deli.isna().sum()

match_id                  0
season                    0
start_date                0
venue                     0
innings                   0
ball                      0
batting_team              0
bowling_team              0
striker                   0
non_striker               0
bowler                    0
runs_off_bat              0
extras                    0
wides                     0
noballs                   0
byes                      0
legbyes                   0
penalty                   0
wicket_type               0
player_dismissed          0
other_wicket_type         0
other_player_dismissed    0
wicket                    0
dtype: int64

In [None]:
matches.isna().sum()

season              0
team1               0
team2               0
date                0
match_id            0
venue               0
city                0
toss_winner         0
toss_decision       0
player_of_match     0
umpire1             0
umpire2             0
reserve_umpire      1
match_referee       0
winner              0
winner_runs        16
winner_wickets     16
match_type          0
dtype: int64

In [None]:
matches.rename(columns={"winner_runs":"win_by_runs", "winner_wickets":"win_by_wickets"}, inplace=True)

In [None]:
matches.win_by_runs.fillna(0, inplace=True)

In [None]:
matches.win_by_wickets.fillna(0, inplace=True)

In [None]:
matches.reserve_umpire.fillna("unknown", inplace=True)

In [None]:
matches.isna().sum()

season             0
team1              0
team2              0
date               0
match_id           0
venue              0
city               0
toss_winner        0
toss_decision      0
player_of_match    0
umpire1            0
umpire2            0
reserve_umpire     0
match_referee      0
winner             0
win_by_runs        0
win_by_wickets     0
match_type         0
dtype: int64

## Data Aggregation

In [None]:
#creating over and balls from ball column which is in 0.1, 0.2 format for first over
deli["ball"] = deli["ball"].astype(str)
deli[["over", "ball_num"]] = deli["ball"].str.split(".", expand=True).astype(int)
deli["ball"] = deli["ball"].astype(float)
deli["over"] = deli["over"] + 1
deli["ball_left"] = 306 - (deli["over"]*6 + deli["ball_num"])

In [None]:
#creating total runs column to calculate runs on each ball incusive of extras
deli["total_runs"] = deli["runs_off_bat"] + deli["extras"]

## Innings Dataset

In [None]:
#creating innings df so we can merger total runs for each innings with in original df
innings = deli.groupby(["match_id", "innings"])["total_runs"].sum().reset_index()
innings.rename(columns={"total_runs":"innings_total"}, inplace=True)

In [None]:
innings

Unnamed: 0,match_id,innings,innings_total
0,1,1,282
1,1,2,283
2,2,1,286
3,2,2,205
4,3,1,156
...,...,...,...
59,30,2,242
60,31,1,204
61,31,2,205
62,32,1,357


In [None]:
#creating deliveries df and adding total runs
deliveries = deli.merge(innings, on=["match_id", "innings"])
deliveries.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,wicket,over,ball_num,ball_left,total_runs,innings_total
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,no wicket,no wicket,no dismissal,no dismissal,0,1,1,299,0,282


In [None]:
#calculating cumulative runs
deliveries["cumulative_runs"] = deliveries.groupby(["match_id", "innings"])["total_runs"].cumsum()

In [None]:
deliveries.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,player_dismissed,other_wicket_type,other_player_dismissed,wicket,over,ball_num,ball_left,total_runs,innings_total,cumulative_runs
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,no wicket,no dismissal,no dismissal,0,1,1,299,0,282,0


In [None]:
#creating run rate column
deliveries["run_rate"] = deliveries.groupby(["match_id","innings"]).apply(lambda x: (x["cumulative_runs"]*6)/(300-x["ball_left"])).reset_index(level=[0,1], drop=True)

In [None]:
deliveries.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,other_wicket_type,other_player_dismissed,wicket,over,ball_num,ball_left,total_runs,innings_total,cumulative_runs,run_rate
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,no dismissal,no dismissal,0,1,1,299,0,282,0,0.0


In [None]:
#filtering for innings1 and 2 so we can add some columns respectively
innings_1 = deliveries[deliveries["innings"] == 1]

innings_1 = innings_1.groupby("match_id")["cumulative_runs"].max().reset_index()
innings_1.rename(columns={"cumulative_runs": "target"}, inplace=True)

innings_2 = deliveries[deliveries["innings"] == 2]
innings_2 = innings_2.merge(innings_1, on="match_id", how="left")
innings_2["target"] = innings_2["target"]+1

In [None]:
innings_1

Unnamed: 0,match_id,target
0,1,282
1,2,286
2,3,156
3,4,428
4,5,199
5,6,322
6,7,364
7,8,344
8,9,272
9,10,311


In [None]:
innings_2

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,other_player_dismissed,wicket,over,ball_num,ball_left,total_runs,innings_total,cumulative_runs,run_rate,target
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",2,0.1,New Zealand,England,DP Conway,WA Young,...,no dismissal,0,1,1,299,4,283,4,24.000000,283
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",2,0.2,New Zealand,England,DP Conway,WA Young,...,no dismissal,0,1,2,298,0,283,4,12.000000,283
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",2,0.3,New Zealand,England,DP Conway,WA Young,...,no dismissal,0,1,3,297,0,283,4,8.000000,283
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",2,0.4,New Zealand,England,DP Conway,WA Young,...,no dismissal,0,1,4,296,2,283,6,9.000000,283
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",2,0.5,New Zealand,England,DP Conway,WA Young,...,no dismissal,0,1,5,295,4,283,10,12.000000,283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7882,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.5,New Zealand,South Africa,MJ Henry,GD Phillips,...,no dismissal,0,35,5,91,0,167,161,4.622010,358
7883,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.6,New Zealand,South Africa,MJ Henry,GD Phillips,...,no dismissal,0,35,6,90,0,167,161,4.600000,358
7884,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.1,New Zealand,South Africa,GD Phillips,MJ Henry,...,no dismissal,0,36,1,89,0,167,161,4.578199,358
7885,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.2,New Zealand,South Africa,GD Phillips,MJ Henry,...,no dismissal,0,36,2,88,6,167,167,4.726415,358


In [None]:
#adding runs needed and rrr to innings 2
innings_2["runs_needed"] = innings_2["target"] - innings_2["cumulative_runs"]
innings_2["required_run_rate"] = (innings_2["runs_needed"]*6) / innings_2["ball_left"]

In [None]:
#adding wickets remaining to innings 2
innings_2["wickets_remaining"] = innings_2.groupby("match_id")["wicket"].apply(lambda x: 10 - x.cumsum()).reset_index(level=0, drop=True)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  innings_2["wickets_remaining"] = innings_2.groupby("match_id")["wicket"].apply(lambda x: 10 - x.cumsum()).reset_index(level=0, drop=True)


In [None]:
#innings 1 data
innings_1_f = deliveries[deliveries["innings"] == 1]

In [None]:
#adding wickets remaining
innings_1_f["wickets_remaining"] = innings_1_f.groupby("match_id")["wicket"].apply(lambda x: 10 - x.cumsum()).reset_index(level=0, drop=True)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  innings_1_f["wickets_remaining"] = innings_1_f.groupby("match_id")["wicket"].apply(lambda x: 10 - x.cumsum()).reset_index(level=0, drop=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  innings_1_f["wickets_remaining"] = innings_1_f.groupby("match_id")["wicket"].apply(lambda x: 10 - x.cumsum()).reset_index(level=0, drop=True)


In [None]:
#concatinating innings1,2
innings = pd.concat([innings_1_f,innings_2])

In [None]:
innings

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,ball_num,ball_left,total_runs,innings_total,cumulative_runs,run_rate,wickets_remaining,target,runs_needed,required_run_rate
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,1,299,0,282,0,0.000000,10.0,,,
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,...,2,298,6,282,6,18.000000,10.0,,,
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,...,3,297,1,282,7,14.000000,10.0,,,
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,...,4,296,1,282,8,12.000000,10.0,,,
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,...,5,295,4,282,12,14.400000,10.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7882,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.5,New Zealand,South Africa,MJ Henry,GD Phillips,...,5,91,0,167,161,4.622010,1.0,358.0,197.0,12.989011
7883,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.6,New Zealand,South Africa,MJ Henry,GD Phillips,...,6,90,0,167,161,4.600000,1.0,358.0,197.0,13.133333
7884,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.1,New Zealand,South Africa,GD Phillips,MJ Henry,...,1,89,0,167,161,4.578199,1.0,358.0,197.0,13.280899
7885,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.2,New Zealand,South Africa,GD Phillips,MJ Henry,...,2,88,6,167,167,4.726415,1.0,358.0,191.0,13.022727


In [None]:
innings.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'wicket', 'over', 'ball_num', 'ball_left',
       'total_runs', 'innings_total', 'cumulative_runs', 'run_rate',
       'wickets_remaining', 'target', 'runs_needed', 'required_run_rate'],
      dtype='object')

In [None]:
len(innings)

17380

In [None]:
innings.fillna(0, inplace=True)

In [None]:
#merging with matches data to get winners and other data that can help our model
innings = innings.merge(matches[["match_id","winner", "win_by_runs", "win_by_wickets"]], on="match_id")

In [None]:
matches.columns

Index(['season', 'team1', 'team2', 'date', 'match_id', 'venue', 'city',
       'toss_winner', 'toss_decision', 'player_of_match', 'umpire1', 'umpire2',
       'reserve_umpire', 'match_referee', 'winner', 'win_by_runs',
       'win_by_wickets', 'match_type'],
      dtype='object')

In [None]:
#adding result column
innings["result"] = innings.apply(lambda x: 1 if x["batting_team"] == x["winner"] else 0, axis=1)

In [None]:
#making the data random to avoid overfitting
# innings = innings.sample(frac=1)

In [None]:
innings

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,cumulative_runs,run_rate,wickets_remaining,target,runs_needed,required_run_rate,winner,win_by_runs,win_by_wickets,result
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,0,0.000000,10.0,0.0,0.0,0.000000,New Zealand,0.0,9.0,0
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,...,6,18.000000,10.0,0.0,0.0,0.000000,New Zealand,0.0,9.0,0
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,...,7,14.000000,10.0,0.0,0.0,0.000000,New Zealand,0.0,9.0,0
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,...,8,12.000000,10.0,0.0,0.0,0.000000,New Zealand,0.0,9.0,0
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,...,12,14.400000,10.0,0.0,0.0,0.000000,New Zealand,0.0,9.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17375,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.5,New Zealand,South Africa,MJ Henry,GD Phillips,...,161,4.622010,1.0,358.0,197.0,12.989011,South Africa,190.0,0.0,0
17376,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.6,New Zealand,South Africa,MJ Henry,GD Phillips,...,161,4.600000,1.0,358.0,197.0,13.133333,South Africa,190.0,0.0,0
17377,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.1,New Zealand,South Africa,GD Phillips,MJ Henry,...,161,4.578199,1.0,358.0,197.0,13.280899,South Africa,190.0,0.0,0
17378,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.2,New Zealand,South Africa,GD Phillips,MJ Henry,...,167,4.726415,1.0,358.0,191.0,13.022727,South Africa,190.0,0.0,0


In [None]:
#defining model data for prediction
model_cols = innings[["venue", "batting_team", "bowling_team", "ball", "innings_total", "wickets_remaining", "run_rate", "required_run_rate",
                      "ball_left", "runs_needed", "win_by_runs", "win_by_wickets", "target", "result"]]

In [None]:
# Assuming your DataFrame is named 'kohli'
innings.to_csv('/content/innings_data.csv', index=False)


In [None]:
model_cols.isna().sum()

venue                0
batting_team         0
bowling_team         0
ball                 0
innings_total        0
wickets_remaining    0
run_rate             0
required_run_rate    0
ball_left            0
runs_needed          0
win_by_runs          0
win_by_wickets       0
target               0
result               0
dtype: int64

# Player performance - Virat Kohli



## Creating dataframe were Virat Kohli is striker

In [None]:
#data for Virat Kohli

kohli = innings[innings["striker"] == "V Kohli"]

In [None]:
kohli.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,cumulative_runs,run_rate,wickets_remaining,target,runs_needed,required_run_rate,winner,win_by_runs,win_by_wickets,result
2425,5,2023/24,2023-10-08,"MA Chidambaram Stadium, Chepauk, Chennai",2,0.6,India,Australia,V Kohli,RG Sharma,...,2,2.0,9.0,200.0,198.0,4.040816,India,0.0,6.0,1


In [None]:
#Virat Kohli's innings score
kohli["innings_score"] = kohli.groupby("match_id")["runs_off_bat"].cumsum()

#Kohli's balls faced till now
kohli["balls_faced"] = kohli.groupby("match_id").cumcount() + 1

#Number of times Kohli got out till now
kohli["times_out"] = kohli["wicket"].cumsum()

#Total runs of Kohli till now
kohli["total_kohli_runs"] = kohli["runs_off_bat"].cumsum()

#yotal balls faced by Kohli till now
kohli["total_kohli_balls"] = range(1, len(kohli) + 1)

#Virat Kohli's average (handle division by zero)
kohli["average"] = kohli["total_kohli_runs"] / np.where(kohli["times_out"] > 0, kohli["times_out"], 1)

#Virat Kohli's strike rate
kohli["strike_rate"] = (kohli["total_kohli_runs"] * 100) / kohli["total_kohli_balls"]

kohli["average"] = kohli["average"].astype(float)
kohli["strike_rate"] = kohli["strike_rate"].astype(float)

kohli.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kohli["innings_score"] = kohli.groupby("match_id")["runs_off_bat"].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kohli["balls_faced"] = kohli.groupby("match_id").cumcount() + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kohli["times_out"] = kohli["wicket"].cumsum()
A value is trying t

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,win_by_runs,win_by_wickets,result,innings_score,balls_faced,times_out,total_kohli_runs,total_kohli_balls,average,strike_rate
2425,5,2023/24,2023-10-08,"MA Chidambaram Stadium, Chepauk, Chennai",2,0.6,India,Australia,V Kohli,RG Sharma,...,0.0,6.0,1,0,1,0,0,1,0.0,0.0
2426,5,2023/24,2023-10-08,"MA Chidambaram Stadium, Chepauk, Chennai",2,0.7,India,Australia,V Kohli,RG Sharma,...,0.0,6.0,1,0,2,0,0,2,0.0,0.0
2433,5,2023/24,2023-10-08,"MA Chidambaram Stadium, Chepauk, Chennai",2,2.1,India,Australia,V Kohli,KL Rahul,...,0.0,6.0,1,2,3,0,2,3,2.0,66.666667
2434,5,2023/24,2023-10-08,"MA Chidambaram Stadium, Chepauk, Chennai",2,2.2,India,Australia,V Kohli,KL Rahul,...,0.0,6.0,1,2,4,0,2,4,2.0,50.0
2435,5,2023/24,2023-10-08,"MA Chidambaram Stadium, Chepauk, Chennai",2,2.3,India,Australia,V Kohli,KL Rahul,...,0.0,6.0,1,2,5,0,2,5,2.0,40.0


In [None]:
kohli.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'wicket', 'over', 'ball_num', 'ball_left',
       'total_runs', 'innings_total', 'cumulative_runs', 'run_rate',
       'wickets_remaining', 'target', 'runs_needed', 'required_run_rate',
       'winner', 'win_by_runs', 'win_by_wickets', 'result', 'innings_score',
       'balls_faced', 'times_out', 'total_kohli_runs', 'total_kohli_balls',
       'average', 'strike_rate'],
      dtype='object')

In [None]:
kohli["team_total"] = kohli["innings_total"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kohli["team_total"] = kohli["innings_total"]


In [None]:
kohli = kohli[["match_id", "venue", "innings", "bowling_team", "runs_off_bat", "ball_left", "run_rate", "team_total", "wickets_remaining", "target", "runs_needed","required_run_rate" ,"average", "strike_rate"]]

In [None]:
kohli.isna().sum()

match_id             0
venue                0
innings              0
bowling_team         0
runs_off_bat         0
ball_left            0
run_rate             0
team_total           0
wickets_remaining    0
target               0
runs_needed          0
required_run_rate    0
average              0
strike_rate          0
dtype: int64

In [None]:
# Check the number of rows with inf values in the entire DataFrame
num_rows_with_inf = (kohli.select_dtypes(include=[np.number]).applymap(np.isinf)).any(axis=1).sum()

print("Number of rows with inf values:", num_rows_with_inf)



Number of rows with inf values: 0


In [None]:
# one hot encoding on venue and bowling team
kohli = pd.get_dummies(kohli, columns=["venue", "bowling_team"])
# kohli = kohli.drop(["venue", "bowling_team"], axis=1)

In [None]:
kohli.columns

Index(['match_id', 'innings', 'runs_off_bat', 'ball_left', 'run_rate',
       'team_total', 'wickets_remaining', 'target', 'runs_needed',
       'required_run_rate', 'average', 'strike_rate',
       'venue_Arun Jaitley Stadium, Delhi',
       'venue_Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow',
       'venue_Himachal Pradesh Cricket Association Stadium, Dharamsala',
       'venue_MA Chidambaram Stadium, Chepauk, Chennai',
       'venue_Maharashtra Cricket Association Stadium, Pune',
       'venue_Narendra Modi Stadium, Ahmedabad', 'bowling_team_Afghanistan',
       'bowling_team_Australia', 'bowling_team_Bangladesh',
       'bowling_team_England', 'bowling_team_New Zealand',
       'bowling_team_Pakistan'],
      dtype='object')

In [None]:
# Extract the correlation of 'runs_off_bat' with every other column
correlation_with_runs = correlation_matrix['runs_off_bat'].sort_values(ascending=False)

# Print the correlation values
print(correlation_with_runs)


runs_off_bat                                                                   1.000000
strike_rate                                                                    0.132047
target                                                                         0.130968
run_rate                                                                       0.127525
innings                                                                        0.105996
average                                                                        0.097578
team_total                                                                     0.088208
bowling_team_Bangladesh                                                        0.085012
venue_Maharashtra Cricket Association Stadium, Pune                            0.085012
wickets_remaining                                                              0.072129
venue_Arun Jaitley Stadium, Delhi                                              0.032652
bowling_team_Afghanistan        

In [None]:
# Separate target variable and features
kohli_train = kohli.drop(["runs_off_bat"], axis=1)
kohli_test = kohli["runs_off_bat"]

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(kohli_train, kohli_test, test_size=0.2, random_state=42)

In [None]:
X_train.shape

(327, 24)

In [None]:
X_test.shape

(82, 24)

In [None]:
X_train

Unnamed: 0,match_id,innings,ball_left,run_rate,team_total,wickets_remaining,target,runs_needed,required_run_rate,average,...,"venue_MA Chidambaram Stadium, Chepauk, Chennai","venue_Maharashtra Cricket Association Stadium, Pune","venue_Narendra Modi Stadium, Ahmedabad",bowling_team_Afghanistan,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_New Zealand,bowling_team_Pakistan,runs_per_ball
2631,5,2,93,4.318841,201,7.0,200.0,51.0,3.290323,79.0,...,1,0,0,0,1,0,0,0,0,0.009174
10891,21,2,46,5.669291,274,5.0,274.0,34.0,4.434783,167.0,...,0,0,0,0,0,0,0,1,0,0.000000
8586,17,2,130,6.141176,261,8.0,257.0,83.0,3.830769,104.0,...,0,1,0,0,0,1,0,0,0,0.000000
4993,9,2,90,7.800000,273,8.0,273.0,0.0,0.000000,140.0,...,0,0,0,1,0,0,0,0,0,0.022727
2434,5,2,286,1.714286,201,7.0,200.0,196.0,4.111888,2.0,...,1,0,0,0,1,0,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2567,5,2,154,3.863014,201,7.0,200.0,106.0,4.129870,46.0,...,1,0,0,0,1,0,0,0,0,0.013889
2628,5,2,96,4.294118,201,7.0,200.0,54.0,3.375000,77.0,...,1,0,0,0,1,0,0,0,0,0.009346
8643,17,2,74,6.026549,261,7.0,257.0,30.0,2.432432,114.0,...,0,1,0,0,0,1,0,0,0,0.003690
10815,21,2,121,5.597765,274,7.0,274.0,107.0,5.305785,150.5,...,0,0,0,0,0,0,0,1,0,0.002865


## Regression for predicting run scored by Virat Kohli

In [258]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Separate target variable and features
X = kohli.drop(["runs_off_bat"], axis=1)
y = kohli["runs_off_bat"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for RandomForestRegressor
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Include any preprocessing steps if needed
    ('regressor', RandomForestRegressor())
])

# Create a pipeline for Linear Regression
linear_params = {}
linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Include any preprocessing steps if needed
    ('regressor', LinearRegression())
])

# Define hyperparameters to tune for RandomForestRegressor
rf_params = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [5, 10, 20]
}

# Create a pipeline for XGBRegressor
xgb_pipeline = Pipeline([
    ('regressor', XGBRegressor())
])

# Define hyperparameters to tune for XGBRegressor
xgb_params = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [5, 10, 20]
}

# Create a pipeline for MLPRegressor (Neural Network)
mlp_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize input features for Neural Network
    ('regressor', MLPRegressor())
])

# Define hyperparameters to tune for MLPRegressor
mlp_params = {
    'regressor__hidden_layer_sizes': [(50, 50), (100, 50, 100)],
    'regressor__max_iter': [100, 200, 300],
}

# Create a dictionary of models and their respective hyperparameters
models = {
    'LinearRegression': (linear_pipeline, linear_params),
    'RandomForest': (rf_pipeline, rf_params),
    'XGBoost': (xgb_pipeline, xgb_params),
    'NeuralNetwork': (mlp_pipeline, mlp_params),
}

# Loop through each model, perform grid search for hyperparameter tuning, and print results
for model_name, (pipeline, params) in models.items():
    print(f"\nTraining and evaluating {model_name} as a regression problem...")
    grid_search = GridSearchCV(pipeline, params, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train)

    # Print best parameters and score
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best mean squared error: {-grid_search.best_score_}")

    # Evaluate the model on the test set
    y_pred = grid_search.predict(X_test)
    print("Test results")
    print(f"{model_name}'s predicted runs: {y_pred.sum()}")
    print(f"{model_name}'s actual runs: {y_test.sum()}")



Training and evaluating LinearRegression as a regression problem...
Best parameters for LinearRegression: {}
Best mean squared error: 4.550011388176189
Test results
LinearRegression's predicted runs: 69.47783023030684
LinearRegression's actual runs: 59

Training and evaluating RandomForest as a regression problem...
Best parameters for RandomForest: {'regressor__max_depth': 10, 'regressor__n_estimators': 50}
Best mean squared error: 0.1739497808857809
Test results
RandomForest's predicted runs: 58.1
RandomForest's actual runs: 59

Training and evaluating XGBoost as a regression problem...
Best parameters for XGBoost: {'regressor__max_depth': 5, 'regressor__n_estimators': 200}
Best mean squared error: 0.1357182851654382
Test results
XGBoost's predicted runs: 58.17915725708008
XGBoost's actual runs: 59

Training and evaluating NeuralNetwork as a regression problem...




Best parameters for NeuralNetwork: {'regressor__hidden_layer_sizes': (50, 50), 'regressor__max_iter': 200}
Best mean squared error: 2.4304867870494724
Test results
NeuralNetwork's predicted runs: 58.77629452066582
NeuralNetwork's actual runs: 59


