# Making data ready for preprocessing


In [1]:
import pandas as pd

import numpy as np 

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [4]:
df = pd.read_csv("../derived/final_stats_data.csv",low_memory=False)
df.columns

Index(['ID', 'innings', 'overs', 'ballnumber', 'batter', 'bowler',
       'non-striker', 'extra_type', 'batsman_run', 'extras_run', 'total_run',
       'non_boundary', 'isWicketDelivery', 'player_out', 'kind',
       'fielders_involved', 'BattingTeam', 'City', 'Date', 'Season',
       'MatchNumber', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision',
       'SuperOver', 'WinningTeam', 'WonBy', 'Margin', 'method',
       'Player_of_Match', 'Team1Players', 'Team2Players', 'Umpire1', 'Umpire2',
       'BowlingTeam', 'batter_matches_played', 'runs_scored', 'dismissals',
       'balls_faced', '0s_scored', '1s_scored', '2s_scored', '4s_scored',
       '6s_scored', 'high_score', '25_scored', '50_scored', '75_scored',
       '100_scored', 'strike_rate_x', 'batting_average', 'notout',
       'explosivity_rating', '0_wickets_taken', '1_wickets_taken',
       '2_wickets_taken', '3_wickets_taken', '4_wickets_taken',
       '5_wickets_taken', '6_wickets_taken', 'bowler_matches_played',
       

In [5]:
df.drop(['ID','non-striker', 'extra_type',
       'non_boundary', 'player_out', 'kind',
       'fielders_involved', 'City', 'Season',
       'MatchNumber','SuperOver','WonBy', 'Margin', 'method',
       'Player_of_Match', 'Team1Players', 'Team2Players', 'Umpire1', 'Umpire2','WinningTeam', 'Team2', 'extras_run','batsman_run'], axis = 1, inplace = True)

In [6]:
df['total_run'].unique()

array([0, 1, 4, 6, 3, 2, 5, 7], dtype=int64)

Preprocessing the data such that there is no NaN values


In [7]:
df['delivery_type'] = np.where(df['isWicketDelivery'] == 1, '8', df['total_run'].astype(str))
df.drop(['total_run', 'isWicketDelivery'], axis=1, inplace=True)

In [8]:
df['delivery_type'].unique()

array(['0', '1', '4', '6', '3', '2', '8', '5', '7'], dtype=object)

In [9]:
df = df.head(10000)

In [10]:
# Categorical columns to be one-hot encoded
categorical_columns = ['batter', 'bowler', 'BattingTeam', 'BowlingTeam', 'Venue', 'TossWinner', 'TossDecision', 'delivery_type']

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform on the training data
encoder.fit(df[categorical_columns])

# Transform the specified categorical columns to one-hot encoded representation
one_hot_encoded = encoder.transform(df[categorical_columns])

# Concatenate the one-hot encoded features with the original DataFrame
df = pd.concat([df, pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))], axis=1)

# Drop the original categorical columns
df = df.drop(categorical_columns, axis=1)


In [11]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [12]:
df.head(5)

Unnamed: 0,innings,overs,ballnumber,Date,Team1,batter_matches_played,runs_scored,dismissals,balls_faced,0s_scored,1s_scored,2s_scored,4s_scored,6s_scored,high_score,25_scored,50_scored,75_scored,100_scored,strike_rate_x,batting_average,notout,explosivity_rating,0_wickets_taken,1_wickets_taken,2_wickets_taken,3_wickets_taken,4_wickets_taken,5_wickets_taken,6_wickets_taken,bowler_matches_played,runs_conceded,extras_runs_conceded,wickets_taken,balls_bowled,4s_conceded,6s_conceded,0s_conceded,1s_conceded,2s_conceded,highest_conceded,strike_rate_y,bowling_average,economy,total_runs_conceded,target,current_score,balls_left,wickets_left,runs_left,batter_A Badoni,batter_A Manohar,batter_A Nortje,batter_A Tomar,batter_AD Russell,batter_AJ Finch,batter_AK Markram,batter_AM Rahane,batter_AR Patel,batter_AS Joseph,batter_AS Roy,batter_AT Rayudu,batter_Abhishek Sharma,batter_Anuj Rawat,batter_Arshdeep Singh,batter_Avesh Khan,batter_B Indrajith,batter_B Kumar,batter_B Sai Sudharsan,batter_D Brevis,batter_D Padikkal,batter_D Pretorius,batter_DA Miller,batter_DA Warner,batter_DJ Bravo,batter_DJ Hooda,batter_DJ Mitchell,batter_DP Conway,batter_DR Sams,batter_E Lewis,batter_F du Plessis,batter_Fazalhaq Farooqi,batter_GJ Maxwell,batter_HE van der Dussen,batter_HH Pandya,batter_HR Shokeen,batter_HV Patel,batter_Harpreet Brar,batter_Harshit Rana,batter_Ishan Kishan,batter_J Suchith,batter_JC Buttler,batter_JD Unadkat,batter_JDS Neesham,batter_JJ Bumrah,batter_JM Bairstow,batter_JM Sharma,batter_JO Holder,batter_JR Hazlewood,batter_K Kartikeya,batter_K Rabada,batter_KA Pollard,batter_KD Karthik,batter_KH Pandya,batter_KK Ahmed,batter_KK Nair,batter_KL Rahul,batter_KS Bharat,batter_KS Sharma,batter_KS Williamson,batter_Kartik Tyagi,batter_Kuldeep Yadav,batter_LH Ferguson,batter_LS Livingstone,batter_Lalit Yadav,batter_M Ashwin,batter_M Jansen,batter_M Prasidh Krishna,batter_M Shahrukh Khan,batter_M Theekshana,batter_M Vohra,batter_MA Agarwal,batter_MJ Santner,batter_MK Lomror,batter_MK Pandey,batter_MM Ali,batter_MP Stoinis,batter_MR Marsh,batter_MS Dhoni,batter_MS Wade,batter_Mandeep Singh,batter_Mohammed Siraj,batter_Mohsin Khan,batter_Mukesh Choudhary,batter_N Jagadeesan,batter_N Pooran,batter_N Rana,batter_OC McCoy,batter_PBB Rajapaksa,batter_PJ Cummins,batter_PJ Sangwan,batter_PK Garg,batter_PN Mankad,batter_PP Shaw,batter_PVD Chameera,batter_PWH de Silva,batter_Q de Kock,batter_R Ashwin,batter_R Dhawan,batter_R Parag,batter_R Powell,batter_R Sanjay Yadav,batter_R Shepherd,batter_R Tewatia,batter_RA Jadeja,batter_RA Tripathi,batter_RD Chahar,batter_RD Gaikwad,batter_RG Sharma,batter_RK Singh,batter_RM Patidar,batter_RP Meredith,batter_RR Pant,batter_RV Patel,batter_RV Uthappa,batter_Ramandeep Singh,batter_Rashid Khan,batter_S Dhawan,batter_S Dube,batter_S Gopal,batter_SA Abbott,batter_SA Yadav,batter_SN Khan,batter_SN Thakur,batter_SO Hetmyer,batter_SP Jackson,batter_SP Narine,batter_SS Iyer,batter_SS Prabhudessai,batter_SV Samson,batter_SW Billings,batter_Shahbaz Ahmed,batter_Shashank Singh,batter_Shivam Mavi,batter_Shubman Gill,batter_Simarjeet Singh,batter_T Stubbs,batter_TA Boult,batter_TG Southee,batter_TH David,batter_Tilak Varma,batter_UT Yadav,batter_Umran Malik,batter_V Kohli,batter_VR Iyer,batter_WP Saha,batter_Washington Sundar,batter_YBK Jaiswal,batter_Yash Dayal,bowler_A Badoni,bowler_A Nortje,bowler_AD Russell,bowler_AK Markram,bowler_AR Patel,bowler_AS Joseph,bowler_AS Roy,bowler_Abhishek Sharma,bowler_Arshdeep Singh,bowler_Avesh Khan,bowler_B Kumar,bowler_C Sakariya,bowler_CV Varun,bowler_D Pretorius,bowler_DJ Bravo,bowler_DJ Mitchell,bowler_DR Sams,bowler_Fazalhaq Farooqi,bowler_GJ Maxwell,bowler_HH Pandya,bowler_HR Shokeen,bowler_HV Patel,bowler_Harpreet Brar,bowler_Harshit Rana,bowler_J Suchith,bowler_JD Unadkat,bowler_JJ Bumrah,bowler_JO Holder,bowler_JR Hazlewood,bowler_K Gowtham,bowler_K Kartikeya,bowler_K Rabada,bowler_KA Pollard,bowler_KH Pandya,bowler_KK Ahmed,bowler_KR Sen,bowler_Kartik Tyagi,bowler_Kuldeep Yadav,bowler_LH Ferguson,bowler_LS Livingstone,bowler_Lalit Yadav,bowler_M Ashwin,bowler_M Jansen,bowler_M Markande,bowler_M Pathirana,bowler_M Prasidh Krishna,bowler_M Theekshana,bowler_MJ Santner,bowler_MK Lomror,bowler_MM Ali,bowler_MP Stoinis,bowler_MR Marsh,bowler_Mohammed Shami,bowler_Mohammed Siraj,bowler_Mohsin Khan,bowler_Mukesh Choudhary,bowler_Mustafizur Rahman,bowler_N Rana,bowler_NT Ellis,bowler_OC McCoy,bowler_PH Solanki,bowler_PJ Cummins,bowler_PJ Sangwan,bowler_PVD Chameera,bowler_PWH de Silva,bowler_R Ashwin,bowler_R Dhawan,bowler_R Parag,bowler_R Sai Kishore,bowler_R Sanjay Yadav,bowler_R Shepherd,bowler_R Tewatia,bowler_RA Jadeja,bowler_RD Chahar,bowler_RP Meredith,bowler_Ramandeep Singh,bowler_Rashid Khan,bowler_Ravi Bishnoi,bowler_S Gopal,bowler_S Kaul,bowler_SA Abbott,bowler_SN Thakur,bowler_SP Narine,bowler_SS Iyer,bowler_Sandeep Sharma,bowler_Shahbaz Ahmed,bowler_Shashank Singh,bowler_Shivam Mavi,bowler_Simarjeet Singh,bowler_T Natarajan,bowler_TA Boult,bowler_TG Southee,bowler_UT Yadav,bowler_Umran Malik,bowler_VR Iyer,bowler_Washington Sundar,bowler_YS Chahal,bowler_Yash Dayal,BattingTeam_Chennai Super Kings,BattingTeam_Delhi Capitals,BattingTeam_Gujarat Titans,BattingTeam_Kolkata Knight Riders,BattingTeam_Lucknow Super Giants,BattingTeam_Mumbai Indians,BattingTeam_Punjab Kings,BattingTeam_Rajasthan Royals,BattingTeam_Royal Challengers Bangalore,BattingTeam_Sunrisers Hyderabad,BowlingTeam_Chennai Super Kings,BowlingTeam_Delhi Capitals,BowlingTeam_Gujarat Titans,BowlingTeam_Kolkata Knight Riders,BowlingTeam_Lucknow Super Giants,BowlingTeam_Mumbai Indians,BowlingTeam_Punjab Kings,BowlingTeam_Rajasthan Royals,BowlingTeam_Royal Challengers Bangalore,BowlingTeam_Sunrisers Hyderabad,"Venue_Brabourne Stadium, Mumbai","Venue_Dr DY Patil Sports Academy, Mumbai","Venue_Eden Gardens, Kolkata","Venue_Maharashtra Cricket Association Stadium, Pune","Venue_Narendra Modi Stadium, Ahmedabad","Venue_Wankhede Stadium, Mumbai",TossWinner_Chennai Super Kings,TossWinner_Delhi Capitals,TossWinner_Gujarat Titans,TossWinner_Kolkata Knight Riders,TossWinner_Lucknow Super Giants,TossWinner_Mumbai Indians,TossWinner_Punjab Kings,TossWinner_Rajasthan Royals,TossWinner_Royal Challengers Bangalore,TossWinner_Sunrisers Hyderabad,TossDecision_bat,TossDecision_field,delivery_type_0,delivery_type_1,delivery_type_2,delivery_type_3,delivery_type_4,delivery_type_5,delivery_type_6,delivery_type_7,delivery_type_8
0,1,0,1,2022-05-29,Rajasthan Royals,23,547,23,406,177.0,133.0,14.0,62.0,22.0,68,6.0,3.0,0.0,0.0,134.729064,23.782609,0,0.21,21.0,35.0,23.0,10.0,1.0,0.0,0.0,90,2738,164,115,2014,299.0,103.0,897.0,653.0,120.0,52,17.513043,23.808696,8.156902,2902,131,0,119,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,2,2022-05-29,Rajasthan Royals,23,547,23,406,177.0,133.0,14.0,62.0,22.0,68,6.0,3.0,0.0,0.0,134.729064,23.782609,0,0.21,21.0,35.0,23.0,10.0,1.0,0.0,0.0,90,2738,164,115,2014,299.0,103.0,897.0,653.0,120.0,52,17.513043,23.808696,8.156902,2902,131,1,118,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,3,2022-05-29,Rajasthan Royals,79,2736,69,1849,761.0,615.0,130.0,271.0,126.0,124,19.0,14.0,6.0,5.0,147.971877,39.652174,10,0.21,21.0,35.0,23.0,10.0,1.0,0.0,0.0,90,2738,164,115,2014,299.0,103.0,897.0,653.0,120.0,52,17.513043,23.808696,8.156902,2902,131,2,117,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,4,2022-05-29,Rajasthan Royals,23,547,23,406,177.0,133.0,14.0,62.0,22.0,68,6.0,3.0,0.0,0.0,134.729064,23.782609,0,0.21,21.0,35.0,23.0,10.0,1.0,0.0,0.0,90,2738,164,115,2014,299.0,103.0,897.0,653.0,120.0,52,17.513043,23.808696,8.156902,2902,131,2,116,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,5,2022-05-29,Rajasthan Royals,23,547,23,406,177.0,133.0,14.0,62.0,22.0,68,6.0,3.0,0.0,0.0,134.729064,23.782609,0,0.21,21.0,35.0,23.0,10.0,1.0,0.0,0.0,90,2738,164,115,2014,299.0,103.0,897.0,653.0,120.0,52,17.513043,23.808696,8.156902,2902,131,2,115,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
import pandas as pd

sequence_length = 6


def create_sequences(group):
    sequences = []
    for inning in group['innings'].unique():
        inning_data = group[group['innings'] == inning]
        
        for i in range(len(inning_data) - sequence_length + 1):
            sequence = inning_data.iloc[i:i + sequence_length].copy()
            
            # Drop unnecessary columns for the sequence (modify as needed)
            sequence = sequence.drop(['Date', 'Team1'], axis=1)
            
            # Drop 'delivery_type' from the sequence if it is present
            if 'delivery_type' in sequence.columns:
                sequence = sequence.drop(['delivery_type'], axis=1)
            
            sequences.append(sequence)
    return sequences

# Group by the relevant columns (including one-hot encoded columns)
grouped_df = df.groupby(['Date', 'Team1']).apply(create_sequences)
sequences = [item for sublist in grouped_df for item in sublist]


In [14]:
(df.isna().sum()==1).sum()

0

In [15]:
# Get a list of all unique columns across all sequences
all_columns = set()
for sequence in sequences:
    all_columns.update(sequence.columns)

# Ensure all sequences have the same columns
for i, sequence in enumerate(sequences):
    missing_columns = list(all_columns - set(sequence.columns))
    if missing_columns:
        # Add missing columns with NaN values
        sequences[i] = pd.concat([sequence, pd.DataFrame(columns=missing_columns)], axis=1)

# Convert sequences to a 3D NumPy array
data = np.array([sequence[list(all_columns)].values for sequence in sequences])

# Extract the target one-hot encoded values for each sequence from the last columns
target_columns = [f'delivery_type_{i}' for i in range(9)]
target = np.array([sequence[target_columns].values[-1] for sequence in sequences])


In [16]:
data[3], target[3]

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [13]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

inf_indices = np.where(np.isinf(X_train))

inf_rows = inf_indices[0]
inf_cols = inf_indices[1]

# Remove rows with infinity values
X_train_cleaned = np.delete(X_train, inf_rows, axis=0)
y_train_cleaned = np.delete(y_train, inf_rows, axis=0)

inf_indices = np.where(np.isinf(X_test))

inf_rows = inf_indices[0]
inf_cols = inf_indices[1]

# Remove rows with infinity values
X_test_cleaned = np.delete(X_test, inf_rows, axis=0)
y_test_cleaned = np.delete(y_test, inf_rows, axis=0)






class CricketDataset(torch.utils.data.Dataset):
    def __init__(self, sequences, targets):
        # Convert sequences to a compatible numeric type (float32)
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        return self.sequences[index], self.targets[index]

# Create DataLoader instances for training and testing
batch_size = 120
train_dataset = CricketDataset(X_train_cleaned, y_train_cleaned)
test_dataset = CricketDataset(X_test_cleaned, y_test_cleaned)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [14]:
for x, y in train_loader:
    print(len(x), len(y))
    break

120 120


In [15]:
# import torch
# import torch.nn.functional as F


# class MyLSTM(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(MyLSTM, self).__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x):
#         out, _ = self.lstm(x)
#         out = self.fc(out[:, -1, :])  # Extract the output of the last time step
#         return out
        
# class MyLSTMWithReLU(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(MyLSTMWithReLU, self).__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
#         self.relu = nn.ReLU()
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x):
#         out, _ = self.lstm(x)
#         out = self.relu(out)  # Apply ReLU activation
#         out = self.fc(out[:, -1, :])  # Extract the output of the last time step
#         return out

# import torch
# import torch.nn as nn

# class MyGRU(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(MyGRU, self).__init__()
#         self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x):
#         out, _ = self.gru(x)
#         out = self.fc(out[:, -1, :])  # Extract the output of the last time step
#         return out

# class MyGRUWithReLU(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(MyGRUWithReLU, self).__init__()
#         self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
#         self.relu = nn.ReLU()
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x):
#         out, _ = self.gru(x)
#         out = self.relu(out)  # Apply ReLU activation
#         out = self.fc(out[:, -1, :])  # Extract the output of the last time step
#         return out
    
class MyLSTMWithSoftmax(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyLSTMWithSoftmax, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.relu(out)  # Apply ReLU activation
        out = self.fc(out[:, -1, :])  # Extract the output of the last time step
        out = F.softmax(out, dim=1)  # Apply softmax activation
        return out


In [16]:

# Define model dimensions
input_size = data.shape[2]  
hidden_size = 64  
output_size = 9

# Create the model
# model = MyLSTM(input_size, hidden_size, output_size)
# model = MyLSTMWithReLU(input_size, hidden_size, output_size)
# model = MyGRUWithReLU(input_size, hidden_size, output_size)
# model = MyGRU(input_size, hidden_size, output_size)
model = MyLSTMWithSoftmax(input_size, hidden_size, output_size)


# Define the loss function
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [18]:
num_epochs = 10
for inputs, labels in train_loader:
    print(inputs[0], labels[0])
    break

for epoch in range(num_epochs):
    model.train()

# Inside the training loop
    for inputs, labels in train_loader: 
    
        # Assuming 'inputs' and 'labels' are torch Tensors
        outputs = model(inputs)
        
        # Convert labels to long data type
        labels = labels.float()

    
        # Compute the loss
        loss = criterion(outputs, labels)
    
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()

        #uopdate weights
        optimizer.step()


    # Print the loss at the end of each epoch (optional)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor([0., 0., 1., 0., 0., 0., 0., 0., 0.])
Epoch 1/10, Loss: 2.1898744106292725
Epoch 2/10, Loss: 2.1768150329589844
Epoch 3/10, Loss: 2.1615238189697266
Epoch 4/10, Loss: 2.1442677974700928
Epoch 5/10, Loss: 2.1223502159118652
Epoch 6/10, Loss: 2.1084964275360107
Epoch 7/10, Loss: 2.0940213203430176
Epoch 8/10, Loss: 2.0814359188079834
Epoch 9/10, Loss: 2.069685935974121
Epoch 10/10, Loss: 2.0583274364471436


In [20]:
model.eval()

correct_predictions = 0
total_samples = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        predictions = model(inputs)
        
        # Assuming binary classification
        predicted_labels = torch.round(predictions)
        
        # Count correct predictions
        correct_predictions += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)
        print(inputs[0], np.argmax(predictions[0]), labels[0])

# Calculate accuracy
accuracy = correct_predictions / total_samples
print(f'Accuracy: {accuracy}')


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor(1) tensor([0., 0., 0., 0., 0., 0., 0., 0., 1.])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor(1) tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor(1) tensor([0., 1., 0., 0., 0., 0., 0., 0., 0.])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0

In [None]:
model.eval()

correct_predictions = 0
total_samples = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        predictions = model(inputs)
        
        # Assuming binary classification
        predicted_labels = torch.round(predictions)
        
        # Count correct predictions
        correct_predictions += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)
        for elem in predictions:
            if np.argmax(elem).item() > 1:
                print("well done")
                break

# Calculate accuracy
accuracy = correct_predictions / total_samples
print(f'Accuracy: {accuracy}')


In [None]:
import torch

# Assuming 'output' is your tensor or array
output = torch.tensor([0.5, 0.2, 0.8, 0.3])

# Find the index of the maximum value
max_index = torch.argmax(output)

print("Index of the maximum value:", max_index.item())
