In [1]:
import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv("../derived/final_stats_data.csv",low_memory=False)

In [3]:
df.columns

Index(['ID', 'innings', 'overs', 'ballnumber', 'batter', 'bowler',
       'non-striker', 'extra_type', 'batsman_run', 'extras_run', 'total_run',
       'non_boundary', 'isWicketDelivery', 'player_out', 'kind',
       'fielders_involved', 'BattingTeam', 'City', 'Date', 'Season',
       'MatchNumber', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision',
       'SuperOver', 'WinningTeam', 'WonBy', 'Margin', 'method',
       'Player_of_Match', 'Team1Players', 'Team2Players', 'Umpire1', 'Umpire2',
       'BowlingTeam', 'batter_matches_played', 'runs_scored', 'dismissals',
       'balls_faced', '0s_scored', '1s_scored', '2s_scored', '4s_scored',
       '6s_scored', 'high_score', '25_scored', '50_scored', '75_scored',
       '100_scored', 'strike_rate_x', 'batting_average', 'notout',
       'explosivity_rating', '0_wickets_taken', '1_wickets_taken',
       '2_wickets_taken', '3_wickets_taken', '4_wickets_taken',
       '5_wickets_taken', '6_wickets_taken', 'bowler_matches_played',
       

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Assuming 'df' is your DataFrame
columns_to_encode = ['Season', 'BattingTeam', 'BowlingTeam']

# Create the OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse=False)

# Fit and transform the specified columns
encoded_columns = encoder.fit_transform(df[columns_to_encode])

# Create a DataFrame with the encoded columns
df_encoded = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(columns_to_encode))

# Concatenate the original DataFrame and the new encoded DataFrame
df = pd.concat([df, df_encoded], axis=1)

# Drop the original columns that were encoded
df.drop(columns=columns_to_encode, inplace=True)

# Now, 'df' contains the original data with the One-Hot Encoded columns




In [5]:
df.columns

Index(['ID', 'innings', 'overs', 'ballnumber', 'batter', 'bowler',
       'non-striker', 'extra_type', 'batsman_run', 'extras_run',
       ...
       'BowlingTeam_Kochi Tuskers Kerala', 'BowlingTeam_Kolkata Knight Riders',
       'BowlingTeam_Lucknow Super Giants', 'BowlingTeam_Mumbai Indians',
       'BowlingTeam_Pune Warriors', 'BowlingTeam_Punjab Kings',
       'BowlingTeam_Rajasthan Royals', 'BowlingTeam_Rising Pune Supergiant',
       'BowlingTeam_Royal Challengers Bangalore',
       'BowlingTeam_Sunrisers Hyderabad'],
      dtype='object', length=121)

Scaling


In [13]:
from sklearn.preprocessing import MinMaxScaler

# Specify the columns to scale
columns_to_scale = ['strike_rate_x', 'batting_average', 'strike_rate_y', 'bowling_average', 'economy']

# Initialize the scaler
scaler = MinMaxScaler()

# Fit and transform the selected columns
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])


In [11]:
print(df[columns_to_scale].max())
print(df[columns_to_scale].min())
print(df[columns_to_scale].isnull().sum())


strike_rate_x      400.0
batting_average      inf
strike_rate_y        inf
bowling_average      inf
economy             36.0
dtype: float64
strike_rate_x      0.0
batting_average    0.0
strike_rate_y      1.0
bowling_average    0.0
economy            0.0
dtype: float64
strike_rate_x      0
batting_average    0
strike_rate_y      0
bowling_average    0
economy            0
dtype: int64


In [12]:
# Replace infinite or too large values with the median
df[columns_to_scale] = df[columns_to_scale].replace([np.inf, -np.inf], np.nan)
df[columns_to_scale] = df[columns_to_scale].fillna(df[columns_to_scale].median())


split dataset


In [14]:
from sklearn.model_selection import train_test_split

# Assuming df is your preprocessed DataFrame
unique_matches = df['ID'].unique()
train_matches, test_matches = train_test_split(unique_matches, test_size=0.2, random_state=42)

train_data = df[df['ID'].isin(train_matches)]
test_data = df[df['ID'].isin(test_matches)]


In [15]:
import numpy as np

# Find the intersection of unique match IDs
intersection_ids = np.intersect1d(train_data['ID'].unique(), test_data['ID'].unique())

# Print the intersection
print("Intersection of Match IDs:", intersection_ids)


Intersection of Match IDs: []


sequence creation


In [37]:

# Create a list to store sequences
sequences = []

# Iterate over each match
for match_id, match_group in train_data.groupby('ID'):
    # Iterate over each innings
    for inning_id, inning_group in match_group.groupby('innings'):
        # Extract the relevant features for the sequence
        features = inning_group[['innings', 'overs', 'ballnumber','strike_rate_x', 'batting_average', 'strike_rate_y', 'bowling_average',
                                 'economy', 'current_score', 'balls_left', 'wickets_left', 'runs_left','total_run','isWicketDelivery']]

        # Create dynamic sequences based on the actual number of balls in each over
        for i in range(len(inning_group)):
            sequence = features.iloc[:i + 1]  # Adjust the sequence length dynamically
            sequences.append(sequence)

# Convert the list of sequences to a DataFrame
sequences_df = pd.concat(sequences, ignore_index=True)

sequences_df.head(10)



Unnamed: 0,innings,overs,ballnumber,strike_rate_x,batting_average,strike_rate_y,bowling_average,economy,current_score,balls_left,wickets_left,runs_left,total_run,isWicketDelivery
0,1,0,1,0.268327,0.288808,0.286548,0.233182,0.204627,1,119,10,0,1,0
1,1,0,1,0.268327,0.288808,0.286548,0.233182,0.204627,1,119,10,0,1,0
2,1,0,2,0.328585,0.307474,0.286548,0.233182,0.204627,1,118,10,0,0,0
3,1,0,1,0.268327,0.288808,0.286548,0.233182,0.204627,1,119,10,0,1,0
4,1,0,2,0.328585,0.307474,0.286548,0.233182,0.204627,1,118,10,0,0,0
5,1,0,3,0.328585,0.307474,0.286548,0.233182,0.204627,2,117,10,0,1,0
6,1,0,1,0.268327,0.288808,0.286548,0.233182,0.204627,1,119,10,0,1,0
7,1,0,2,0.328585,0.307474,0.286548,0.233182,0.204627,1,118,10,0,0,0
8,1,0,3,0.328585,0.307474,0.286548,0.233182,0.204627,2,117,10,0,1,0
9,1,0,4,0.328585,0.307474,0.286548,0.233182,0.204627,2,116,10,0,0,0


In [39]:
sequences_df.shape

(10835357, 14)

In [38]:
seq_df = sequences_df.copy()

In [40]:
seq_df.shape

(10835357, 14)

loader somethign


In [41]:
import torch
from torch.utils.data import Dataset, DataLoader

class CricketDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

# Assuming seq_df contains your sequences and you have corresponding targets
targets_df = seq_df[['total_run', 'isWicketDelivery']]
seq_df.drop(['total_run', 'isWicketDelivery'], axis=1, inplace=True)

# Convert seq_df and targets_df to numpy arrays
sequences_np = seq_df.to_numpy()
targets_np = targets_df.to_numpy()

# Create DataLoader instances
batch_size = 32
train_dataset = CricketDataset(sequences_np, targets_np)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [53]:
sequences_np.shape,targets_np.shape

((10835357, 12), (10835357, 2))

model


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim


In [56]:
class CricketLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(CricketLSTM, self).__init__()

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer for total_run prediction
        self.fc_total_run = nn.Linear(hidden_size, output_size['total_run'])

        # Fully connected layer for isWicketDelivery prediction
        self.fc_is_wicket = nn.Linear(hidden_size, output_size['isWicketDelivery'])

    def forward(self, x):
        # Forward pass through LSTM layer
        out, _ = self.lstm(x)

        # Only take the output from the final time step
        out = out[:, -1, :]

        # Predict total_run
        total_run_pred = self.fc_total_run(out)

        # Predict isWicketDelivery
        is_wicket_pred = self.fc_is_wicket(out)
        print("total_run_pred Shape:", total_run_pred.shape)
        print("is_wicket_pred Shape:", is_wicket_pred.shape)


        # Enforce that when isWicketDelivery is 1, total_run must be equal to zero
        total_run_pred = total_run_pred * (1 - is_wicket_pred)
        print("after total_run_pred Shape:", total_run_pred.shape)
        print("afdter is_wicket_pred Shape:", is_wicket_pred.shape)


        return total_run_pred, is_wicket_pred

In [46]:
class CricketLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(CricketLSTM, self).__init__()

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layers for total_run and isWicketDelivery
        self.fc_total_run = nn.Linear(hidden_size, 1)
        self.fc_is_wicket = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # Forward pass through LSTM layer
        out, _ = self.lstm(x)

        # Only take the output from the final time step
        out = out[:, -1, :]

        # Separate fully connected layers for total_run and isWicketDelivery
        out_total_run = self.fc_total_run(out)
        out_is_wicket = self.fc_is_wicket(out)

        # Concatenate the predictions
        predictions = torch.cat([out_total_run, out_is_wicket], dim=1)

        return predictions

In [57]:
input_size =  12  # Define input size based on your features
hidden_size =  60  # Define hidden size
output_size = {'total_run': 1, 'isWicketDelivery': 1}  # Define output size

# Instantiate the model
model = CricketLSTM(input_size, hidden_size, output_size)

# Print the model architecture
print(model)

CricketLSTM(
  (lstm): LSTM(12, 60, batch_first=True)
  (fc_total_run): Linear(in_features=60, out_features=1, bias=True)
  (fc_is_wicket): Linear(in_features=60, out_features=1, bias=True)
)


loss function


In [29]:
import torch.nn.functional as F

def custom_loss3(predictions, targets, match_ids):
    total_run_pred = predictions[:, 0]  # Assuming the first column is for total_run
    is_wicket_pred = predictions[:, 1]  # Assuming the second column is for isWicketDelivery

    total_run_actual = targets[:, 0]
    is_wicket_actual = targets[:, 1]

    # Mean Squared Error (MSE) loss for total_run
    total_run_loss = F.mse_loss(total_run_pred, total_run_actual)

    # Binary Cross Entropy loss for isWicketDelivery
    is_wicket_loss = F.binary_cross_entropy_with_logits(is_wicket_pred, is_wicket_actual)

    # Regularization term
    regularization_term = torch.abs(total_run_pred.sum(dim=0) - total_run_actual.sum(dim=0)).mean()

    # Combined loss
    combined_loss = total_run_loss + is_wicket_loss + regularization_term

    return combined_loss


In [48]:
def custom_loss2(predictions, targets):
    total_run_pred = predictions[:, 0]  # Assuming the first column is for total_run
    is_wicket_pred = predictions[:, 1]  # Assuming the second column is for isWicketDelivery

    total_run_actual = targets[:, 0]
    is_wicket_actual = targets[:, 1]

    # Mean Squared Error (MSE) loss for total_run
    total_run_loss = F.mse_loss(total_run_pred, total_run_actual)

    # Binary Cross Entropy loss for isWicketDelivery
    is_wicket_loss = F.binary_cross_entropy_with_logits(is_wicket_pred, is_wicket_actual)

    # Regularization term
    regularization_term = torch.abs(total_run_pred.sum() - total_run_actual.sum())

    # Combined loss
    combined_loss = total_run_loss + is_wicket_loss + regularization_term

    return combined_loss


In [26]:
import torch.nn.functional as F

def custom_loss(predictions, targets, match_ids, innings):
    total_run_pred = predictions[:, 0]  # Assuming the first column is for total_run
    is_wicket_pred = predictions[:, 1]  # Assuming the second column is for isWicketDelivery

    total_run_actual = targets[:, 0]
    is_wicket_actual = targets[:, 1]

    # Mean Squared Error (MSE) loss for total_run
    total_run_loss = F.mse_loss(total_run_pred, total_run_actual)

    # Binary Cross Entropy loss for isWicketDelivery
    is_wicket_loss = F.binary_cross_entropy_with_logits(is_wicket_pred, is_wicket_actual)

    # Regularization term for each inning within a match
    regularization_term = 0.0
    unique_matches = match_ids.unique()
    
    for match_id in unique_matches:
        match_indices = (match_ids == match_id).nonzero().view(-1)
        innings_in_match = innings[match_indices]
        total_run_pred_match = total_run_pred[match_indices]
        
        # Calculate the regularization term for each inning
        for inning in innings_in_match.unique():
            inning_indices = (innings_in_match == inning).nonzero().view(-1)
            regularization_term += torch.abs(total_run_pred_match[inning_indices].sum(dim=0) - total_run_actual[match_indices][inning_indices].sum(dim=0)).mean()

    # Normalize the regularization term by the number of innings
    regularization_term /= len(unique_matches)

    # Combined loss
    combined_loss = total_run_loss + is_wicket_loss + regularization_term

    return combined_loss


optimizer


In [49]:
import torch.optim as optim
learning_rate = 0.001  # Adjust this value as needed

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

training loop


In [None]:
for batch_sequences, batch_targets in train_loader:
    print("Batch Sequences Shape:", batch_sequences.shape)
    print("Batch Targets Shape:", batch_targets.shape)



In [58]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0.0

    for batch_sequences, batch_targets in train_loader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        predictions = model(batch_sequences)

        # Calculate the loss
        loss = custom_loss2(predictions, batch_targets)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    # Print the average loss for the epoch
    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}')

IndexError: too many indices for tensor of dimension 2