In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
file_path = 'deliveries.csv'  
data = pd.read_csv(file_path)

In [4]:
print(data.head())

   match_id  inning           batting_team                 bowling_team  over  \
0    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
1    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
2    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
3    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
4    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   

   ball       batter   bowler  non_striker  batsman_runs  extra_runs  \
0     1   SC Ganguly  P Kumar  BB McCullum             0           1   
1     2  BB McCullum  P Kumar   SC Ganguly             0           0   
2     3  BB McCullum  P Kumar   SC Ganguly             0           1   
3     4  BB McCullum  P Kumar   SC Ganguly             0           0   
4     5  BB McCullum  P Kumar   SC Ganguly             0           0   

   total_runs extras_type  is_wicket player_dismissed dismissal_kind fielder  
0

In [5]:
print(data.isnull().sum())

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batter                   0
bowler                   0
non_striker              0
batsman_runs             0
extra_runs               0
total_runs               0
extras_type         246795
is_wicket                0
player_dismissed    247970
dismissal_kind      247970
fielder             251566
dtype: int64


In [6]:
data['dismissal_kind'].fillna('Not Out', inplace=True)

In [7]:
le = LabelEncoder()
data['batting_team'] = le.fit_transform(data['batting_team'])
data['bowling_team'] = le.fit_transform(data['bowling_team'])
data['batter'] = le.fit_transform(data['batter'])
data['bowler'] = le.fit_transform(data['bowler'])
data['non_striker'] = le.fit_transform(data['non_striker'])
data['extras_type'] = le.fit_transform(data['extras_type'].astype(str))  # Convert to string to handle NaNs
data['dismissal_kind'] = le.fit_transform(data['dismissal_kind'])
data['fielder'] = le.fit_transform(data['fielder'].astype(str))  # Convert to string to handle NaNs

In [8]:
data['cumulative_runs'] = data.groupby(['match_id', 'inning', 'batting_team'])['batsman_runs'].cumsum()

In [9]:
data['runs_per_over'] = data.groupby(['match_id', 'inning', 'batting_team'])['total_runs'].cumsum() / data['over']

In [10]:
data['strike_rate'] = data['batsman_runs'].cumsum() / data['ball']

In [13]:
data = data.drop(columns=['fielder', 'player_dismissed', 'extras_type'])

KeyError: "['fielder', 'player_dismissed', 'extras_type'] not found in axis"

In [14]:
# Drop unnecessary columns (only if they exist)
# Modify or remove the columns based on your actual dataset
columns_to_drop = ['fielder', 'player_dismissed', 'extras_type']

# Check which columns are actually present in the DataFrame
existing_columns_to_drop = [col for col in columns_to_drop if col in data.columns]

# Drop the columns that exist
data = data.drop(columns=existing_columns_to_drop)

# Continue with the rest of your code


In [15]:
# Define features (X) and target (y)
X = data[['inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batter', 'bowler', 'non_striker', 'runs_per_over', 'strike_rate']]
y = data['cumulative_runs']  # The target is cumulative runs

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Model training using Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

ValueError: Input X contains infinity or a value too large for dtype('float32').

In [18]:
import numpy as np
import pandas as pd

# Check for NaN values in X_train
print("Number of NaN values in X_train:", np.isnan(X_train).sum())

# Check for infinite values in X_train
print("Number of infinite values in X_train:", np.isinf(X_train).sum())

# Replace infinite values with NaN
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# Option 1: Remove rows with NaN values (simple but may lose data)
X_train.dropna(inplace=True)
y_train = y_train[X_train.index]  # Keep the corresponding target values

# Option 2: Replace NaN values with the mean of the column (preserves data)
X_train.fillna(X_train.mean(), inplace=True)

# After handling missing values, try training the model again
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


Number of NaN values in X_train: inning              0
batting_team        0
bowling_team        0
over                0
ball                0
batter              0
bowler              0
non_striker         0
runs_per_over    2170
strike_rate         0
dtype: int64
Number of infinite values in X_train: inning              0
batting_team        0
bowling_team        0
over                0
ball                0
batter              0
bowler              0
non_striker         0
runs_per_over    8937
strike_rate         0
dtype: int64


In [19]:
y_pred = model.predict(X_test)

ValueError: Input X contains infinity or a value too large for dtype('float32').

In [None]:
mae = mean_absolute_error(y_test, y_pred)

In [None]:
print(f"Mean Absolute Error: {mae}")

In [None]:
# Plot actual vs predicted values
plt.figure(figsize=(10,6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.xlabel('Actual Cumulative Runs')
plt.ylabel('Predicted Cumulative Runs')
plt.title('Actual vs Predicted Cumulative Runs')
plt.show()

In [None]:
# Predict the score for a new match
# Example: Input data for a new match (change values according to your scenario)
new_match_data = {
    'inning': 1,
    'batting_team': le.transform(['Mumbai Indians'])[0],  # Replace with actual team
    'bowling_team': le.transform(['Chennai Super Kings'])[0],  # Replace with actual team
    'over': 15,
    'ball': 3,
    'batter': le.transform(['Virat Kohli'])[0],  # Replace with actual batter
    'bowler': le.transform(['Jasprit Bumrah'])[0],  # Replace with actual bowler
    'non_striker': le.transform(['AB de Villiers'])[0],  # Replace with actual non-striker
    'runs_per_over': 7.5,  # Replace with actual runs per over
    'strike_rate': 130.5  # Replace with actual strike rate
}

In [None]:
new_match_df = pd.DataFrame([new_match_data])

In [None]:
predicted_score = model.predict(new_match_df)
print(f"Predicted Cumulative Runs: {predicted_score[0]}")

In [2]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'score_predictor_model.pkl')

NameError: name 'model' is not defined

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib

# Load your dataset
data = pd.read_csv('your_dataset.csv')  # Update with your actual dataset

# Prepare your features and target variable
X = data.drop('target_column', axis=1)  # Replace 'target_column' with your actual target column
y = data['target_column']  # Replace 'target_column' with your actual target column

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train your model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model to a file
joblib.dump(model, 'score_predictor_model.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

In [1]:
import joblib

# Assume `model` is your trained model variable
joblib.dump(model, 'score_predictor_model.pkl')

NameError: name 'model' is not defined

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import joblib

# Load example data
data = fetch_california_housing()
X = data.data  # Features
y = data.target  # Target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'score_predictor_model.pkl')

print("Model saved as 'score_predictor_model.pkl'")


Model saved as 'score_predictor_model.pkl'
