<a href="https://colab.research.google.com/github/lucrj/datascience/blob/main/iplscore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Load the dataset
# The absolute path points at Colab's /content directory; change it if the
# notebook is run somewhere else.
df = pd.read_csv('/content/data.csv')

# --- Data Cleaning ---

# Remove unwanted columns that are not relevant for score prediction.
# drop() returns a new frame, so the result is reassigned rather than mutated in place.
df = df.drop(columns=['mid', 'batsman', 'bowler', 'striker', 'non-striker'])

# For consistency, we'll keep only the teams that are currently playing or have played for a long time
consistent_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']
df = df[df['batting_team'].isin(consistent_teams) & df['bowling_team'].isin(consistent_teams)]

# Remove the first 5 overs of every match, as score prediction is more reliable after a few overs
df = df[df['overs'] >= 5.0]

# --- Data Preprocessing ---

# Convert the 'date' column to datetime objects for possible time-based analysis later.
# `df` is the result of boolean-mask filtering at this point, so writing
# `df['date'] = ...` would trigger pandas' SettingWithCopyWarning; assign()
# builds a new frame (keeping the column position) and avoids that.
df = df.assign(date=pd.to_datetime(df['date']))

# Convert categorical features into numerical format using one-hot encoding.
# Each listed column is replaced by one indicator column per distinct value.
encoded_df = pd.get_dummies(data=df, columns=['batting_team', 'bowling_team', 'venue'])

# Split the data into features (X) and the target variable (y).
# 'date' is not used by the current model, so it is dropped together with the target.
X = encoded_df.drop(columns=['total', 'date'])
y = encoded_df['total']

# Split the data into training and testing sets to evaluate the model's performance.
# NOTE(review): this is a row-level random split. If `total` is identical for
# every row of an innings (presumably the final innings score — confirm), rows
# from the same match end up in both sets and the reported metrics may be
# optimistic. Consider splitting by match or by date instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Model Building ---

# Build an ordinary least-squares regressor and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# --- Model Evaluation ---

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Compare predictions against the true targets of the test split
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line
print(f"Mean Absolute Error: {mae}", f"R-squared: {r2}", sep="\n")

# --- Create a sample prediction function ---

def predict_score(batting_team, bowling_team, venue, overs, runs, wickets, runs_last_5, wickets_last_5, model, X_train_columns):
    """
    Predict the final score of a team based on the current match situation.

    A single-row DataFrame is built from the arguments, the three categorical
    fields are one-hot encoded, and the result is aligned to
    ``X_train_columns`` before being handed to ``model.predict``.

    Parameters
    ----------
    batting_team, bowling_team, venue
        Categorical values. Each is encoded into a column named
        ``<field>_<value>`` (for example ``venue_<venue>``).
    overs, runs, wickets, runs_last_5, wickets_last_5
        Values passed through unchanged as feature columns of the same name.
    model
        Object exposing ``predict(DataFrame)`` that returns an indexable result.
    X_train_columns
        Column labels the model was trained on, in training order.

    Returns
    -------
    The first element of ``model.predict(...)`` for the single input row.

    Warns
    -----
    UserWarning
        If any encoded input column is absent from ``X_train_columns``
        (for instance a team or venue that was never seen in training).
        Such columns are dropped during alignment, so the prediction would
        otherwise silently ignore that input.
    """
    # Single-row frame describing the current match situation
    input_df = pd.DataFrame({
        'batting_team': [batting_team],
        'bowling_team': [bowling_team],
        'venue': [venue],
        'overs': [overs],
        'runs': [runs],
        'wickets': [wickets],
        'runs_last_5': [runs_last_5],
        'wickets_last_5': [wickets_last_5]
    })

    # One-hot encode the categorical features
    input_encoded = pd.get_dummies(input_df, columns=['batting_team', 'bowling_team', 'venue'])

    # reindex() below discards every column that is not a training column.
    # Surface that instead of letting an unknown/misspelled category vanish.
    known_columns = set(X_train_columns)
    ignored = [col for col in input_encoded.columns if col not in known_columns]
    if ignored:
        warnings.warn(
            f"Input features not present in the training columns are ignored: {ignored}",
            UserWarning,
            stacklevel=2,
        )

    # Align with the training layout: same columns, same order, 0 for every
    # indicator column this input row does not set.
    input_aligned = input_encoded.reindex(columns=X_train_columns, fill_value=0)

    # Predict the score
    predicted_score = model.predict(input_aligned)

    return predicted_score[0]

# Example usage of the prediction function.
# The match situation is collected in one mapping and unpacked into the call.
example_state = {
    'batting_team': 'Mumbai Indians',
    'bowling_team': 'Kolkata Knight Riders',
    'venue': 'M Chinnaswamy Stadium',
    'overs': 10.2,
    'runs': 80,
    'wickets': 2,
    'runs_last_5': 35,
    'wickets_last_5': 1,
}

# The fitted model and the training column layout come from the cells above
predicted_total = predict_score(**example_state, model=model, X_train_columns=X_train.columns)

print(f"Predicted Score: {predicted_total}")

Mean Absolute Error: 12.296815622921317
R-squared: 0.7307869817777806
Predicted Score: 168.52876230525035
