In [1]:
import pandas as pd

# Load the ball-by-ball dataset (replace 't.csv' with the path to your file)
df = pd.read_csv('t.csv')

# Add a match_id only when the data does not already provide one; a file
# without the column is treated as describing a single match.
if 'match_id' not in df.columns:
    df['match_id'] = 1  # Placeholder: replace with the actual match_id if available

# Order deliveries by match and ball number, then renumber the index so it
# follows the sorted order.
df = df.sort_values(by=['match_id', 'Ball Number']).reset_index(drop=True)

In [2]:
# Parse runs as numbers; anything non-numeric (e.g. N/A on a wide) becomes NaN
runs_numeric = pd.to_numeric(df['Runs'], errors='coerce')

# Flag extras BEFORE filling the gaps: once NaN has been replaced by 0 the
# flag can no longer be derived from the column.
df['is_extra'] = runs_numeric.isna()

# Batsman's runs: an extra contributes 0 to the batsman
df['Runs'] = runs_numeric.fillna(0)

# Runs counted towards the team total:
#   - every extra counts as 1 run
#   - negative values (the notebook uses Runs == -1 as its wicket marker)
#     contribute 0 instead of lowering the total
df['runs_for_total'] = df['Runs'].clip(lower=0).where(~df['is_extra'], 1)

# Running team total within each match
df['total_runs'] = df.groupby('match_id')['runs_for_total'].cumsum()

In [3]:
# Split the "over.ball" value: integer part -> over, first decimal -> ball
ball_number = df['Ball Number']
df['over'] = ball_number.astype('int64')  # truncates, exactly like int()
df['ball_in_over'] = ((ball_number - df['over']) * 10).round().astype('int64')

# Running count of rows (deliveries) within each match, starting at 1
df['balls_bowled'] = df.groupby('match_id').cumcount().add(1)

# Overs as a true decimal: each ball is one sixth of an over
# (e.g. 0.3 in over.ball notation -> 3/6 = 0.5 overs)
df['overs_bowled'] = df['over'].add(df['ball_in_over'].div(6))

In [4]:
# Current run rate = runs scored so far / overs bowled so far.
# A row with exactly zero overs bowled would divide by zero and yield
# inf/NaN, so those rows are reported as 0 instead.
overs_so_far = df['overs_bowled']
df['current_run_rate'] = (df['total_runs'] / overs_so_far).mask(overs_so_far.eq(0), 0)

In [5]:
# Placeholder: match metadata is defined by hand, keyed by match_id
match_metadata = {
    1: {'total_overs': 50, 'target': 252},  # Example for match_id 1 (a 50-over innings)
    # Add more matches: e.g., 2: {'total_overs': 50, 'target': 300}
}

# Map metadata onto every row; a match_id missing from match_metadata falls back to 0
df['total_overs'] = df['match_id'].map(lambda x: match_metadata.get(x, {}).get('total_overs', 0))
df['target'] = df['match_id'].map(lambda x: match_metadata.get(x, {}).get('target', 0))

# Runs still needed and overs still available after each row
df['runs_needed'] = df['target'] - df['total_runs']
df['overs_remaining'] = df['total_overs'] - df['overs_bowled']

# Required run rate, computed column-wise instead of row by row.
# Rows where overs_remaining is not strictly positive (including NaN) get 0,
# which also covers the division-by-zero case.
overs_left = df['overs_remaining']
df['required_run_rate'] = (df['runs_needed'] / overs_left).where(overs_left > 0, 0)

In [6]:
# A row is flagged as a wicket when its Runs value equals -1
df['is_wicket'] = df['Runs'].eq(-1)

# Running count of flagged rows within each match
df['wickets_fallen'] = df['is_wicket'].groupby(df['match_id']).cumsum()

# Wickets still available, out of an assumed 10 per innings
df['wickets_in_hand'] = df['wickets_fallen'].rsub(10)

In [7]:
# Write the enriched frame to disk, leaving out the positional index column
df.to_csv(path_or_buf='enhanced_dataset.csv', index=False)

In [8]:
# Print the DataFrame as plain text to inspect the derived columns
print(df)

     Ball  Ball Number    Bowler       Batsman  Runs  \
0       1          0.1  Jamieson  Rohit Sharma   0.0   
1       2          0.2  Jamieson  Rohit Sharma   6.0   
2       3          0.3  Jamieson  Rohit Sharma   2.0   
3       4          0.4  Jamieson  Rohit Sharma   0.0   
4       5          0.5  Jamieson  Rohit Sharma   0.0   
..    ...          ...       ...           ...   ...   
295   296         48.2  O'Rourke         Rahul   1.0   
296   297         48.3  O'Rourke        Jadeja   0.0   
297   298         48.4  O'Rourke        Jadeja   1.0   
298   299         48.5  O'Rourke         Rahul   1.0   
299   300         48.6  O'Rourke        Jadeja   4.0   

                                           Description  match_id  is_extra  \
0    Away swing, low bounce, beats the outside edge...         1     False   
1    Pulled behind square. Old-school Rohit, standi...         1     False   
2    Touch too straight, clipped off the pads. Thro...         1     False   
3           Len