In [60]:
import pandas as pd
import os
# Notebook-wide pandas display settings: show every column, truncate cell
# text to 10 characters, and allow 1000 characters per printed line.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 10
pd.options.display.width = 1000


In [137]:

def load_data(path):
    """Read the CSV file at ``path`` into a DataFrame.

    A status line is printed for every outcome; failures are reported on
    stdout instead of being raised.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when ``path`` does not exist or the
        file could not be read.
    """
    loaded = None
    if os.path.exists(path):
        try:
            loaded = pd.read_csv(path)
            print("Data loaded successfully.")
        except Exception as e:
            # Any failure while reading/parsing is reported and mapped to None.
            print(f"Error loading data: {e}")
            loaded = None
    else:
        print(f"Error: The file at path {path} does not exist.")
    return loaded

# Example usage: the path is relative, so it is resolved against the current
# working directory.
# Alternative location under data/raw, kept for reference:
#file_path = os.path.join('data', 'raw', 'all_season_details.csv')
# NOTE(review): load_data returns None when the file is missing or cannot be
# read; in that case the .head() call below raises AttributeError.
df = load_data("all_season_details.csv")
# Preview the first two rows.
df.head(2)



Data loaded successfully.


Unnamed: 0,comment_id,season,match_id,match_name,home_team,away_team,current_innings,innings_id,over,ball,runs,isBoundary,isWide,isNoball,batsman1_name,batsman_runs,batsman_balls,bowler_name,bowler_overs,bowler_maidens,bowler_runs,bowler_wkts,wicket_id,wkt_batsman_name,wkt_bowler_name,wkt_batsman_runs,wkt_batsman_balls,wkt_text,isRetiredHurt,text,preText,postText
0,110,2023,1359475,GT v CSK,GT,CSK,CSK,1,1,1,0,False,False,False,Devon ...,0,1,Mohamm...,0.1,0,0,0,,,,,,,False,nice a...,<p><st...,
1,120,2023,1359475,GT v CSK,GT,CSK,CSK,1,1,2,1,False,False,False,Devon ...,0,2,Mohamm...,0.2,0,0,0,,,,,,,False,Conway...,,


In [62]:
# Show the column labels of the loaded frame.
df.columns

Index(['comment_id', 'season', 'match_id', 'match_name', 'home_team', 'away_team', 'current_innings', 'innings_id', 'over', 'ball', 'runs', 'isBoundary', 'isWide', 'isNoball', 'batsman1_name', 'batsman_runs', 'batsman_balls', 'bowler_name', 'bowler_overs', 'bowler_maidens', 'bowler_runs', 'bowler_wkts', 'wicket_id', 'wkt_batsman_name', 'wkt_bowler_name', 'wkt_batsman_runs', 'wkt_batsman_balls', 'wkt_text', 'isRetiredHurt', 'text', 'preText', 'postText'], dtype='object')

In [82]:
# Distinct values present in the isBoundary column.
df.isBoundary.unique()

array([False,  True])

In [64]:
def clean_data(df):
    """Drop ``comment_id`` and rewrite old team codes in the team columns.

    The substrings ``PWI``, ``GL`` and ``KXIP`` are replaced (in that order)
    by ``PBKS``, ``GT`` and ``PBKS`` in ``home_team``, ``away_team`` and
    ``current_innings``. All other columns are passed through unchanged and
    the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``comment_id`` and the three string-valued team columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``comment_id`` and with the team codes rewritten.

    Raises
    ------
    KeyError
        If ``comment_id`` or one of the team columns is missing.
    """
    # Single source of truth for the renames; applied in insertion order.
    team_code_renames = {"PWI": "PBKS", "GL": "GT", "KXIP": "PBKS"}
    team_columns = ("home_team", "away_team", "current_innings")

    def normalise(codes):
        # regex=False: the codes are plain text, so match them literally
        # regardless of the pandas version's default for ``regex``.
        for old_code, new_code in team_code_renames.items():
            codes = codes.str.replace(old_code, new_code, regex=False)
        return codes

    trimmed = df.drop(columns="comment_id")
    return trimmed.assign(
        **{name: normalise(trimmed[name]) for name in team_columns}
    )

In [147]:
def feature_engineering(df):
    """Build per-innings score features from row-per-delivery data.

    Rows are grouped by ``(match_id, innings_id)`` and processed in the order
    they appear in ``df``. The rolling results are aligned back to ``df`` by
    index label, so ``df`` is expected to have a unique index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``match_id``, ``innings_id``, ``runs``, ``wicket_id``
        and ``current_innings``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with exactly these columns:

        * ``total_score`` - sum of ``runs`` over the row's whole innings group.
        * ``batting_team`` - the ``current_innings`` column, renamed.
        * ``rolling_back_30balls_runs`` - sum of ``runs`` over the current row
          and up to 29 preceding rows of the same innings group.
        * ``rolling_back_30balls_wkts`` - number of non-null ``wicket_id``
          values over that same window.
    """
    # Group once and reuse the grouping for every feature.
    innings = df.groupby(['match_id', 'innings_id'])

    # min_periods=1 yields a value from the first row of each innings on.
    # Dropping the two group levels restores the original row labels so the
    # results line up with ``df`` inside ``assign``.
    runs_last_30 = (
        innings['runs']
        .rolling(window=30, min_periods=1)
        .sum()
        .reset_index(level=[0, 1], drop=True)
    )
    wkts_last_30 = (
        innings['wicket_id']
        .rolling(window=30, min_periods=1)
        .count()
        .reset_index(level=[0, 1], drop=True)
    )

    # Only the columns that are actually returned are computed; batsman
    # totals and a cumulative run count are not part of the result.
    return (
        df.assign(
            total_score=innings['runs'].transform('sum'),
            rolling_back_30balls_runs=runs_last_30,
            rolling_back_30balls_wkts=wkts_last_30,
        )
        .rename(columns={'current_innings': 'batting_team'})
        [['total_score', 'batting_team', 'rolling_back_30balls_runs', 'rolling_back_30balls_wkts']]
    )

In [148]:
# Run the pipeline: clean the loaded frame, then derive the feature columns.
df_clean=clean_data(df)
final_df=feature_engineering(df_clean)

# Preview the first 30 rows of the feature frame.
final_df.head(30)

Unnamed: 0,total_score,batting_team,rolling_back_30balls_runs,rolling_back_30balls_wkts
0,178,CSK,0.0,0.0
1,178,CSK,1.0,0.0
2,178,CSK,1.0,0.0
3,178,CSK,2.0,0.0
4,178,CSK,2.0,0.0
5,178,CSK,2.0,0.0
6,178,CSK,6.0,0.0
7,178,CSK,6.0,0.0
8,178,CSK,10.0,0.0
9,178,CSK,11.0,0.0


In [None]:
# Count missing values per column of the feature frame.
# NOTE(review): the output saved with this cell lists columns (season,
# match_id, ...) that feature_engineering's four-column selection does not
# return, and the cell has no execution count - re-run it on a fresh kernel.
final_df.isnull().sum()

season                      0
match_id                    0
match_name                  0
home_team                   0
away_team                   0
current_innings             0
innings_id                  0
over                        0
ball                        0
runs                        0
isBoundary                  0
isWide                      0
isNoball                    0
batsman1_name               0
batsman_runs                0
batsman_balls               0
bowler_name                 0
bowler_overs                0
bowler_maidens              0
bowler_runs                 0
bowler_wkts                 0
wicket_id              230208
wkt_batsman_name       230208
wkt_bowler_name        230208
wkt_batsman_runs       230208
wkt_batsman_balls      230208
wkt_text               230208
isRetiredHurt               0
text                     1372
preText                195926
postText               222410
total_score                 0
batsman_total_runs          0
batsman_to