In [1]:
import re
import pandas as pd

In [4]:
# Sample of raw commentary text, one field per line. The string contains two
# delivery entries (over.ball, result, "<bowler> bowling to <batsman>",
# commentary) with an "OVER 19" summary block and blank lines in between.
new_data="""19.1
1 NB
Wayne Parnell bowling to Shubman Gill
Full toss ball, wide outside off stump, Shubman Gill offers no shot, No-Ball
OVER 19

11 WD61200

190/4

18.6
0
Harshal Patel bowling to Rahul Tewatia

ANOTHER DOT BALL! full length ball, pitching outside off stump, Rahul Tewatia offers no shot
"""

In [5]:
# to clean and process commentary data
def process_cricket_data(data):
    """Parse raw ball-by-ball commentary text into a DataFrame.

    The text is expected to hold one field per line. Each delivery starts with
    an ``<over>.<ball>`` marker (e.g. ``18.6``) followed by the result, the
    "<bowler> bowling to <batsman>" line and the commentary. Blank lines and
    "OVER <n>" summary blocks between deliveries are discarded.

    Parameters
    ----------
    data : str
        Raw commentary text.

    Returns
    -------
    pandas.DataFrame
        One row per parsed entry with the string columns ``Over``, ``Result``,
        ``Bowler_Batsman`` and ``Commentary``. Entries with fewer than four
        fields are padded with empty strings. Input without any entry yields
        an empty DataFrame that still has the four columns.

    Notes
    -----
    A new entry is only recognised when the ``<over>.<ball>`` marker follows a
    line that does not end in a digit (see the look-behind below).
    """
    columns = ["Over", "Result", "Bowler_Batsman", "Commentary"]

    # Drop each end-of-over summary: everything from "OVER <n>" up to the next
    # "<over>.<ball>" marker is removed, keeping only the marker itself.
    pattern_to_remove = r'OVER \d+.*?(\d+\.\d)'
    cleaned_data = re.sub(pattern_to_remove, r'\1', data, flags=re.DOTALL)

    # Every original line becomes one comma-separated field.
    data_with_commas = cleaned_data.replace('\n', ',')

    # Start a new row wherever a field separator is followed by an
    # "<over>.<ball>" marker and preceded by a non-digit character.
    pattern_to_start_newline = r'(?<=\D),(\d+\.\d)'
    data_with_newlines = re.sub(pattern_to_start_newline, r'\n\1', data_with_commas)

    # Blank input lines show up as runs of commas; collapse a run of ANY
    # length in one pass so no empty field (or stray comma) survives.
    data_single_commas = re.sub(r',{2,}', ',', data_with_newlines)

    data_for_df = []
    for line in data_single_commas.strip().split('\n'):
        # Separators left at the edges (from leading/trailing newlines or a
        # blank line right before the next entry) are not part of any field.
        line = line.strip(',')
        if not line:
            continue
        # Split only on the first three commas: the commentary itself
        # contains commas and must stay in one piece.
        parts = line.split(',', 3)
        parts.extend([''] * (len(columns) - len(parts)))
        data_for_df.append(parts)

    return pd.DataFrame(data_for_df, columns=columns)



# Parse the sample commentary in `new_data` into a DataFrame
# (columns: Over, Result, Bowler_Batsman, Commentary)
df = process_cricket_data(new_data)

def remove_up_to_exclamation(s):
    """Return the part of ``s`` that follows its first '!'.

    A string without any '!' is returned unchanged. The text after the '!'
    is returned as is (no whitespace trimming).
    """
    # Nothing to cut when there is no exclamation mark
    if '!' not in s:
        return s
    _, _, remainder = s.partition('!')
    return remainder

# Drop everything up to and including the first '!' in each 'Bowler_Batsman' value
df['Bowler_Batsman'] = df['Bowler_Batsman'].apply(remove_up_to_exclamation)

# Save the DataFrame to a CSV file (this does not display it; with the
# default settings the row index is written as the first, unnamed column)
df.to_csv('match1_CSK_1.csv')


In [6]:
df

Unnamed: 0,Over,Result,Bowler_Batsman,Commentary
0,19.1,1 NB,Wayne Parnell bowling to Shubman Gill,"Full toss ball, wide outside off stump, Shubma..."
1,18.6,0,Harshal Patel bowling to Rahul Tewatia,"ANOTHER DOT BALL! full length ball, pitching o..."


In [7]:
# Split "<bowler> bowling to <batsman>" into separate 'Bowler' and 'Batsman' columns.
# NOTE(review): the assignment only works when the split expands to exactly two
# columns, i.e. at least one row contains ' bowling to ' and no row contains it
# more than once - confirm this holds for the full data set.
df[['Bowler', 'Batsman']] = df['Bowler_Batsman'].str.split(' bowling to ', expand=True)

In [8]:
df

Unnamed: 0,Over,Result,Bowler_Batsman,Commentary,Bowler,Batsman
0,19.1,1 NB,Wayne Parnell bowling to Shubman Gill,"Full toss ball, wide outside off stump, Shubma...",Wayne Parnell,Shubman Gill
1,18.6,0,Harshal Patel bowling to Rahul Tewatia,"ANOTHER DOT BALL! full length ball, pitching o...",Harshal Patel,Rahul Tewatia


In [9]:
# Function to categorize commentary into Length and Line
def categorize_commentary(commentary, length_keywords, line_keywords):
    """Pick the first length keyword and first line keyword found in a commentary.

    Matching is a case-insensitive substring test. Keywords are tried in the
    order given and the first hit wins, so more specific phrases must come
    before the shorter phrases they contain (e.g. "short of good length"
    before "good length").

    Parameters
    ----------
    commentary : str or None
        Commentary text of one delivery. Any non-string value (``None``,
        ``NaN`` from a missing cell, ...) is treated as "no commentary".
    length_keywords : iterable of str
        Candidate length labels, in priority order.
    line_keywords : iterable of str
        Candidate line labels, in priority order.

    Returns
    -------
    tuple
        ``(length_category, line_category)``; each element is the matching
        keyword exactly as it was passed in, or ``None`` when nothing matched.
    """
    # Missing values cannot be searched; report "no category" instead of
    # failing on the missing .lower() method.
    if not isinstance(commentary, str):
        return None, None

    # Lowercase once instead of once per keyword.
    text = commentary.lower()

    def first_match(keywords):
        # The keyword is lowercased for the comparison only; the original
        # spelling is what gets returned.
        return next((keyword for keyword in keywords if keyword.lower() in text), None)

    return first_match(length_keywords), first_match(line_keywords)

# Function to add Length and Line columns to the DataFrame
def add_length_line_columns(df):
    """Add 'Length' and 'Line' columns derived from the 'Commentary' column.

    Each commentary is passed to ``categorize_commentary`` together with the
    keyword lists defined below. The first element of the returned pair goes
    into 'Length', the second into 'Line'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Commentary' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added (or overwritten).
        A DataFrame without rows gets two empty columns.
    """
    # Keyword order is the match priority: the first keyword found wins.
    length_keywords = [
        "short of good length", "short length ball", "good length", "yorker",
        "bouncer", "bumper", "in the slot", "short ball", "full length", "full toss"
    ]
    line_keywords = [
        "pitching outside off stump", "pitching on off stump",
        "pitching on leg stump", "pitching on middle stump",
        "wide outside off stump", "pitching outside leg stump"
    ]

    # Collect the (length, line) pairs first and build each column from them.
    # Unpacking zip(*pairs) directly would fail when there are no rows.
    categories = [
        categorize_commentary(commentary, length_keywords, line_keywords)
        for commentary in df['Commentary']
    ]
    df['Length'] = [length for length, _ in categories]
    df['Line'] = [line for _, line in categories]

    return df

# Add the 'Length' and 'Line' columns to df (the function returns the frame it was given)
df = add_length_line_columns(df)


In [None]:
df

Unnamed: 0,Over,Result,Bowler_Batsman,Commentary,Bowler,Batsman,Length,Line
0,19.1,1 NB,Wayne Parnell bowling to Shubman Gill,"Full toss ball, wide outside off stump, Shubma...",Wayne Parnell,Shubman Gill,full toss,wide outside off stump
1,18.6,0,Harshal Patel bowling to Rahul Tewatia,"ANOTHER DOT BALL! full length ball, pitching o...",Harshal Patel,Rahul Tewatia,full length,pitching outside off stump


In [10]:
# Next, I merged equivalent length and line labels into a single category each,
# e.g. "fuller length" -> "full length" and
# "pitching outside off stump" -> "outside off stump",
# using the `str.replace` method.