## Importing the Libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import os

## Loading the Data

In [3]:
folder_path = './data/WorldCup_Stats'

# Collect the names of all CSV files in the data folder.
# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the row order of the combined DataFrame reproducible across machines and re-runs.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

In [4]:
# Read every CSV listed in csv_files and stack the frames into a single
# DataFrame (crick_df) with a fresh 0..n-1 index
csv_paths = [os.path.join(folder_path, csv_name) for csv_name in csv_files]
crick_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in csv_paths],
    ignore_index=True,
)

In [5]:
# crick_df.to_csv('crick_df.csv')

## Exploration and Cleaning the crick_df

In [6]:
# Count fully duplicated rows (0 means every row is unique)

n_duplicate_rows = crick_df.duplicated().sum()
print(n_duplicate_rows)

0


In [7]:
# Remove 'Unnamed: 0', 'Unnamed: 0.1', 'date' and 'commentary_line' from the dataset.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.

unused_columns = ['Unnamed: 0', 'Unnamed: 0.1', 'date', 'commentary_line']
crick_df = crick_df.drop(columns=unused_columns, errors='ignore')

In [8]:
# Show the dtype and non-null count of every remaining column
crick_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   venue           528 non-null    object 
 1   match_category  528 non-null    object 
 2   team_1          528 non-null    object 
 3   team_2          528 non-null    object 
 4   team_1_runs     518 non-null    float64
 5   team_1_wickets  518 non-null    float64
 6   team_2_runs     513 non-null    float64
 7   team_2_wickets  513 non-null    float64
 8   result          528 non-null    object 
 9   pom             510 non-null    object 
 10  best_batters    250 non-null    object 
 11  best_bowlers    250 non-null    object 
 12  world_cup_year  528 non-null    int64  
 13  host_country    528 non-null    object 
dtypes: float64(4), int64(1), object(9)
memory usage: 57.9+ KB


## Handling Missing Values

Let's consider the `team_1_runs` column.

In [9]:
# Select the rows where team_1_runs is missing and preview the first few
missing_team_1_runs = crick_df['team_1_runs'].isna()
null_recs = crick_df.loc[missing_team_1_runs]
null_recs.head()

Unnamed: 0,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,world_cup_year,host_country
11,The Oval,League-Match,SL,WI,,,,,Match abandoned without a ball bowled,,,,1979,England
24,Colombo (RPS),League-Match,SL,WI,,,,,Sri Lanka won (walkover without a ball bowled),,,,1996,Sri Lanka
37,Colombo (RPS),League-Match,SL,AUS,,,,,Sri Lanka won (walkover without a ball bowled),,,,1996,Sri Lanka
66,Bristol,League-Match,Pakistan,Sri Lanka,,,,,Match abandoned without a ball bowled,,,,2019,England
68,Nottingham,League-Match,India,New Zealand,,,,,Match abandoned without a ball bowled,,,,2019,England


Considering null records in `crick_df['team_1_runs']`,


*   In all of these records, the columns `team_1_runs`, `team_2_runs`, `team_1_wickets`, and `team_2_wickets` are missing.

*   However, the result column indicates that some of these matches were marked as **"Match abandoned."**

Therefore, we cannot remove these records immediately, as we plan to add a new column in the future to indicate the final match status (e.g., whether the match was played or abandoned).


In [10]:
# Set the score columns to 0 for matches whose result says they were abandoned without a ball bowled

# Boolean mask of those matches (case-insensitive; a missing result counts as False)
abandoned_matches = crick_df['result'].str.contains(
    'Match abandoned without a ball bowled', case=False, na=False
)

# Write 0 into the runs and wickets of both teams on the masked rows
score_columns = ['team_1_runs', 'team_2_runs', 'team_1_wickets', 'team_2_wickets']
crick_df.loc[abandoned_matches, score_columns] = 0


In [11]:
# Verify changes: preview the rows whose result says "Match abandoned without a ball bowled"
is_abandoned = crick_df['result'].str.contains(
    'Match abandoned without a ball bowled', case=False, na=False
)
crick_df.loc[is_abandoned].head()

Unnamed: 0,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,world_cup_year,host_country
11,The Oval,League-Match,SL,WI,0.0,0.0,0.0,0.0,Match abandoned without a ball bowled,,,,1979,England
66,Bristol,League-Match,Pakistan,Sri Lanka,0.0,0.0,0.0,0.0,Match abandoned without a ball bowled,,,,2019,England
68,Nottingham,League-Match,India,New Zealand,0.0,0.0,0.0,0.0,Match abandoned without a ball bowled,,,,2019,England
93,Bristol,League-Match,Bangladesh,Sri Lanka,0.0,0.0,0.0,0.0,Match abandoned without a ball bowled,,,,2019,England
440,Brisbane,League-Match,Australia,Bangladesh,0.0,0.0,0.0,0.0,Match abandoned without a ball bowled,,,,2015,Australia


Let's consider the `team_2_runs` column.

In [12]:
# Select the rows where team_2_runs is missing and preview the first few
missing_team_2_runs = crick_df['team_2_runs'].isna()
null_recs = crick_df.loc[missing_team_2_runs]
null_recs.head()

Unnamed: 0,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,world_cup_year,host_country
24,Colombo (RPS),League-Match,SL,WI,,,,,Sri Lanka won (walkover without a ball bowled),,,,1996,Sri Lanka
28,Patna,League-Match,ZIM,KENYA,45.0,3.0,,,No result,,,,1996,India
37,Colombo (RPS),League-Match,SL,AUS,,,,,Sri Lanka won (walkover without a ball bowled),,,,1996,Sri Lanka
103,Southampton,League-Match,SA,WI,29.0,2.0,,,No result,,"['Q de Kock - 17 runs ', 'HM Amla - 6 runs ']","['SS Cottrell - 2', 'O Thomas - 0']",2019,England
323,Colombo (RPS),League-Match,SL,AUS,146.0,3.0,,,No result,,"['KC Sangakkara - 73 runs ', 'TT Samaraweera -...","['SW Tait - 1', 'B Lee - 1']",2011,Sri Lanka


Considering the null records in `crick_df['team_2_runs']` and ignoring the abandoned matches,

*   Some of the records in the columns `team_1_runs`, `team_2_runs`, `team_1_wickets`, and `team_2_wickets` are missing.

*   Some records do contain values for `team_1_runs` and `team_1_wickets`, but their result column says **No result**.

Therefore, we cannot draw any clear conclusion from these records, so we can remove them.


In [13]:
# Select rows with no team_2_runs value whose result does not mention "abandoned"

has_no_team_2_runs = crick_df['team_2_runs'].isna()
mentions_abandoned = crick_df['result'].str.contains('abandoned', case=False, na=False)
null_team_2_runs = crick_df.loc[has_no_team_2_runs & ~mentions_abandoned]

null_team_2_runs.head()

Unnamed: 0,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,world_cup_year,host_country
24,Colombo (RPS),League-Match,SL,WI,,,,,Sri Lanka won (walkover without a ball bowled),,,,1996,Sri Lanka
28,Patna,League-Match,ZIM,KENYA,45.0,3.0,,,No result,,,,1996,India
37,Colombo (RPS),League-Match,SL,AUS,,,,,Sri Lanka won (walkover without a ball bowled),,,,1996,Sri Lanka
103,Southampton,League-Match,SA,WI,29.0,2.0,,,No result,,"['Q de Kock - 17 runs ', 'HM Amla - 6 runs ']","['SS Cottrell - 2', 'O Thomas - 0']",2019,England
323,Colombo (RPS),League-Match,SL,AUS,146.0,3.0,,,No result,,"['KC Sangakkara - 73 runs ', 'TT Samaraweera -...","['SW Tait - 1', 'B Lee - 1']",2011,Sri Lanka


In [14]:
# Index labels of the rows selected in null_team_2_runs
records_to_remove = null_team_2_runs.index

# Remove those rows. errors='ignore' makes re-running this cell on its own a
# no-op (the labels are already gone) instead of raising KeyError.
crick_df = crick_df.drop(index=records_to_remove, errors='ignore')

In [15]:
# Re-check the non-null counts after dropping the rows listed in records_to_remove
crick_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 518 entries, 0 to 527
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   venue           518 non-null    object 
 1   match_category  518 non-null    object 
 2   team_1          518 non-null    object 
 3   team_2          518 non-null    object 
 4   team_1_runs     518 non-null    float64
 5   team_1_wickets  518 non-null    float64
 6   team_2_runs     518 non-null    float64
 7   team_2_wickets  518 non-null    float64
 8   result          518 non-null    object 
 9   pom             510 non-null    object 
 10  best_batters    248 non-null    object 
 11  best_bowlers    248 non-null    object 
 12  world_cup_year  518 non-null    int64  
 13  host_country    518 non-null    object 
dtypes: float64(4), int64(1), object(9)
memory usage: 60.7+ KB


## Adding New Columns

match_status

In [16]:
# Label each match 'abandoned' when its result says it was abandoned without a
# ball bowled, and 'played' otherwise.
# The vectorised, case-insensitive literal match replaces a per-row
# x.lower() lambda, which raised AttributeError on a missing (NaN) result;
# na=False treats a missing result as not abandoned.
is_abandoned = crick_df['result'].str.contains(
    'match abandoned without a ball bowled', case=False, na=False, regex=False
)
crick_df['match_status'] = 'played'
crick_df.loc[is_abandoned, 'match_status'] = 'abandoned'

winning_team

In [18]:
def get_winning_team(result):
    """Return the lower-cased text that precedes the first "won" in a result.

    Parameters
    ----------
    result : object
        A match result description. Anything that is not a ``str``
        (e.g. ``None`` or NaN) is treated as having no winner.

    Returns
    -------
    str
        The part of the lower-cased result before the first occurrence of
        "won", with surrounding whitespace stripped; an empty string when
        ``result`` is not a string or does not contain "won".
    """
    if not isinstance(result, str):
        return ''

    lowered = result.lower()
    before_won, separator, _ = lowered.partition('won')

    # partition() yields an empty separator when "won" never occurs
    return before_won.strip() if separator else ''

# Derive winning_team from each result string (empty string when the result contains no "won")
crick_df['winning_team'] = crick_df['result'].apply(get_winning_team)

best_batters and best_bowlers Columns