# 1. Overview of Dataset

## Column details:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Data Preprocessing:

## A. Read CSV data

In [None]:
# Load the match data into a DataFrame; the path is relative to the working directory.
df = pd.read_csv("matches.csv")

## B. Columns Overview:

In [None]:
# Show the column labels of the loaded frame.
df.columns

## C. Check Missing Values

In [None]:
# Count the missing entries in each column (sum down the rows).
df.isnull().sum(axis=0)

## D. Handle Missing Values:

In [None]:
# Umpire names are only used for frequency charts, so a placeholder is enough.
df['umpire1'] = df['umpire1'].fillna('Unknown')
df['umpire2'] = df['umpire2'].fillna('Unknown')

# A missing city is filled with the most common host city.
df['city'] = df['city'].fillna(df['city'].mode()[0])

# Do NOT fill a missing winner with the most frequent winner: that would credit
# that team with wins it never had and bias every win-count chart.
# NOTE(review): presumably a missing winner means the match had no result --
# confirm against the `result` column.
df['winner'] = df['winner'].fillna('No Result')

# Re-check the remaining missing values per column.
df.isnull().sum()

## E. Replace Team Names with Short Forms

In [None]:
# Full team name -> short code. Both spellings of the Pune franchise map to the
# same code so they are counted together.
replace_map = {
    'Sunrisers Hyderabad': 'SRH',
    'Mumbai Indians': 'MI',
    # 'GL', not 'GT': 'GT' is the code of a different franchise (Gujarat Titans).
    'Gujarat Lions': 'GL',
    'Rising Pune Supergiant': 'RPS',
    'Royal Challengers Bangalore': 'RCB',
    'Kolkata Knight Riders': 'KKR',
    'Delhi Daredevils': 'DD',
    'Kings XI Punjab': 'PBKS',
    'Chennai Super Kings': 'CSK',
    'Rajasthan Royals': 'RR',
    # 'DCS' keeps Deccan Chargers distinct from Delhi Capitals ('DC').
    'Deccan Chargers': 'DCS',
    'Kochi Tuskers Kerala': 'KTK',
    'Pune Warriors': 'PW',
    'Rising Pune Supergiants': 'RPS',
    'Delhi Capitals': 'DC',
}
# replace() swaps exact cell matches in every column (team1, team2, toss_winner, winner, ...).
df = df.replace(replace_map)
df

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last five rows.
df.tail(n=5)

## F. Data types of all columns

In [None]:
# Print the dtype pandas assigned to each column.
print(df.dtypes)

## G. Data information of all Columns

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
df.info()

## H. Check and Handle Duplicates

In [None]:
# A bare expression in the middle of a cell is never displayed, so print the
# number of fully duplicated rows explicitly before removing them.
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows found: {duplicate_count}")

# Keep the first occurrence of each duplicated row. The original index labels
# are preserved, so the index may have gaps afterwards.
df = df.drop_duplicates()
df

## I. Describe dataset

In [None]:
# Summary statistics of the frame's columns.
df.describe()

## J. Shape of Dataset

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

## K. All Seasons

In [None]:
# List the distinct values of the Season column (one entry per season).
df['Season'].unique()

## L. Finding the match won by the largest run margin

In [None]:
# idxmax() returns an index *label*, so the row must be fetched with .loc.
# .iloc expects a position and only agrees with the label while the index is a
# gap-free 0..n-1 range, which is not guaranteed once rows have been dropped.
df.loc[df['win_by_runs'].idxmax()]

## M. Finding the match won by the largest wicket margin

In [None]:
# idxmax() returns an index *label*, so the row must be fetched with .loc.
# .iloc expects a position and only agrees with the label while the index is a
# gap-free 0..n-1 range, which is not guaranteed once rows have been dropped.
df.loc[df['win_by_wickets'].idxmax()]

# 3. EDA (Exploratory Data Analysis):

## Univariate

### Q1.Which season had the most matches?

In [None]:
# One bar per season; the bar height is the number of matches in that season.
sns.countplot(x='Season', data=df, hue='Season', palette='jet_r')
plt.xticks(rotation=45)
plt.title('Matches per Season')
plt.xlabel('Season')
# The bars count matches, not seasons.
plt.ylabel('Total Number of Matches')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q01_matches_per_season.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

### Q2. Which city hosted the most matches?

In [None]:
# One bar per host city; the bar height is the number of matches played there.
sns.countplot(x='city', data=df, hue='city', palette='jet_r')
plt.xticks(rotation=90)
plt.title('Matches Hosted per City')
plt.xlabel('City')
plt.ylabel('Total Number of Matches')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q02_matches_per_city.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

### Q3. Which team appears most frequently as Team1?

In [None]:
# Share of matches in which each team is listed in the team1 column.
team1_counts = df['team1'].value_counts()
team1_counts.plot(kind='pie', autopct='%1.1f%%', radius=1.5)
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q03_team1_share.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

### Q4. Which team appears most frequently as Team2?

In [None]:
# Number of matches in which each team is listed in the team2 column.
team2_counts = df['team2'].value_counts()
team2_counts.plot(kind='bar')
plt.xlabel('Team2')
plt.ylabel('Number of Matches')
plt.title('Frequency of Team2 Appearances')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q04_team2_frequency.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


### Q5. Which team won the toss most often?

In [None]:
# One bar per team; the bar height is the number of tosses that team won.
sns.countplot(x='toss_winner', data=df, hue='toss_winner', palette='jet_r')
plt.xticks(rotation=90)
plt.title('Toss Wins by Team')
plt.xlabel('Teams')
plt.ylabel('Number of Toss Wins')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q05_toss_wins.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


### Q6. Which toss decision was chosen most often (bat/field)

In [None]:
# How often each toss decision value occurs.
sns.countplot(x='toss_decision', data=df, hue='toss_decision', palette='jet_r')
plt.title('Toss Decision Counts')
plt.xlabel('Toss Decision')
plt.ylabel('Frequency')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q06_toss_decision.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

### Q7. Which result type occurred most frequently (normal/tie/no result)

In [None]:
# How often each value of the result column occurs.
result_counts = df['result'].value_counts()
result_counts.plot(kind='bar')
plt.xlabel('Result Type')
plt.ylabel('Frequency')
plt.title('Match Result Counts')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q07_result_types.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


### Q8. Which team won most matches?

In [None]:
# One bar per value of the winner column; the bar height is the number of matches won.
sns.countplot(x='winner', data=df, hue='winner', palette='jet_r')
plt.xticks(rotation=90)
plt.title('Matches Won by Team')
plt.xlabel('Teams')
plt.ylabel('Number of Wins')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q08_wins_by_team.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


### Q9. Which stadium hosted the most matches?

In [None]:
# Number of matches hosted at each venue, most used first.
venue_counts = df['venue'].value_counts()
# Small tick font because the plot has one label per venue.
venue_counts.plot(kind='bar', fontsize=8)
plt.xlabel('Venue')
plt.ylabel('Number of Matches')
plt.title('Matches Hosted per Venue')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q09_matches_per_venue.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


### Q10. Distribution of win margins by runs

In [None]:
# A win_by_runs of 0 means the match was not won by runs, so those rows are not
# margins at all; leaving them in would let the zero bin dominate the histogram.
run_margins = df.loc[df['win_by_runs'] > 0, 'win_by_runs']
run_margins.plot(kind='hist')
plt.xlabel('Win by Runs')
plt.ylabel('Frequency')
plt.title('Distribution of Win Margins by Runs')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q10_win_margin_runs.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

### Q11. Distribution of win margins by Wickets

In [None]:
# A win_by_wickets of 0 means the match was not won by wickets, so those rows
# are not margins at all; leaving them in would let the zero bin dominate.
wicket_margins = df.loc[df['win_by_wickets'] > 0, 'win_by_wickets']
wicket_margins.plot(kind='hist')
plt.xlabel('Win by Wickets')
plt.ylabel('Frequency')
# This plot shows wicket margins, so the title says so.
plt.title('Distribution of Win Margins by Wickets')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q11_win_margin_wickets.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


### Q12. Top 10 most frequently used umpire1

In [None]:
# The ten most frequent umpire1 values and their share within that top ten.
z = df['umpire1'].value_counts().head(10)
plt.pie(z, labels=z.index, autopct='%1.1f%%')

plt.title('Proportion of Umpire1 Selection')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q12_top_umpire1.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


### Q13. Top 10 most frequently used umpire2

In [None]:
# The ten most frequent umpire2 values and their share within that top ten.
x = df['umpire2'].value_counts().head(10)
plt.pie(x, labels=x.index, autopct='%1.1f%%', radius=1.2)

# Title added so the chart can be identified once it is saved to disk.
plt.title('Proportion of Umpire2 Selection')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("uni_q13_top_umpire2.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

# Bivariate 

## Q1. Are thrillers becoming more common?

In [None]:
# A "thriller" is a match won by at most 10 runs or at most 2 wickets.
# The > 0 checks exclude matches that were not decided by that kind of margin.
thrillers = df[
    (
        ((df['win_by_runs'] > 0) & (df['win_by_runs'] <= 10)) |
        ((df['win_by_wickets'] > 0) & (df['win_by_wickets'] <= 2))
    )
]

matches_per_season = df['Season'].value_counts().sort_index()

# Reindex against every season so a season with no thrillers is plotted as 0
# instead of silently dropping out of the count line.
thrillers_per_season = (
    thrillers['Season']
    .value_counts()
    .reindex(matches_per_season.index, fill_value=0)
)

# Percentage of each season's matches that were thrillers.
thriller_ratio = thrillers_per_season / matches_per_season * 100

plt.figure(figsize=(10,6))
plt.plot(thrillers_per_season.index, thrillers_per_season.values, marker='o', label="Thriller Matches")
plt.plot(thriller_ratio.index, thriller_ratio.values, marker='s', linestyle="--", label="Thriller % of Season")

plt.title("Are Thrillers Becoming More Common in IPL?")
plt.xlabel("Season")
plt.ylabel("Number of Matches / % of Season")
plt.legend()
plt.grid(True)
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q1_thrillers.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


## Q2. Does batting/fielding first matter?

In [None]:
# Label each match by whether the toss winner also won the match. A vectorised
# comparison plus map() replaces the row-by-row apply() and gives the same labels.
toss_winner_won = df['toss_winner'] == df['winner']
df['toss_match_result'] = toss_winner_won.map({True: "Won Match", False: "Lost Match"})

# Rows: toss decision; columns: match outcome for the toss winner.
cross_tab = pd.crosstab(df['toss_decision'], df['toss_match_result'])

# crosstab orders the columns alphabetically ("Lost Match", "Won Match"),
# so salmon marks losses and skyblue marks wins.
cross_tab.plot(kind="bar", stacked=True, figsize=(8,6), color=["salmon","skyblue"])

plt.title("Does Batting/Fielding First Matter?")
plt.xlabel("Toss Decision")
plt.ylabel("Number of Matches")
plt.legend(title="Match Result")
plt.xticks(rotation=0)

# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q2_toss_decision_outcome.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


## Q3. Are matches ending in "No Result" increasing?

In [None]:
# Every season present in the data, in sorted order.
season_index = df['Season'].value_counts().sort_index().index

# Count 'no result' matches per season. Reindexing over all seasons shows
# seasons without any as 0, and keeps .plot() from failing on an empty
# selection when no such match exists.
no_result_per_season = (
    df.loc[df['result'] == 'no result', 'Season']
    .value_counts()
    .reindex(season_index, fill_value=0)
)

plt.figure(figsize=(10,6))
no_result_per_season.plot(kind='bar', color='red')

plt.title("No Result Matches per Season")
plt.xlabel("Season")
plt.ylabel("Number of No Results")
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q3_no_results.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

## Q4. How often do matches go into Super Overs (Ties)?

In [None]:
# Every season present in the data, in sorted order.
season_index = df['Season'].value_counts().sort_index().index

# Count tied matches per season. Reindexing over all seasons shows seasons
# without a tie as 0 (otherwise the line would jump straight between the
# seasons that had one) and keeps .plot() from failing on an empty selection.
super_over_per_season = (
    df.loc[df['result'] == 'tie', 'Season']
    .value_counts()
    .reindex(season_index, fill_value=0)
)

plt.figure(figsize=(10,6))
super_over_per_season.plot(kind='line', marker='o', color="purple")
plt.title("Super Over (Tied) Matches per Season")
plt.xlabel("Season")
plt.ylabel("Number of Super Overs")
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q4_super_overs.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

## Q5. Which teams dominate at home vs away venues?

In [None]:
# Wins split by whether the winner is listed as team1 or as team2.
# NOTE(review): this treats team1 as the home side and team2 as the away side;
# nothing in this notebook confirms that -- verify against the dataset description.
home_wins = df[df['winner'] == df['team1']]['winner'].value_counts()
away_wins = df[df['winner'] == df['team2']]['winner'].value_counts()

# The two series are aligned on team code; a team missing from one side simply
# gets no bar for it.
pd.DataFrame({"Home Wins": home_wins, "Away Wins": away_wins}).plot(
    kind='bar', figsize=(12,6), color=["green", "orange"]
)

plt.title("Home vs Away Wins per Team")
plt.xlabel("Team")
plt.ylabel("Matches Won")
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q5_home_away_wins.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()


## Q6. Who wins Player of the Match in finals most often?

In [None]:
if 'match_type' in df.columns:
    # Preferred: the data marks finals explicitly.
    finals = df[df['match_type'] == 'Final']
else:
    # Fallback: take the last listed match of every season. Selecting all
    # matches of the latest season would chart an entire season, not finals.
    # NOTE(review): assumes rows are in chronological order within a season,
    # so the last row of each season is its final -- confirm against the data.
    finals = df.drop_duplicates(subset='Season', keep='last')

# How many finals each player was named Player of the Match in.
final_awards = finals['player_of_match'].value_counts()

plt.figure(figsize=(8,5))
final_awards.plot(kind='bar', color="gold")
plt.title("Player of the Match Awards in Finals")
plt.ylabel("Count")
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q6_finals_player_of_match.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

## Q7. Impact of Toss on Match Result

In [None]:
# True where the toss winner is also the match winner.
df['same'] = df['toss_winner'] == df['winner']
# Readable labels for the plot; this overwrites any earlier 'toss_match_result' column.
df['toss_match_result'] = df['same'].map({True: 'Toss Winner Also Won', False: 'Toss Winner Lost'})

sns.countplot(x='toss_match_result', data=df, hue='toss_match_result')
plt.title("Did Toss Winner Also Win the Match?")
plt.xlabel("Result")
plt.ylabel("Number of Matches")
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q7_toss_impact.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

## Q8. Are some teams more successful under specific umpires?

In [None]:
# Number of wins for every (umpire1, winning team) pair.
umpire_winner = df.groupby(['umpire1', 'winner']).size().reset_index(name='wins')

# Keep the seven most frequent pairs. Each pair gets its own label for the
# y-axis: with y='umpire1' and dodge=False, an umpire appearing with two teams
# would have both bars drawn on top of each other.
umpire_winner_sorted = (
    umpire_winner.sort_values(by='wins', ascending=False)
    .head(7)
    .assign(pair=lambda t: t['umpire1'] + ' / ' + t['winner'])
)

plt.figure(figsize=(10,6))
sns.barplot(y='pair', x='wins', hue='winner', data=umpire_winner_sorted, dodge=False)
plt.title("Top 7 Umpire-Team Winning Combinations")
plt.xlabel("Number of Wins")
plt.ylabel("Umpire / Winning Team")
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("bi_q8_umpire_team_wins.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

# Multivariate

## Q1. Across seasons, which teams win more by batting first (runs) vs chasing (wickets)?

In [None]:
# Classify every match by how it was decided. Matches with neither a run nor a
# wicket margin (both columns 0) get their own label instead of being counted
# as wins "By Wickets".
df['win_type'] = np.select(
    [df['win_by_runs'] > 0, df['win_by_wickets'] > 0],
    ["By Runs", "By Wickets"],
    default="No Margin",
)

# Only matches with a real margin answer the runs-vs-wickets question.
decided = df[df['win_type'] != "No Margin"]

# Group by Season + Winner + Win Type
season_team_wins = decided.groupby(['Season', 'winner', 'win_type']).size().reset_index(name='count')

plt.figure(figsize=(14,7))
sns.scatterplot(
    data=season_team_wins,
    x="Season", y="count", hue="win_type", style="winner", s=150
)

plt.title("Season + Winning Team + Win Type", fontsize=14)
plt.xlabel("Season")
plt.ylabel("Wins Count")
# Put the legend outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Figure-specific file name: a shared name such as "1.png" is overwritten by
# every other cell that saves a figure.
plt.savefig("multi_q1_win_type_by_season.png", bbox_inches="tight", dpi=300, transparent=True)
plt.show()

# Outliers

In [None]:
# Box plot of the wicket margins, drawn before any outlier treatment in this section.
plt.figure(figsize=(6, 5))
sns.boxplot(data=df, y='win_by_wickets')
plt.show()

In [None]:
# Flag win_by_wickets outliers with the 1.5 * IQR rule and replace them with
# the mean of the values that lie inside the fences.
Q1 = df['win_by_wickets'].quantile(0.25)
Q3 = df['win_by_wickets'].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# True for rows whose margin lies within [lower, upper].
in_range = (df['win_by_wickets'] >= lower) & (df['win_by_wickets'] <= upper)

mean_val = df.loc[in_range, 'win_by_wickets'].mean()

# Show the rows about to be replaced. The name previously printed here was
# never defined, so the cell stopped with a NameError before replacing anything.
out_win_by_wickets = df[~in_range]
print(out_win_by_wickets['win_by_wickets'])

# Keep in-range values; everything else becomes the in-range mean.
df['win_by_wickets'] = df['win_by_wickets'].where(in_range, mean_val)

In [None]:
# Box plot of the wicket margins again, to compare with the earlier plot.
plt.figure(figsize=(6, 5))
sns.boxplot(data=df, y='win_by_wickets')
plt.show()