In [1]:
!pip install numpy pandas seaborn matplotlib

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset
dataset = pd.read_csv("../datasets/matches.csv")

In [2]:
# Step 2: Initial data exploration
print("First 5 rows of the dataset:")
dataset.head()

First 5 rows of the dataset:


Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,4/5/2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,4/6/2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,4/7/2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,2017,Indore,4/8/2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,2017,Bangalore,4/8/2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [3]:
# Step 3: Check for missing values
print("\nMissing values in each column:")
print(dataset.isna().sum())



Missing values in each column:
id                   0
season               0
city                 7
date                 0
team1                0
team2                0
toss_winner          0
toss_decision        0
result               0
dl_applied           0
winner               3
win_by_runs          0
win_by_wickets       0
player_of_match      3
venue                0
umpire1              1
umpire2              1
umpire3            636
dtype: int64


In [4]:
# Step 4: Check dataset dimensions
print("\nShape of the dataset:", dataset.shape)


Shape of the dataset: (636, 18)


In [5]:
# Step 5: Drop unnecessary column with too many missing values
dataset.drop(['umpire3'], axis=1, inplace=True)
print("\nColumns after dropping umpire3:")
print(dataset.columns)


Columns after dropping umpire3:
Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2'],
      dtype='object')


In [6]:
# Step 6: Create a mapping dictionary to convert full team names to abbreviations
Teams = {
    'Mumbai Indians': 'MI',
    'Kolkata Knight Riders': 'KKR',
    'Chennai Super Kings': 'CSK',
    'Rajasthan Royals': 'RR',
    'Deccan Chargers': 'DC',
    'Kochi Tuskers Kerala': 'KTK',
    'Pune Warriors': 'PW',
    'Rising Pune Supergiants': 'RPS',
    'Royal Challengers Bangalore': 'RCB',
    'Sunrisers Hyderabad': 'SRH',
    'Rising Pune Supergiant': 'RPS',
    'Gujarat Lions': 'GL',
    'Kings XI Punjab': 'KXIP',
    'Delhi Daredevils': 'DD'
}

In [7]:
# Step 7: Apply the team name mapping to team1 and team2 columns
dataset['team1'] = dataset['team1'].map(Teams)
dataset['team2'] = dataset['team2'].map(Teams)
print("\nDataset after mapping team names:")
print(dataset.head())


Dataset after mapping team names:
   id  season       city      date team1 team2                  toss_winner  \
0   1    2017  Hyderabad  4/5/2017   SRH   RCB  Royal Challengers Bangalore   
1   2    2017       Pune  4/6/2017    MI   RPS       Rising Pune Supergiant   
2   3    2017     Rajkot  4/7/2017    GL   KKR        Kolkata Knight Riders   
3   4    2017     Indore  4/8/2017   RPS  KXIP              Kings XI Punjab   
4   5    2017  Bangalore  4/8/2017   RCB    DD  Royal Challengers Bangalore   

  toss_decision  result  dl_applied                       winner  win_by_runs  \
0         field  normal           0          Sunrisers Hyderabad           35   
1         field  normal           0       Rising Pune Supergiant            0   
2         field  normal           0        Kolkata Knight Riders            0   
3         field  normal           0              Kings XI Punjab            0   
4           bat  normal           0  Royal Challengers Bangalore           15   

   

In [8]:
# Step 8: Analyze player of the match awards
print("\nTop players with most 'Player of the Match' awards:")
print(dataset['player_of_match'].value_counts().head(10))


Top players with most 'Player of the Match' awards:
player_of_match
CH Gayle          18
YK Pathan         16
DA Warner         15
AB de Villiers    15
RG Sharma         14
SK Raina          14
MS Dhoni          13
G Gambhir         13
AM Rahane         12
MEK Hussey        12
Name: count, dtype: int64


In [9]:
# Step 9: Identify unique teams and venues
print("\nUnique teams in the dataset:")
print(dataset['team1'].unique())


Unique teams in the dataset:
['SRH' 'MI' 'GL' 'RPS' 'RCB' 'KKR' 'DD' 'KXIP' 'CSK' 'RR' 'DC' 'KTK' 'PW']


In [10]:
print("\nUnique cities where matches were played:")
print(dataset['city'].unique())


Unique cities where matches were played:
['Hyderabad' 'Pune' 'Rajkot' 'Indore' 'Bangalore' 'Mumbai' 'Kolkata'
 'Delhi' 'Chandigarh' 'Kanpur' 'Jaipur' 'Chennai' 'Cape Town'
 'Port Elizabeth' 'Durban' 'Centurion' 'East London' 'Johannesburg'
 'Kimberley' 'Bloemfontein' 'Ahmedabad' 'Cuttack' 'Nagpur' 'Dharamsala'
 'Kochi' 'Visakhapatnam' 'Raipur' 'Ranchi' 'Abu Dhabi' 'Sharjah' nan]


In [11]:
# Step 10: Find the match with maximum win by runs
max_win_by_runs = dataset['win_by_runs'].max()
print(f"\nMaximum win by runs: {max_win_by_runs}")
print("\nMatch with the highest win by runs:")
print(dataset[dataset['win_by_runs'] == max_win_by_runs])


Maximum win by runs: 146

Match with the highest win by runs:
    id  season   city      date team1 team2       toss_winner toss_decision  \
43  44    2017  Delhi  5/6/2017    MI    DD  Delhi Daredevils         field   

    result  dl_applied          winner  win_by_runs  win_by_wickets  \
43  normal           0  Mumbai Indians          146               0   

   player_of_match             venue      umpire1    umpire2  
43     LMP Simmons  Feroz Shah Kotla  Nitin Menon  CK Nandan  


In [12]:
# Step 11: Find matches with maximum win by wickets
max_win_by_wickets = dataset['win_by_wickets'].max()
print(f"\nMaximum win by wickets: {max_win_by_wickets}")
print("\nMatches with the highest win by wickets:")
print(dataset[dataset['win_by_wickets'] == max_win_by_wickets])


Maximum win by wickets: 10

Matches with the highest win by wickets:
      id  season        city       date team1 team2  \
2      3    2017      Rajkot   4/7/2017    GL   KKR   
34    35    2017  Chandigarh  4/30/2017    DD  KXIP   
71    72    2008      Mumbai  4/27/2008    MI    DC   
119  120    2009   Cape Town  4/19/2009  KXIP    DD   
183  184    2010   Bangalore  3/18/2010    RR   RCB   
298  299    2011      Mumbai  5/20/2011    MI    RR   
376  377    2012      Jaipur  5/20/2012    RR    MI   
390  391    2013  Chandigarh  4/10/2013  KXIP   CSK   
542  543    2015       Delhi  4/26/2015    DD   RCB   
590  591    2016      Rajkot  4/21/2016    GL   SRH   

                     toss_winner toss_decision  result  dl_applied  \
2          Kolkata Knight Riders         field  normal           0   
34               Kings XI Punjab         field  normal           0   
71               Deccan Chargers         field  normal           0   
119             Delhi Daredevils         fie

In [13]:
# Step 12: Visualize toss decisions by season
plt.figure(figsize=(12, 6))
sns.countplot(x='season', hue='toss_decision', data=dataset)
plt.title('Toss Decisions by Season')
plt.xlabel('Season')
plt.ylabel('Count')
plt.savefig('toss_decisions_by_season.png')
plt.close()


In [14]:
# Step 13: Analyze toss winners
toss_winners = dataset['toss_winner'].value_counts()
print("\nTeams with most toss wins:")
print(toss_winners)


Teams with most toss wins:
toss_winner
Mumbai Indians                 85
Kolkata Knight Riders          78
Delhi Daredevils               72
Royal Challengers Bangalore    70
Kings XI Punjab                68
Chennai Super Kings            66
Rajasthan Royals               63
Deccan Chargers                43
Sunrisers Hyderabad            35
Pune Warriors                  20
Gujarat Lions                  15
Kochi Tuskers Kerala            8
Rising Pune Supergiants         7
Rising Pune Supergiant          6
Name: count, dtype: int64


In [15]:
plt.figure(figsize=(12, 6))
toss_winners.plot(kind='bar')
plt.title('Number of Tosses Won by Each Team')
plt.xlabel('Team')
plt.ylabel('Number of Tosses Won')
plt.savefig('toss_winners.png')
plt.close()


In [16]:
# Step 14: Calculate matches played by each team
matches_played_by_teams = pd.concat([dataset['team1'], dataset['team2']], axis=0)
matches_played = matches_played_by_teams.value_counts().reset_index()
matches_played.columns = ['team_name', 'matches_played']
print("\nMatches played by each team:")
print(matches_played)


Matches played by each team:
   team_name  matches_played
0         MI             157
1        RCB             152
2        KKR             148
3       KXIP             148
4         DD             147
5        CSK             131
6         RR             118
7        SRH              76
8         DC              75
9         PW              46
10        GL              30
11       RPS              30
12       KTK              14


In [17]:
# Step 15: Map winner names to abbreviations for consistency
dataset['winner'] = dataset['winner'].map(Teams)

In [18]:
# Step 16: Calculate wins for each team
wins = dataset['winner'].value_counts().reset_index()
wins.columns = ['team_name', 'wins']
print("\nWins by each team:")
print(wins)


Wins by each team:
   team_name  wins
0         MI    92
1        CSK    79
2        KKR    77
3        RCB    73
4       KXIP    70
5         RR    63
6         DD    62
7        SRH    42
8         DC    29
9        RPS    15
10        GL    13
11        PW    12
12       KTK     6


In [19]:
# Step 17: Merge matches played and wins data
team_performance = matches_played.merge(wins, on='team_name', how='inner')
team_performance.columns = ['team', 'matches_played', 'wins']

In [20]:
# Step 18: Calculate win percentage for each team
team_performance['win_percentage'] = (team_performance['wins'] / team_performance['matches_played']) * 100
print("\nTeam performance including win percentage:")
print(team_performance.sort_values('win_percentage', ascending=False))


Team performance including win percentage:
    team  matches_played  wins  win_percentage
5    CSK             131    79       60.305344
0     MI             157    92       58.598726
7    SRH              76    42       55.263158
6     RR             118    63       53.389831
2    KKR             148    77       52.027027
11   RPS              30    15       50.000000
1    RCB             152    73       48.026316
3   KXIP             148    70       47.297297
10    GL              30    13       43.333333
12   KTK              14     6       42.857143
4     DD             147    62       42.176871
8     DC              75    29       38.666667
9     PW              46    12       26.086957


In [21]:

# Step 19: Visualize team performances
plt.figure(figsize=(14, 8))
x = np.arange(len(team_performance['team']))
width = 0.35

fig, ax = plt.subplots(figsize=(14, 8))
matches = ax.bar(x - width/2, team_performance['matches_played'], width, label='Matches Played')
wins = ax.bar(x + width/2, team_performance['wins'], width, label='Matches Won')

ax.set_xlabel('Teams')
ax.set_ylabel('Count')
ax.set_title('Matches Played vs Matches Won by Each Team')
ax.set_xticks(x)
ax.set_xticklabels(team_performance['team'], rotation=45)
ax.legend()

plt.tight_layout()
plt.savefig('team_performance.png')
plt.close()

<Figure size 1400x800 with 0 Axes>

In [24]:

# Step 20: Visualize win percentages
plt.figure(figsize=(12, 6))
sns.barplot(x='team', y='win_percentage', data=team_performance.sort_values('win_percentage', ascending=False))
plt.title('Win Percentage of Each Team')
plt.xlabel('Team')
plt.ylabel('Win Percentage')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('win_percentage.png')
plt.close()