In [None]:
import os

import numpy as np # numerical computing 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #visualization
import seaborn as sns #modern visualization
import boto3

In [None]:
# Global plotting defaults: wide 14x8 figures on seaborn's dark grid theme.
plt.rc('figure', figsize=(14, 8))
sns.set_style(style="darkgrid")

In [None]:
# Download the raw IPL matches CSV from S3 into the working directory.
#
# Credentials are never written into the notebook. They are read from the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables; when those
# are unset the values are None and boto3 falls back to its default credential
# chain (shared ~/.aws/credentials file, IAM role, ...).
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
BUCKET = "aws-analytics-course"
KEY = "raw/ipl/matches.csv"

s3 = boto3.resource(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name='us-east-1',
)

# Saves the object next to the notebook as 'matches.csv'.
s3.Bucket(BUCKET).download_file(KEY, 'matches.csv')

In [None]:
# Load the downloaded match records into a DataFrame.
csv_path = 'matches.csv'
matches = pd.read_csv(csv_path)

In [None]:
# Dataset dimensions, displayed as a (row count, column count) tuple.
n_rows, n_cols = matches.shape
n_rows, n_cols

In [None]:
# Summarise the dataset's structure: each column's dtype and non-null count,
# plus the frame's memory usage. info() prints its report and returns None.
matches.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = matches.describe()
summary_stats

In [None]:
# Preview the first two rows of the input dataset.
matches.iloc[:2]

In [None]:
# How many matches were played?
# Count the distinct match ids instead of taking the largest id: max() only
# equals the number of matches when ids run 1..N without gaps.
matches['id'].nunique()

In [None]:
# How many IPL seasons have been played?
# nunique() returns the number of distinct seasons, which is what the question
# asks; unique() would only list the season labels and leave counting to the reader.
matches['season'].nunique()

In [None]:
# Which team won by the largest run margin? (shows the full match row)
# idxmax() returns an index *label*, so the row is fetched with label-based
# .loc; position-based .iloc only agrees with it on a default RangeIndex.
matches.loc[matches['win_by_runs'].idxmax()]

In [None]:
# Just the winning team of the match with the largest run margin.
# idxmax() yields an index label, so select row and column together via .loc.
matches.loc[matches['win_by_runs'].idxmax(), 'winner']

In [None]:
# Which team won by the closest (minimum) run margin?
# Only margins of at least 1 run are considered; 0 is excluded by the filter.
run_margins = matches.loc[matches['win_by_runs'].ge(1), 'win_by_runs']
# idxmin() returns the original index label of the smallest margin (the first
# one if several matches tie), so look it up with label-based .loc, not .iloc.
matches.loc[run_margins.idxmin(), 'winner']

In [None]:
# Which team won by the minimum number of wickets? (shows the full match row)
# Only margins of at least 1 wicket are considered; 0 is excluded by the filter.
wicket_margins = matches.loc[matches['win_by_wickets'].ge(1), 'win_by_wickets']
# idxmin() returns the original index label of the smallest margin (the first
# one if several matches tie), so look it up with label-based .loc, not .iloc.
matches.loc[wicket_margins.idxmin()]

In [None]:
# Matches played per season, drawn as a bar chart of row counts,
# to see which season had the most matches.
fig, ax = plt.subplots()
sns.countplot(data=matches, x='season', ax=ax)
plt.show()

In [None]:
# Most successful IPL team: total wins per team as a horizontal bar chart.
data = matches['winner'].value_counts()
sns.barplot(x=data, y=data.index, orient='h');

In [None]:
# Top 10 players by number of "player of the match" awards.
top_players = matches.player_of_match.value_counts().head(10)

fig, ax = plt.subplots()
# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes".
sns.barplot(x=top_players.index, y=top_players, orient='v', ax=ax)
# Keep the original 0-20 scale, but grow it when a player has more than
# 20 awards so the tallest bar is never clipped.
ax.set_ylim(0, max(20, top_players.max() + 1))
# Labels are set after plotting so seaborn's automatic axis labelling
# cannot overwrite them.
ax.set_ylabel("Count")
ax.set_title("Top player of the match Winners")
plt.show()

In [None]:
# Has winning the toss helped in winning the match?
# True  -> the toss winner also won the match; False -> it did not.
ss = matches['toss_winner'] == matches['winner']
# Print the raw counts: a bare expression in the middle of a cell is computed
# but never displayed.
print(ss.value_counts())
# Pass the series by keyword so it is bound to `x`, not to `data`.
sns.countplot(x=ss);