# Lets Data hit SIX

Working with several datasets to analyse and predict how a cricket player will perform in a given match.

### Step 1 : Data Loading and Understanding

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Single place to re-point the notebook when the CSV folder moves.
DATA_DIR='D:\\Cricket Data Analysis\\Data'

# Load the four source tables; each frame is named after the CSV it comes from.
batting_data=pd.read_csv(f'{DATA_DIR}\\fact_bating_summary.csv')
bowling_data=pd.read_csv(f'{DATA_DIR}\\fact_bowling_summary.csv')
match_data=pd.read_csv(f'{DATA_DIR}\\dim_match_summary.csv')
player_info=pd.read_csv(f'{DATA_DIR}\\dim_players.csv')


In [None]:
# Batting Data
batting_data.head()

In [None]:
batting_data.dtypes

In [None]:
# Swap the '-' entries of the strike-rate column for 0 so that the column can be
# cast to float in a later cell.
batting_data['SR']=batting_data['SR'].replace('-',0)

In [None]:
def handle(value):
    """Convert a raw cell value to ``float``.

    Strings are stripped of surrounding whitespace first; a string that is
    empty after stripping is treated as missing and yields NaN. Any other
    value is passed straight to ``float``.

    Raises:
        ValueError: if a non-blank string cannot be parsed as a number.
    """
    if isinstance(value, str):
        value = value.strip()
        if not value:
            # Blank cell: report it as missing instead of failing in float('').
            return np.nan
    return float(value)

In [None]:
batting_data['SR']=batting_data['SR'].astype(float)

In [None]:
batting_data.shape

In [None]:
batting_data.isna().sum()

In [None]:
# Bowling Data

In [None]:
bowling_data.head()

In [None]:
bowling_data.dtypes

In [None]:
bowling_data.shape

In [None]:
# Match Data 


In [None]:
match_data.head()

In [None]:
match_data.dtypes

In [None]:
match_data.shape

In [None]:
match_data.isna().sum()

In [None]:
match_data['margin']=match_data['margin'].fillna('Not_Available')

In [None]:
# Player info

In [None]:
player_info.head()

In [None]:
# Remove the 'image' column from the player table in place.
player_info.drop(columns=['image'], inplace=True)

In [None]:
player_info.columns

In [None]:
player_info.dtypes

In [None]:
player_info.shape

## Step 2: Merge Data 

In [None]:
# Attach the batting rows to their match record, joining on match_id.
merge_1=match_data.merge(batting_data,on='match_id')

In [None]:
# Join the bowling rows for the same match_id onto the match+batting rows.
# NOTE(review): if both tables hold several rows per match_id, this join pairs every
# batting row with every bowling row of that match — confirm that is intended.
merged_data=merge_1.merge(bowling_data,on='match_id')

In [None]:
merged_data.head()

In [None]:
merged_data.shape

## Exploratory Data Analysis

In [None]:
match_data=match_data.merge(batting_data,on='match_id')

In [None]:
match_data.head()

In [None]:
match_data.rename(columns={'team1':'TeamA','team2':'TeamB','winner':'Winner','margin':'Won_by','ground':'Location','matchDate':'Match_Date','match_id':'Match_id','match':'Match','teamInnings':'Playing','battingPos':'Batting_Position','runs':'Runs'},inplace=True)

In [None]:
# Let's plot a histogram of how many runs players scored per innings
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
sns.histplot(match_data['Runs'],bins=30)
plt.title('Distrubution how much runs each player scored per match')
plt.xlabel('Runs')
plt.ylabel('Frequency')

In [None]:
# This shows that many batsmen failed to perform well; only a few scored above 40
# Top Scorers
top_scrores=match_data.sort_values(by='Runs',ascending=False)

In [None]:
top_run_scores=top_scrores.head(10)

In [None]:
# Most consistent: mean strike rate per batsman, restricted to innings played at a
# batting position below 5 with more than 10 balls faced, highest mean first.
consistent=(
    match_data.loc[match_data['Batting_Position'].lt(5) & match_data['balls'].gt(10)]
    .groupby('batsmanName',as_index=False)['SR']
    .mean()
    .sort_values(by='SR',ascending=False)
)

In [None]:
consistent_players=consistent['batsmanName'].to_list()

In [None]:
# List the first ten names of consistent_players, numbered from 1.
print('Top Batsman with consitent strike for T20 World Cup are ')
for rank,player in enumerate(consistent_players[:10],start=1):
    print(f'{rank}){player}')

In [None]:
plt.bar(consistent_players[0:3],consistent['SR'][0:3])
plt.xlabel('BatsMan Name')
plt.ylabel('Strike Rate')
plt.title('Top 3 batsman of 2020 T20I World Cup')


In [None]:
# Most wins by
# Group on Match_id (as the matches-played cells further down do) rather than on the
# 'Match' label, so two fixtures that share a label still count as two results.
winners=match_data.groupby('Match_id')['Winner'].value_counts().reset_index()

In [None]:
winners.drop(columns='count',inplace=True)

In [None]:
new_=winners['Winner'].value_counts().reset_index()

In [None]:
new_.sort_values(by='count',ascending=False,inplace=True)

In [None]:
plt.bar(list(new_['Winner'].unique()[0:4]),new_['count'][0:4])
plt.xlabel('Teams')
plt.ylabel('Number of Wins')
plt.title('Team with most wins')


In [None]:
# England, Pakistan, Sri Lanka and India appear to be the stronger teams when it comes to winning
location=match_data.groupby('Location')['Runs'].sum().reset_index()

In [None]:
# For every ground in `location`, count the distinct Match_id values recorded there.
innings_played_stadium=[
    len(match_data.loc[match_data['Location']==stadium,'Match_id'].unique())
    for stadium in location['Location'].to_list()
]

In [None]:
# Total runs at each ground divided by the number of distinct matches there, then
# floor-divided by 2 — presumably because each match has two innings (confirm).
avg_score_at_pitches=[(x/y)//2 for x,y in zip(location['Runs'].to_list(),innings_played_stadium) ]

In [None]:
avg_score_at_pitches

In [None]:
# Map each ground name to its computed average score.
avg_scores=dict(zip(location['Location'].to_list(),avg_score_at_pitches))

In [None]:
for key,value in avg_scores.items():
    print(f'Average Score of First Innings at {key} is {value}')

In [None]:
plt.bar(location['Location'].to_list(),avg_score_at_pitches)
plt.title('Average Scores each innings at different stadium ')
plt.xlabel('Stadium')
plt.ylabel('Score')

In [None]:
# Opposition
teams=list(match_data['Playing'].unique())

In [None]:
# Re-read the raw match summary: match_data was overwritten earlier by the merge with
# batting_data and had its columns renamed, so the original table is loaded again here.
match_data_1=pd.read_csv('D:\\Cricket Data Analysis\\Data\\dim_match_summary.csv')

In [None]:
# Now for bowlers
bowling_data=bowling_data.merge(match_data_1,on='match_id')

In [None]:
sns.histplot(bowling_data['wickets'])
plt.title('Distribution of Wickets')
plt.xlabel('Wickets')
plt.ylabel('Frequency')

In [None]:
# This graph indicates that most bowlers failed to take a wicket (around 175+ entries), while no bowler took 5 wickets

In [None]:
# most wickets by bowler
most_wickets=bowling_data.groupby('bowlerName')['wickets'].sum().reset_index()

In [None]:
sorted_=most_wickets.sort_values(by='wickets',ascending=False)

In [None]:
# The ten bowlers with the highest wicket totals, numbered from 1.
print('Top 10 bowlers with most wicket')
for rank,name in enumerate(sorted_['bowlerName'].to_list()[:10],start=1):
    print(f'{rank}){name}')

In [None]:
plt.bar(sorted_['bowlerName'][0:3],sorted_['wickets'][0:3])
plt.xlabel('Player Name')
plt.ylabel('Wickets Taken')
plt.title('Top 3 wicket taking bowlers in t20 world cup 2022')

In [None]:
# Average bowling economy of each team 
bowling_economy=bowling_data.groupby('bowlingTeam')['economy'].mean().reset_index()

In [None]:
# Most wicket taking team 
bowling_wickets=bowling_data.groupby('bowlingTeam')['wickets'].sum().reset_index()


In [None]:
# Total runs scored by each team (turned into a per-match average further below)
average_run=match_data.groupby('Playing')['Runs'].sum().reset_index()

In [None]:
# Matches Played by each team 
matches_played=match_data.groupby('Match_id')['TeamA'].value_counts().reset_index()

In [None]:
nd_=matches_played['TeamA'].value_counts().reset_index()

In [None]:
# Matches Played by each team 
matches_played_B=match_data.groupby('Match_id')['TeamB'].value_counts().reset_index()

In [None]:
matches_played_B.rename(columns={'TeamB':'TeamA'},inplace=True)

In [None]:
nd_1=matches_played_B['TeamA'].value_counts().reset_index()

In [None]:
matches_play=nd_.merge(nd_1,on='TeamA',how='outer')

In [None]:
# After the outer merge a team missing from one side has NaN for that count;
# treat it as zero matches. np.nan is used because the np.NaN alias was removed
# in NumPy 2.0.
matches_play.replace(np.nan,float(0),inplace=True)

In [None]:
matches_play['Match Played']=matches_play['count_x']+matches_play['count_y']

In [None]:
matches_play.drop(columns=['count_x','count_y'],inplace=True)

In [None]:
def handles(value):
    """Return ``value`` converted with the built-in ``int`` constructor."""
    as_integer = int(value)
    return as_integer

In [None]:
matches_play['Match Played']=matches_play['Match Played'].apply(handles)

In [None]:
matches_play.rename(columns={'TeamA':'Teams'},inplace=True)

In [None]:
average_run.rename(columns={'Playing':'Teams'},inplace=True)

In [None]:
strong=average_run.merge(matches_play,on='Teams')

In [None]:
strong['Average Score Per Innings']=strong['Runs']//strong['Match Played']

In [None]:
bowling_wickets.rename(columns={'bowlingTeam':'Teams'},inplace=True)

In [None]:
team=strong.merge(bowling_wickets,on='Teams')

In [None]:
bowling_economy.rename(columns={'bowlingTeam':'Teams'},inplace=True)

In [None]:
team=team.merge(bowling_economy,on='Teams')

In [None]:
team['Economy']=round(team['economy'],2)

In [None]:
team.drop(columns=['economy'],inplace=True)

In [None]:
new_.rename(columns={'Winner':'Teams'},inplace=True)

In [None]:
team=team.merge(new_,on='Teams',how='outer')

In [None]:
team.rename(columns={'count':'Wins'},inplace=True)

In [None]:
team['Wins']=team['Wins'].fillna(float(0))

In [None]:
team.dropna(inplace=True)

In [None]:
team

In [None]:
team=team.sort_values(by=['Average Score Per Innings','wickets'],ascending=False).reset_index()

In [None]:
team.drop(columns=['index'],inplace=True)

In [None]:
# Top 3 teams by average score per innings
plt.bar(team['Teams'][0:3].to_list(),team['Average Score Per Innings'][0:3].to_list())
plt.xlabel('Team name')
plt.ylabel('Average Score Per Innings')
plt.title('Team with top average score per innings')

In [None]:
# A lower economy is better for a bowling side, so pick the three teams with the
# smallest 'Economy' instead of reusing the batting-average ordering of `team`.
best_economy=team.nsmallest(3,'Economy')
plt.bar(best_economy['Teams'].to_list(),best_economy['Economy'].to_list())
plt.xlabel('Team name')
plt.ylabel('Economy')
plt.title('Team with Good Economy')

## Feature Engineering

In [None]:
# Let's predict how many runs a player would score in a match
batting=pd.read_csv('D:\\Cricket Data Analysis\\Data\\fact_bating_summary.csv')

In [None]:
# Keep only rows with a batting position below 8. Take an explicit copy so the
# later assignments to batting['SR'] write to this frame rather than to a slice
# of the original (avoids pandas' SettingWithCopyWarning).
batting=batting[batting['battingPos']<8].copy()

In [None]:
batting['SR']=batting['SR'].replace('-',0)

In [None]:
batting['SR']=batting['SR'].astype(float)

In [None]:
batting.select_dtypes(include=['int64','float64']).corr()

In [None]:
batting.columns

In [None]:
batting=batting.groupby('batsmanName')[['runs','4s','6s']].sum().reset_index()

In [None]:
batting.head()

## Model Development

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(batting[['4s','6s']],batting['runs'],test_size=0.2,random_state=42)

In [None]:
lr=LinearRegression()


In [None]:
lr.fit(xtrain,ytrain)

In [None]:
lr.coef_

In [None]:
lr.intercept_

In [None]:
yhat=lr.predict(xtest)

### Model Evaluation

In [None]:
sns.kdeplot(ytest,color='r')
sns.kdeplot(yhat,color='b')
plt.title('Comparision Between actual Values and Predicted values')

In [None]:
lr.score(xtrain,ytrain)

In [None]:
lr.score(xtest,ytest)

In [None]:
from sklearn.model_selection import cross_val_score
rcross=cross_val_score(lr,batting[['4s','6s']],batting['runs'],cv=4)

In [None]:
rcross

In [None]:
alpha=[{'alpha':[0.1,1,10,1000]}]


## Ridge Regression Model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
grid=GridSearchCV(Ridge(),param_grid=alpha)

In [None]:
scaler=StandardScaler()

In [None]:
grid.fit(scaler.fit_transform(xtrain),ytrain)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

In [None]:
grid.best_estimator_

In [None]:
predict=grid.predict(scaler.fit_transform(xtest))

### Model Evaluation

In [None]:
sns.kdeplot(ytest,color='r')
sns.kdeplot(predict,color='b')
plt.title('Comparision Between actual Values and Predicted values')

In [None]:
grid.score(scaler.fit_transform(xtest),ytest)

## Compare Models

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# For regression Model 
print(f'MSE : {round(mean_squared_error(yhat,ytest),2)} , R2 value : {round(lr.score(xtest,ytest),2)}')

In [None]:
# For Ridge Model
print(f'MSE : {round(mean_squared_error(ytest,predict),2)} , R2 value : {round(grid.score(scaler.fit_transform(xtest),ytest),2)}')

### Model Dumping

In [None]:
import joblib
# Persist the plain LinearRegression model to model.pkl.
with open('model.pkl','wb') as f :
    joblib.dump(lr,f)

In [None]:
# Persist the StandardScaler to scaler.pkl.
# NOTE(review): `lr` was fitted on the unscaled features, while this scaler was only
# used with the Ridge grid search — confirm which model/scaler pair is meant to be
# loaded together.
with open('scaler.pkl','wb') as f:
    joblib.dump(scaler,f)