In [32]:
# Importing Libraries
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [33]:
spark = SparkSession.builder.master("local").appName('Ops').getOrCreate()

In [35]:
#path = "E:/Rutgers/Projects/MDSR/IPL-MSDR"
path = "/Users/nidhiharwani/Desktop/Most_Valuable_Player_Prediction_using_IPL_Dataset"

# Model Trainning

In [36]:
# Reading data
matches = spark.read.csv(path + '/dataset/clean_data/matches.csv',inferSchema=True,header=True)
deliveries = spark.read.csv(path + '/dataset/clean_data/deliveries.csv',inferSchema=True,header=True)
# Creating temporary tables of the data
matches.registerTempTable('matches_db')
deliveries.registerTempTable('deliveries_db')

In [37]:
# Reading data
matches = pd.read_csv(path + '/dataset/clean_data/matches.csv')
deliveries = pd.read_csv(path + '/dataset/clean_data/deliveries.csv')

In [38]:
encode =  {'team1': {'Mumbai Indians':1,'Kolkata Knight Riders':2,'Royal Challengers Bangalore':3,
                     'Deccan Chargers':4,'Chennai Super Kings':5,'Rajasthan Royals':6,'Delhi Daredevils':7,
                     'Gujarat Lions':8,'Kings XI Punjab':9,'Sunrisers Hyderabad':10,'Rising Pune Supergiant':11,
                     'Kochi Tuskers Kerala':12,'Pune Warriors':13, 'Delhi Capitals':14},
           'team2': {'Mumbai Indians':1,'Kolkata Knight Riders':2,'Royal Challengers Bangalore':3,
                     'Deccan Chargers':4,'Chennai Super Kings':5,'Rajasthan Royals':6,'Delhi Daredevils':7,
                     'Gujarat Lions':8,'Kings XI Punjab':9,'Sunrisers Hyderabad':10,'Rising Pune Supergiant':11,
                     'Kochi Tuskers Kerala':12,'Pune Warriors':13, 'Delhi Capitals':14},
           'toss_winner': {'Mumbai Indians':1,'Kolkata Knight Riders':2,'Royal Challengers Bangalore':3,
                           'Deccan Chargers':4,'Chennai Super Kings':5,'Rajasthan Royals':6,'Delhi Daredevils':7,
                           'Gujarat Lions':8,'Kings XI Punjab':9,'Sunrisers Hyderabad':10,'Rising Pune Supergiant':11,
                           'Kochi Tuskers Kerala':12,'Pune Warriors':13, 'Delhi Capitals':14},
           'winner': {'Mumbai Indians':1,'Kolkata Knight Riders':2,'Royal Challengers Bangalore':3,'Deccan Chargers':4,
                      'Chennai Super Kings':5,'Rajasthan Royals':6,'Delhi Daredevils':7,'Gujarat Lions':8,
                      'Kings XI Punjab':9,'Sunrisers Hyderabad':10,'Rising Pune Supergiant':11,'Kochi Tuskers Kerala':12,
                      'Pune Warriors':13, 'Delhi Capitals':14, 'Draw':15}}
matches.replace(encode, inplace=True)

In [39]:
# Checking the encoding result
matches.head(2)

Unnamed: 0.1,Unnamed: 0,id,season,city,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue
0,0,1,2017,Hyderabad,10,3,3,field,normal,0,10,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal"
1,1,2,2017,Pune,1,11,11,field,normal,0,11,0,7,SPD Smith,Maharashtra Cricket Association Stadium


In [40]:
matches = matches[['team1','team2','city','toss_decision','toss_winner','venue','winner','season']]
df = pd.DataFrame(matches)

In [41]:
label = LabelEncoder()
for _ in ['city','toss_decision','venue']:
    df[_] = label.fit_transform(df[_])
df.dtypes

team1            int64
team2            int64
city             int64
toss_decision    int64
toss_winner      int64
venue            int64
winner           int64
season           int64
dtype: object

In [42]:
# Apply RandomForest
model = RandomForestClassifier(n_estimators=100, max_depth=10)
outcome = ['winner']
predictors = ['team1', 'team2', 'venue', 'toss_winner','city','toss_decision']

model.fit(df[predictors], df[outcome].values.ravel())
predictions = model.predict(df[predictors])
accuracy = metrics.accuracy_score(predictions,df[outcome])
print('Accuracy : {0:.3}%'.format(100*accuracy))

kf = KFold(n_splits=10)
error = []
for train, test in kf.split(df):
    train_predictors = (df[predictors].iloc[train,:])
    train_target = df[outcome].iloc[train]
    model.fit(train_predictors, train_target.values.ravel())
    error.append(model.score(df[predictors].iloc[test,:], df[outcome].iloc[test]))

print('Cross-Validation Score : {0:.3%}'.format(np.mean(error)))
model.fit(df[predictors],df[outcome].values.ravel()) 

Accuracy : 85.1%
Cross-Validation Score : 51.325%


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Creating Player list based on teams player per season

In [43]:
players = spark.sql('select m.id, m.season, d.batting_team, d.bowling_team, d.batsman, d.non_striker, \
                     d.bowler from matches_db m \
                     full outer join deliveries_db d \
                     on m.id=d.match_id \
                     order by m.id')
players.toPandas().to_csv(path + '/dataset/players.csv')

In [44]:
players = pd.read_csv(path + '/dataset/players.csv')
player = set()
for i in range(len(players['id'])):
    player.add((players['season'][i],players['batting_team'][i],players['batsman'][i]))
    player.add((players['season'][i],players['batting_team'][i],players['non_striker'][i]))
    player.add((players['season'][i],players['bowling_team'][i],players['bowler'][i]))
pd.DataFrame(list(player), columns =['season', 'team', 'player']).to_csv(path + '/dataset/players.csv')

In [45]:
players = spark.read.csv(path + '/dataset/players.csv',inferSchema=True,header=True)
total_weight = spark.read.csv(path + '/dataset/weights_data/player_weights.csv',inferSchema=True,header=True)

In [46]:
players.registerTempTable('players')
players.show(10)

+---+------+--------------------+------------+
|_c0|season|                team|      player|
+---+------+--------------------+------------+
|  0|  2018| Sunrisers Hyderabad|      R Bhui|
|  1|  2017|       Gujarat Lions|Ishan Kishan|
|  2|  2010|     Kings XI Punjab|    MS Bisla|
|  3|  2009|Royal Challengers...|   MK Pandey|
|  4|  2013|     Kings XI Punjab|    BA Bhatt|
|  5|  2015|      Mumbai Indians|   AT Rayudu|
|  6|  2017|    Delhi Daredevils| CJ Anderson|
|  7|  2014|      Mumbai Indians|   AT Rayudu|
|  8|  2014| Chennai Super Kings| BB McCullum|
|  9|  2012|      Mumbai Indians|    AN Ahmed|
+---+------+--------------------+------------+
only showing top 10 rows



In [47]:
total_weight.registerTempTable('player_weight')
total_weight.show(10)

+---+-----------------+--------------------+--------------------+------------+
|_c0|           Player|Total_Batting_Weight|Total_Bowling_Weight|Total_Weight|
+---+-----------------+--------------------+--------------------+------------+
|  0|        CH Morris|             4.60416|             3.74057|     8.34473|
|  1|      MF Maharoof|             4.02916|             4.26415|     8.29331|
|  2|      Rashid Khan|             4.14583|             4.12265|     8.26848|
|  3|    Mohammad Nabi|             4.12916|             3.77831|     7.90747|
|  4|        KK Cooper|             3.88334|              3.9953|     7.87864|
|  5|        SP Narine|             3.53749|             4.33491|      7.8724|
|  6|           AJ Tye|             3.69167|             4.08491|     7.77658|
|  7|   A Ashish Reddy|               4.125|             3.51887|     7.64387|
|  8|         M Morkel|             4.04165|             3.58491|     7.62656|
|  9|Washington Sundar|             4.38332|        

In [48]:
team_weight = spark.sql('select * from players p \
                         left join player_weight pw \
                         on p.player = pw.player')
team_weight.registerTempTable('players')

In [49]:
team_weight = spark.sql('select team as Team, season as Season, round(sum(total_batting_weight),5) as Total_Batting_Weight,\
                         round(sum(total_bowling_weight),5) as Total_Bowling_Weight,\
                         round(sum(total_weight),5) as Team_Weight from players \
                         group by team, season \
                         order by season')
team_weight.show(10)
team_weight = team_weight.toPandas()

+--------------------+------+--------------------+--------------------+-----------+
|                Team|Season|Total_Batting_Weight|Total_Bowling_Weight|Team_Weight|
+--------------------+------+--------------------+--------------------+-----------+
|      Mumbai Indians|  2008|            30.03333|            22.85379|   52.88712|
|Kolkata Knight Ri...|  2008|            25.53335|            25.24534|   50.77869|
|     Deccan Chargers|  2008|            24.44164|            22.05191|   46.49355|
|     Kings XI Punjab|  2008|            29.49998|            17.25475|   46.75473|
| Chennai Super Kings|  2008|            29.90831|            14.13682|   44.04513|
|    Rajasthan Royals|  2008|            21.07916|             25.6038|   46.68296|
|    Delhi Daredevils|  2008|            30.54582|            21.67454|   52.22036|
|Royal Challengers...|  2008|            29.63333|            24.30194|   53.93527|
| Chennai Super Kings|  2009|            30.60834|            17.74531|   48

In [50]:
# Converting to csv 
team_weight.to_csv(path + '/dataset/weights_data/team_weights.csv')

## Prediction of Winner

In [51]:
encode = {'Team': {'Mumbai Indians':1,'Kolkata Knight Riders':2,'Royal Challengers Bangalore':3,
                   'Deccan Chargers':4,'Chennai Super Kings':5,'Rajasthan Royals':6,'Delhi Daredevils':7,
                   'Gujarat Lions':8,'Kings XI Punjab':9,'Sunrisers Hyderabad':10,'Rising Pune Supergiant':11,
                   'Kochi Tuskers Kerala':12,'Pune Warriors':13, 'Delhi Capitals':14}}
team_weight.replace(encode, inplace=True)
team_weight.head(10)
team_weight.to_csv(path + '/dataset/weights_data/new_team_weights.csv')

In [52]:
new_team_weight = spark.createDataFrame(df)
new_team_weight.registerTempTable('new_team_weight')
new_team_weight.show(10)

+-----+-----+----+-------------+-----------+-----+------+------+
|team1|team2|city|toss_decision|toss_winner|venue|winner|season|
+-----+-----+----+-------------+-----------+-----+------+------+
|   10|    3|  15|            1|          3|   28|    10|  2017|
|    1|   11|  27|            1|         11|   21|    11|  2017|
|    8|    2|  29|            1|          2|   31|     2|  2017|
|   11|    9|  16|            1|          9|   13|     9|  2017|
|    3|    7|   2|            0|          3|   17|     3|  2017|
|    8|   10|  15|            1|         10|   28|    10|  2017|
|    2|    1|  24|            1|          1|   40|     1|  2017|
|    3|    9|  16|            0|          3|   13|     9|  2017|
|    7|   11|  27|            1|         11|   21|     7|  2017|
|   10|    1|  24|            1|          1|   40|     1|  2017|
+-----+-----+----+-------------+-----------+-----+------+------+
only showing top 10 rows



In [53]:
tw = spark.read.csv(path + '/dataset/weights_data/new_team_weights.csv',inferSchema=True,header=True)
tw.registerTempTable('team_weight1')
tw.show(10)

+---+----+------+--------------------+--------------------+-----------+
|_c0|Team|Season|Total_Batting_Weight|Total_Bowling_Weight|Team_Weight|
+---+----+------+--------------------+--------------------+-----------+
|  0|   5|  2008|            29.90831|            14.13682|   44.04513|
|  1|   7|  2008|            30.54582|            21.67454|   52.22036|
|  2|   9|  2008|            29.49998|            17.25475|   46.75473|
|  3|   4|  2008|            24.44164|            22.05191|   46.49355|
|  4|   6|  2008|            21.07916|             25.6038|   46.68296|
|  5|   1|  2008|            30.03333|            22.85379|   52.88712|
|  6|   2|  2008|            25.53335|            25.24534|   50.77869|
|  7|   3|  2008|            29.63333|            24.30194|   53.93527|
|  8|   1|  2009|            28.94165|            27.09908|   56.04073|
|  9|   2|  2009|            30.11254|            29.24064|   59.35318|
+---+----+------+--------------------+--------------------+-----

In [54]:
training_data = spark.sql('select t2.*,t3.Total_Batting_Weight as team2_batting_wt, \
                           t3.Total_Bowling_Weight as team2_bowling_wt, t3.Team_Weight as team2_merged_wt \
                           from (select ntw.*,t.Total_Batting_Weight as team1_batting_wt, \
                           t.Total_Bowling_Weight as team1_bowling_wt, \
                           t.Team_Weight as team1_merged_wt from new_team_weight ntw \
                           inner join team_weight1 t \
                           on ntw.team1 = t.team where ntw.season = t.season) t2 \
                           inner join team_weight1 t3 \
                           on t2.team2 = t3.team where t2.season = t3.season')
training_data = training_data.toPandas()
training_data.head(10)

Unnamed: 0,team1,team2,city,toss_decision,toss_winner,venue,winner,season,team1_batting_wt,team1_bowling_wt,team1_merged_wt,team2_batting_wt,team2_bowling_wt,team2_merged_wt
0,10,3,15,1,3,28,10,2017,43.56249,30.13685,73.69934,41.03332,28.08966,69.12298
1,1,11,27,1,11,21,11,2017,50.30838,43.52833,93.83671,39.79583,21.217,61.01283
2,8,2,29,1,2,31,2,2017,39.31667,33.93399,73.25066,34.49166,26.5944,61.08606
3,11,9,16,1,9,13,9,2017,39.79583,21.217,61.01283,40.41663,23.96229,64.37892
4,3,7,2,0,3,17,3,2017,41.03332,28.08966,69.12298,36.47501,26.20757,62.68258
5,8,10,15,1,10,28,10,2017,39.31667,33.93399,73.25066,43.56249,30.13685,73.69934
6,2,1,24,1,1,40,1,2017,34.49166,26.5944,61.08606,50.30838,43.52833,93.83671
7,3,9,16,0,3,13,9,2017,41.03332,28.08966,69.12298,40.41663,23.96229,64.37892
8,7,11,27,1,11,21,7,2017,36.47501,26.20757,62.68258,39.79583,21.217,61.01283
9,10,1,24,1,1,40,1,2017,43.56249,30.13685,73.69934,50.30838,43.52833,93.83671


In [55]:
# Applying RandomForest with more features
model = RandomForestClassifier(n_estimators=100)
outcome_var = ['winner']
predictor_var = ['team1','team2','toss_winner','team1_batting_wt','team1_bowling_wt','team1_merged_wt','team2_batting_wt','team2_bowling_wt','team2_merged_wt']

model.fit(training_data[predictors], training_data[outcome].values.ravel())
predictions = model.predict(training_data[predictors])
accuracy = metrics.accuracy_score(predictions,training_data[outcome])
print('Accuracy : {0:.3}%'.format(100*accuracy))

kf = KFold(n_splits=7)
error = []
for train, test in kf.split(training_data):
    train_predictors = (training_data[predictors].iloc[train,:])
    train_target = training_data[outcome].iloc[train]
    model.fit(train_predictors, train_target.values.ravel())
    error.append(model.score(training_data[predictors].iloc[test,:], training_data[outcome].iloc[test]))

print('Cross-Validation Score : {0:.3%}'.format(np.mean(error)))
model.fit(training_data[predictors],training_data[outcome].values.ravel())

Accuracy : 88.0%
Cross-Validation Score : 50.135%


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
team_encoding =  {'Mumbai Indians':1,'Kolkata Knight Riders':2,'Royal Challengers Bangalore':3,'Deccan Chargers':4,
                  'Chennai Super Kings':5,'Rajasthan Royals':6,'Delhi Daredevils':7,'Gujarat Lions':8,
                  'Kings XI Punjab':9,'Sunrisers Hyderabad':10,'Rising Pune Supergiant':11,'Kochi Tuskers Kerala':12,
                  'Pune Warriors':13, 'Delhi Capitals':14, 'Draw':15}

#### Testing Prediction

In [60]:
team1='Sunrisers Hyderabad'
team2='Royal Challengers Bangalore'
toss_winner='Royal Challengers Bangalore'
input=[team_encoding[team1],team_encoding[team2],'28',team_encoding[toss_winner],'15','1']
input = np.array(input).reshape((1, -1))
output=model.predict(input)
print('Winning team is: ')
print(list(team_encoding.keys())[list(team_encoding.values()).index(output)])

Winning team is: 
Sunrisers Hyderabad


In [61]:
team1='Gujarat Lions'
team2='Kolkata Knight Riders'
toss_winner='Kolkata Knight Riders'
input=[team_encoding[team1],team_encoding[team2],'31',team_encoding[toss_winner],'29','1']
input = np.array(input).reshape((1, -1))
output=model.predict(input)
print('Winning team is: ')
print(list(team_encoding.keys())[list(team_encoding.values()).index(output)])

Winning team is: 
Kolkata Knight Riders
