In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [129]:
prev_wc_df = pd.read_csv('./wc_stats.csv')
odi_results_df = pd.read_csv('./odi_results.csv')

In [130]:
prev_wc_df.head()

Unnamed: 0,Team,Group,Previous \nappearances,Previous \r\ntitles,Previous\r\n finals,Previous\r\n semifinals,Current \r rank
0,England,A,12,1,4,6,7
1,South Africa,A,7,0,0,5,2
2,Netherlands,A,5,0,0,0,10
3,Pakistan,A,12,1,2,6,5
4,New Zealand,A,12,0,2,9,4


In [131]:
odi_results_df.head()

Unnamed: 0,date,Team_1,Team_2,Winner,Margin,Ground
0,"Jan 12, 2011",South Africa,India,South Africa,135 runs,Durban
1,"Jan 15, 2011",South Africa,India,India,1 run,Johannesburg
2,"Jan 16, 2011",Australia,England,Australia,6 wickets,Melbourne
3,"Jan 18, 2011",South Africa,India,India,2 wickets,Cape Town
4,"Jan 21, 2011",Australia,England,Australia,46 runs,Hobart


In [132]:
# Restricting to the teams in the 2023 World Cup.
worldcup_teams = ['England', 'South Africa', 'Netherlands', 'Pakistan', 'New Zealand', 'Sri Lanka', 'Afghanistan', 'Australia', 'Bangladesh', 'India']

results_team_1 = odi_results_df.loc[odi_results_df['Team_1'].isin(worldcup_teams)]
results_team_2 = odi_results_df.loc[odi_results_df['Team_2'].isin(worldcup_teams)]

results_df = pd.concat([results_team_1, results_team_2])

In [133]:
new_results_df = results_df.drop(['date','Margin', 'Ground'], axis=1)
new_results_df

Unnamed: 0,Team_1,Team_2,Winner
0,South Africa,India,South Africa
1,South Africa,India,India
2,Australia,England,Australia
3,South Africa,India,India
4,Australia,England,Australia
...,...,...,...
1572,India,Australia,India
1573,Bangladesh,New Zealand,New Zealand
1575,India,Australia,India
1576,Bangladesh,New Zealand,New Zealand


In [134]:
# Convert 'Team_1' and 'Team_2' from categorical variables to continuous inputs
final_encoded_df = pd.concat([pd.get_dummies(new_results_df['Team_1'], prefix='Team_1'),
                           pd.get_dummies(new_results_df['Team_2'], prefix='Team_2'),
                           new_results_df.drop(['Team_1', 'Team_2'], axis=1)], axis=1)


X = final_encoded_df.drop(['Winner'], axis=1)
y = final_encoded_df['Winner']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [135]:
final_encoded_df.head()

Unnamed: 0,Team_1_Afghanistan,Team_1_Australia,Team_1_Bangladesh,Team_1_Canada,Team_1_England,Team_1_Hong Kong,Team_1_India,Team_1_Ireland,Team_1_Kenya,Team_1_Nepal,...,Team_2_Oman,Team_2_Pakistan,Team_2_Scotland,Team_2_South Africa,Team_2_Sri Lanka,Team_2_U.A.E.,Team_2_U.S.A.,Team_2_West Indies,Team_2_Zimbabwe,Winner
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,South Africa
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,India
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Australia
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,India
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Australia


In [136]:
# GridSearch

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    # Add other parameters you want to tune
}

rf = RandomForestClassifier(random_state=0)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': 10, 'n_estimators': 50}


In [137]:
# Get the best estimator from the grid search
best_rf = grid_search.best_estimator_

train_score = best_rf.score(X_train, y_train)
print("Training Set Accuracy:", train_score)
test_score = best_rf.score(X_test, y_test)
print("Test Set Accuracy:", test_score)


Training Set Accuracy: 0.6672672672672673
Test Set Accuracy: 0.645083932853717


In [138]:
# Team with higher ICC ranking will be favourite to win a match against team with lower ICC ranking

ranking_df = pd.read_csv('./icc_rankings.csv') 
fixtures_df = pd.read_csv('./group_stage.csv')

In [139]:
# Adding ICC ranks for each team in the fixtures df

# Merge fixtures_df with ranking_df for 'Team_1' positions
fixtures_df = fixtures_df.merge(ranking_df[['Team', 'Position']], left_on='Team_1', right_on='Team', how='left')
fixtures_df.rename(columns={'Position': 'first_position'}, inplace=True)
fixtures_df.drop('Team', axis=1, inplace=True)

# Merge fixtures_df with ranking_df for 'Team_2' positions
fixtures_df = fixtures_df.merge(ranking_df[['Team', 'Position']], left_on='Team_2', right_on='Team', how='left')
fixtures_df.rename(columns={'Position': 'second_position'}, inplace=True)
fixtures_df.drop('Team', axis=1, inplace=True)


In [140]:
fixtures_df.tail()

Unnamed: 0,Round Number,Date,Location,Team_1,Team_2,Group,Result,first_position,second_position
40,1,"Nov 9, 2023",Bengaluru,New Zealand,Sri Lanka,Group A,,5.0,7.0
41,1,"Nov 10, 2023",Ahmedabad,Afghanistan,South Africa,Group A,,9.0,3.0
42,1,"Nov 11, 2023",Pune,Australia,Bangladesh,Group A,,2.0,8.0
43,1,"Nov 11, 2023",Eden Gardens,England,Pakistan,Group A,,6.0,4.0
44,1,"Nov 12, 2023",Bengaluru,India,Netherlands,Group A,,1.0,


In [141]:
grp_stage = []

# Loop to add teams to group stage dataset based on the ranking position of each team
for index, row in fixtures_df.iterrows():
    if row['first_position'] < row['second_position']:
        grp_stage.append({'Team_1': row['Team_1'], 'Team_2': row['Team_2'], 'winning_team': None})
    else:
        grp_stage.append({'Team_1': row['Team_2'], 'Team_2': row['Team_1'], 'winning_team': None})
        
grp_stage = pd.DataFrame(grp_stage)
backup_pred_set = grp_stage
grp_stage.head()

Unnamed: 0,Team_1,Team_2,winning_team
0,New Zealand,England,
1,Pakistan,Netherlands,
2,Bangladesh,Afghanistan,
3,South Africa,Sri Lanka,
4,India,Australia,


In [142]:
# Convert categorical variables to continuous inputs
grp_stage_encoded = pd.concat([pd.get_dummies(grp_stage['Team_1'], prefix='Team_1'),
                               pd.get_dummies(grp_stage['Team_2'], prefix='Team_2'),
                               grp_stage.drop(['Team_1', 'Team_2'], axis=1)], axis=1)

# Make sure that that group stage df and training set df has same columns
missing_cols = set(final_encoded_df.columns) - set(grp_stage.columns)
for c in missing_cols:
    grp_stage[c] = 0
grp_stage = grp_stage[final_encoded_df.columns]

grp_stage = grp_stage.drop(['Winner'], axis=1)

In [143]:
# Making the points table for 2023 World Cup

points_table_df = pd.DataFrame(columns=['team', 'matches_played', 'won', 'lost', 'points'])
points_table_df.set_index('team', inplace=True)

for team in worldcup_teams:
    points_table_df.loc[team] = [0, 0, 0, 0]
points_table_df

Unnamed: 0_level_0,matches_played,won,lost,points
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
England,0,0,0,0
South Africa,0,0,0,0
Netherlands,0,0,0,0
Pakistan,0,0,0,0
New Zealand,0,0,0,0
Sri Lanka,0,0,0,0
Afghanistan,0,0,0,0
Australia,0,0,0,0
Bangladesh,0,0,0,0
India,0,0,0,0


In [144]:
# Predicting group match results and updating points table

predictions = best_rf.predict(grp_stage)
for i in range(fixtures_df.shape[0]):
    team1 = backup_pred_set.iloc[i, 1]
    team2 = backup_pred_set.iloc[i, 0]

    points_table_df.loc[team1, 'matches_played'] += 1
    points_table_df.loc[team2, 'matches_played'] += 1
    
    print(team1 + " vs " + team2)
    if predictions[i] == 1:
        print("Winner: " + team1)
        points_table_df.loc[team1, 'won'] += 1
        points_table_df.loc[team2, 'lost'] += 1
        points_table_df.loc[team1, 'points'] += 2
    else:
        print("Winner: " + team2)
        points_table_df.loc[team2, 'won'] += 1
        points_table_df.loc[team1, 'lost'] += 1
        points_table_df.loc[team2, 'points'] += 2

    # print(points_table_df)
    print("")

England vs New Zealand
Winner: New Zealand

Netherlands vs Pakistan
Winner: Pakistan

Afghanistan vs Bangladesh
Winner: Bangladesh

Sri Lanka vs South Africa
Winner: South Africa

Australia vs India
Winner: India

Netherlands vs New Zealand
Winner: New Zealand

Bangladesh vs England
Winner: England

Sri Lanka vs Pakistan
Winner: Pakistan

Afghanistan vs India
Winner: India

South Africa vs Australia
Winner: Australia

Bangladesh vs New Zealand
Winner: New Zealand

Pakistan vs India
Winner: India

Afghanistan vs England
Winner: England

Sri Lanka vs Australia
Winner: Australia

Netherlands vs South Africa
Winner: South Africa

Afghanistan vs New Zealand
Winner: New Zealand

Bangladesh vs India
Winner: India

Pakistan vs Australia
Winner: Australia

Netherlands vs Sri Lanka
Winner: Sri Lanka

England vs South Africa
Winner: South Africa

New Zealand vs India
Winner: India

Afghanistan vs Pakistan
Winner: Pakistan

Bangladesh vs South Africa
Winner: South Africa

Australia vs Netherlands


In [145]:
points_table_df = points_table_df.sort_values(by='points', ascending=False)
print(points_table_df)

              matches_played  won  lost  points
team                                           
India                      9    8     1      16
South Africa               9    7     2      14
Australia                  9    7     2      14
Pakistan                   9    6     3      12
Netherlands                9    5     4      10
New Zealand                9    5     4      10
England                    9    3     6       6
Sri Lanka                  9    3     6       6
Bangladesh                 9    1     8       2
Afghanistan                9    0     9       0


In [146]:
# Getting the teams for semi finals
top_4_teams = points_table_df.head(4)
top_4_team_names = top_4_teams.index.tolist()

print(top_4_team_names)

['India', 'South Africa', 'Australia', 'Pakistan']


In [147]:
semi_finals = [(top_4_team_names[0], top_4_team_names[3]),
               (top_4_team_names[1], top_4_team_names[2])]

In [148]:
def predict_finals(matches, ranking_df, final_encoded_df, best_rf):

    pred = []
    for match in matches:
        dict = {}
        
        team1 = match[0]
        team2 = match[1]

        team1_pos = ranking_df.loc[ranking_df['Team'] == team1, 'Position'].iloc[0]
        team2_pos = ranking_df.loc[ranking_df['Team'] == team2, 'Position'].iloc[0]

        if team1_pos < team2_pos:
            dict['Team_1'] = team1
            dict['Team_2'] = team2
        else:
            dict['Team_1'] = team2
            dict['Team_2'] = team1

        pred.append(dict)

    # print(pred)

    pred = pd.DataFrame(pred)
    backup_pred_set = pred

    pred = pd.concat([pd.get_dummies(pred['Team_1'], prefix='Team_1'),
                    pd.get_dummies(pred['Team_2'], prefix='Team_2')], axis=1)
    

    # Make sure that that pred df and training set df has same columns
    missing_cols2 = set(final_encoded_df.columns) - set(pred.columns)
    for c in missing_cols2:
        pred[c] = 0
    pred = pred[final_encoded_df.columns]

    pred = pred.drop(['Winner'], axis=1)

    
    if len(pred) == 2 :
        print("---------SEMI FINALS-----------\n")
    else :
        print("----------FINALS---------\n")

    
    
    predictions = best_rf.predict(pred)
    winners = []
    for i in range(len(pred)):
        team1 = backup_pred_set.iloc[i, 1]
        team2 = backup_pred_set.iloc[i, 0]

        print(str(team1) + " vs " + str(team2))

        if predictions[i] == 1:
            print("Winner: " + str(team1))
            winners.append(str(team1))
        else:
            print("Winner: " + str(team2))
            winners.append(str(team2))
        print("")

    if len(winners) == 2 :
        return [(winners[0], winners[1])]
    else :
        return winners[0]

In [149]:
# Results of semi finals
finals = predict_finals(semi_finals, ranking_df, final_encoded_df, best_rf)

---------SEMI FINALS-----------

Pakistan vs India
Winner: India

South Africa vs Australia
Winner: Australia



In [150]:
# Teams in finals

print(finals)

[('India', 'Australia')]


In [151]:
# Result of the finals

wc_winner = predict_finals(finals, ranking_df, final_encoded_df, best_rf)

print(f"{wc_winner} will win the 2023 Cricket World Cup")

----------FINALS---------

Australia vs India
Winner: India

India will win the 2023 Cricket World Cup


In [152]:
import pickle
pickle_out = open("classifier.pkl","wb")
pickle.dump(wc_winner, pickle_out)

pickle_out.close()

In [153]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
Top4 = [top_4_team_names[0] , top_4_team_names[1]  , top_4_team_names[2]  , top_4_team_names[3] ]
batsmen = pd.read_csv('batsman_data.csv')
bowler_df = pd.read_csv('bowlers_data.csv')
print(batsmen)

                Player    Country              Opponent  Matches  Innings  \
0          Virat Kohli      India           Afghanistan        3        2   
1          Virat Kohli      India             Australia       48       46   
2          Virat Kohli      India            Bangladesh       16       16   
3          Virat Kohli      India               England       36       36   
4          Virat Kohli      India               Ireland        2        2   
...                ...        ...                   ...      ...      ...   
1119  Charith Asalanka  Sri Lanka          South Africa        4        4   
1120  Charith Asalanka  Sri Lanka  United Arab Emirates        1        1   
1121  Charith Asalanka  Sri Lanka           West Indies        1        0   
1122  Charith Asalanka  Sri Lanka              Zimbabwe        4        3   
1123  Charith Asalanka  Sri Lanka          Overall (15)       50       44   

      Not_Out  Hundreds  Fifties  Runs  Average  
0           1         0  

#### Here we are filtering all the batsman and bowlers from top 4 teams who have qualified for semi finals.

In [154]:

batsmen= batsmen[batsmen['Opponent'].isin(Top4)]
batsmen= batsmen[batsmen['Country'].isin(Top4)]
bowler_df= bowler_df[bowler_df['Opponent'].isin(Top4)]
bowler_df= bowler_df[bowler_df['Country'].isin(Top4)]

#### We have to convert string data to numeric value

In [155]:
batsmen['Average'] = pd.to_numeric(batsmen['Average'], errors='coerce')
batsmen['Matches'] = pd.to_numeric(batsmen['Matches'], errors='coerce')
batsmen['Innings'] = pd.to_numeric(batsmen['Innings'], errors='coerce')
batsmen['Hundreds'] = pd.to_numeric(batsmen['Hundreds'], errors='coerce')
batsmen['Fifties'] = pd.to_numeric(batsmen['Fifties'], errors='coerce')
batsmen['Runs'] = pd.to_numeric(batsmen['Runs'], errors='coerce')

In [156]:
from sklearn.preprocessing import LabelEncoder
label_encoder_players = LabelEncoder()
batsmen['Encoded_Player'] = label_encoder_players.fit_transform(batsmen['Player'])
label_encoder_teams = LabelEncoder()
label_encoder_teams.fit(Top4)
batsmen['Encoded_Country'] = label_encoder_teams.fit_transform(batsmen['Country'])
batsmen['Encoded_Opponent'] = label_encoder_teams.fit_transform(batsmen['Opponent'])  

#### We have considered that a team will have 5 batsman, 1 all-rounder and 5 bowlers.

In [157]:
Train_df = batsmen[['Encoded_Player' , 'Encoded_Country' , 'Encoded_Opponent' , 'Matches' , 'Runs']]
print(Train_df)

     Encoded_Player  Encoded_Country  Encoded_Opponent  Matches  Runs
1                32                1                 0       48  2313
8                32                1                 2       16   678
9                32                1                 3       31  1504
16               22                1                 0       44  2332
24               22                1                 2       19   873
..              ...              ...               ...      ...   ...
472              25                2                 1        8    60
476              25                2                 3       10   145
482              17                2                 0        2    15
485              17                2                 1        3    19
490              17                2                 3        2    28

[95 rows x 5 columns]


In [158]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [159]:
X = Train_df[['Encoded_Player' , 'Encoded_Country' , 'Encoded_Opponent' , 'Matches']]
y = Train_df[['Runs']]
X['Matches'] = pd.to_numeric(X['Matches'], errors='coerce')
y['Runs'] = pd.to_numeric(y['Runs'], errors='coerce')

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [161]:
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=0)
y_pred_dnn = model.predict(X_test)
r2_test = r2_score(y_test, y_pred_dnn)
print(f'R-squared on Test Set: {r2_test}')

R-squared on Test Set: 0.843822467201534


In [162]:
player_country = batsmen[['Player' , 'Encoded_Player' , 'Country' , 'Encoded_Country']]
player_country.drop_duplicates(inplace=True)
print(player_country)

                    Player  Encoded_Player       Country  Encoded_Country
1              Virat Kohli              32         India                1
16            Rohit Sharma              22         India                1
31            Shubman Gill              27         India                1
44            Shreyas Iyer              26         India                1
55        Suryakumar Yadav              29         India                1
65           Hardik Pandya              10         India                1
76         Ravindra Jadeja              20         India                1
91     Ravichandran Ashwin              19         India                1
104           Temba Bavuma              30  South Africa                3
115        Reeza Hendricks              21  South Africa                3
125          Aiden Markram               2  South Africa                3
138           David Miller               6  South Africa                3
152  Rassie van der Dussen            

In [163]:
# A helper function to get the encoded_country

def get_encoded_country(input_country):
    
    for index, row in player_country.iterrows():
        if(row['Country']==input_country) :
            return row['Encoded_Country']
    return 'Error'

#### We are calculating the total runs scored by a player against other 3 semi finals teams.

In [164]:
player_runs = {}
for index , row in player_country.iterrows() :
    player_runs[row['Player']] = 0

cnt = 0
for opponent in Top4 :
    for index , row in player_country.iterrows() :
        if(row['Country']!=opponent) :  
            
            runs = model.predict([[row['Encoded_Player'], row['Encoded_Country'], get_encoded_country(opponent), 1]])
            player_runs[row['Player']]+= (int(runs[0][0])) 



In [165]:
def get_player_country(Name) :
    for index , row in player_country.iterrows() :
        if(row['Player'] == Name):
            return row['Country']
    return 'None'

# Here we are storing the batsmans of each team.
IndianBatsman = []

from collections import OrderedDict
sorted_list = OrderedDict(sorted(player_runs.items(), key=lambda item: item[1], reverse=True))

print(top_4_team_names[0] + ' Batsman Potential Scorers')
print("")
for pl in sorted_list :
    if(get_player_country(pl)==top_4_team_names[0]):
        IndianBatsman.append(pl)
        print(f'{pl} : {sorted_list[pl]}')

India Batsman Potential Scorers

Virat Kohli : 189
Suryakumar Yadav : 170
Shubman Gill : 158
Shreyas Iyer : 152
Rohit Sharma : 126
Ravindra Jadeja : 114
Ravichandran Ashwin : 108
Hardik Pandya : 58


In [166]:
SouthAfricanBatsman = []

print(top_4_team_names[1] + ' Batsman Potential Scorers')
print("")
for pl in sorted_list :
    if(get_player_country(pl)==top_4_team_names[1] ):
        SouthAfricanBatsman.append(pl);
        print(f'{pl} : {sorted_list[pl]}')

South Africa Batsman Potential Scorers

Temba Bavuma : 139
Reeza Hendricks : 87
Rassie van der Dussen : 69
Marco Jansen : 49
Aiden Markram : 38
Andile Phehlukwayo : 32
David Miller : 30


In [167]:
AustraliaBatsman = []

print(top_4_team_names[2] + ' Batsman Potential Scorers')
print("")
for pl in sorted_list :
    if(get_player_country(pl)==top_4_team_names[2] ):
        AustraliaBatsman.append(pl)
        print(f'{pl} : {sorted_list[pl]}')

Australia Batsman Potential Scorers

Travis Head : 203
Steven Smith : 184
Sean Abbot : 159
Mitchel Marsh : 107
Marnus Labuschagne : 101
Marcus Stoinis : 95
Glenn Maxwell : 67
David Warner : 57
Cameroon Green : 50


In [168]:
# Getting Top Batsmen of Pakistan
PakistanBatsman = []

print(top_4_team_names[3] + ' Batsman Potential Scorers')
print("")
for pl in sorted_list :
    if(get_player_country(pl)==top_4_team_names[3] ):
        PakistanBatsman.append(pl)
        print(f'{pl} : {sorted_list[pl]}')

Pakistan Batsman Potential Scorers

Shadab Khan : 126
Saud Shakeel : 114
Mohammad Nawaz : 79
Abdullah Shafique : 59
Imam-ul-Haq : 53
Iftikhar Ahmed : 50
Agha Salman : 47
Fakhar Zaman : 38
Babar Azam : 32


In [169]:
bowler_df['Matches'] = pd.to_numeric(bowler_df['Matches'] , errors = 'coerce')
bowler_df['Overs'] = pd.to_numeric(bowler_df['Overs'] , errors = 'coerce')
bowler_df['Runs'] = pd.to_numeric(bowler_df['Runs'] , errors = 'coerce')
bowler_df['Maidens'] = pd.to_numeric(bowler_df['Maidens'] , errors = 'coerce')
bowler_df['Wickets'] = pd.to_numeric(bowler_df['Wickets'] , errors = 'coerce')

In [170]:
label_encoder_bowlers = LabelEncoder()
bowler_df['Encoded_Player'] = label_encoder_players.fit_transform(bowler_df['Player'])

In [171]:
bowler_df['Encoded_Country'] = label_encoder_teams.fit_transform(bowler_df['Country'])
bowler_df['Encoded_Opponent'] = label_encoder_teams.fit_transform(bowler_df['Opponent'])

In [172]:
Bowler_Train_df = bowler_df[['Encoded_Player' , 'Encoded_Country' , 'Encoded_Opponent' , 'Matches' , 'Wickets']]
print(Bowler_Train_df)

     Encoded_Player  Encoded_Country  Encoded_Opponent  Matches  Wickets
297              11                1                 0        5        3
303              11                1                 2        3        1
304              11                1                 3        6        8
310               1                1                 0       20       28
315               1                1                 2        8        7
316               1                1                 3       12       17
322               5                1                 0       21       31
329               5                1                 2        6       12
330               5                1                 3       11       26
336               8                1                 0       24       38
343               8                1                 2        3        5
344               8                1                 3        5       13
350               9                1               

In [173]:
X = Bowler_Train_df[['Encoded_Player' , 'Encoded_Country' , 'Encoded_Opponent' , 'Matches']]
y = Bowler_Train_df[['Wickets']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [174]:
model2 = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model2.compile(optimizer='adam', loss='mean_squared_error')
model2.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=0)
y_pred_dnn = model2.predict(X_test)
r2_test = r2_score(y_test, y_pred_dnn)
print(f'R-squared on Test Set: {r2_test}')

R-squared on Test Set: 0.9420674093030058


In [175]:
bowler_country = bowler_df[['Player' , 'Encoded_Player' , 'Country' , 'Encoded_Country']]
bowler_country.drop_duplicates(inplace=True)
print(bowler_country)

             Player  Encoded_Player       Country  Encoded_Country
297  Shardul Thakur              11         India                1
310  Jasprit Bumrah               1         India                1
322   Kuldeep Yadav               5         India                1
336  Mohammad Shami               8         India                1
350  Mohammad Siraj               9         India                1
363  Keshav Maharaj               4  South Africa                2
374     Luigi Ngidi               6  South Africa                2
387   Kagiso Rabada               3  South Africa                2
399  Tabraiz Shamsi              12  South Africa                2
414     Pat Cummins              10     Australia                0
427      Adam Zampa               0     Australia                0
440  Mitchell Starc               7     Australia                0
454  Josh Hazlewood               2     Australia                0


In [176]:
# A helper function to get the encoded_country

def get_bowler_country(Name):
    
    for index, row in bowler_country.iterrows():
        if(row['Player']==Name) :
            return row['Country']
    return 'Error'

In [177]:
player_wickets = {}
for index , row in bowler_country.iterrows() :
    player_wickets[row['Player']] = 0

for opponent in Top4 :
    for index , row in bowler_country.iterrows() :
        if(row['Country']!=opponent) :  
            
            wickets = model2.predict([[row['Encoded_Player'], row['Encoded_Country'], get_encoded_country(opponent), 1]])
            player_wickets[row['Player']]+= (int(wickets[0][0]))    



#### Now we are predicting wicket takers of each 4 top teams.

In [178]:
sorted_bowlers = OrderedDict(sorted(player_wickets.items(), key=lambda item: item[1], reverse=True))

# Getting top wicket takers of India
IndianBowlers = []

print(top_4_team_names[0] + " Potential wicket takers") 
# print('Player : Potential Wickets in playoffs')

for pl in sorted_bowlers :
    if(get_bowler_country(pl)==top_4_team_names[0]):
        IndianBowlers.append(pl)
        print(f'{pl} : {sorted_bowlers[pl]}')

India Potential wicket takers
Shardul Thakur : 8
Mohammad Shami : 7
Mohammad Siraj : 7
Jasprit Bumrah : 6
Kuldeep Yadav : 5


In [179]:
sorted_bowlers = OrderedDict(sorted(player_wickets.items(), key=lambda item: item[1], reverse=True))

# Getting top wicket takers of India
SouthAfricaBowlers = []

print(top_4_team_names[1] + " Potential wicket takers") 
# print('Player : Potential Wickets in playoffs')

for pl in sorted_bowlers :
    if(get_bowler_country(pl)==top_4_team_names[1]):
        SouthAfricaBowlers.append(pl)
        print(f'{pl} : {sorted_bowlers[pl]}')

South Africa Potential wicket takers
Tabraiz Shamsi : 9
Luigi Ngidi : 6
Keshav Maharaj : 5
Kagiso Rabada : 4


In [180]:
sorted_bowlers = OrderedDict(sorted(player_wickets.items(), key=lambda item: item[1], reverse=True))

# Getting top wicket takers of India
AustraliaBowlers = []

print(top_4_team_names[2] + " Potential wicket takers") 
# print('Player : Potential Wickets in playoffs')

for pl in sorted_bowlers :
    if(get_bowler_country(pl)==top_4_team_names[2]):
        AustraliaBowlers.append(pl)
        print(f'{pl} : {sorted_bowlers[pl]}')

Australia Potential wicket takers
Pat Cummins : 7
Adam Zampa : 7
Mitchell Starc : 6
Josh Hazlewood : 6


In [181]:
sorted_bowlers = OrderedDict(sorted(player_wickets.items(), key=lambda item: item[1], reverse=True))

# Getting top wicket takers of India


print(top_4_team_names[3] + " Potential wicket takers") 
# print('Player : Potential Wickets in playoffs')

for pl in sorted_bowlers :
    if(get_bowler_country(pl)==top_4_team_names[3]):
        print(f'{pl} : {sorted_bowlers[pl]}')

Pakistan Potential wicket takers


#### This are the playing 11 team players of finalist teams.

In [182]:
if(finals[0][0]=='India'):
    print('----------------Team India-----------------')
    cnt=0
    for i in range(len(IndianBatsman)):
        cnt+=1
        print(IndianBatsman[i])
        if(cnt>5):
            break
    
    for i in range(len(IndianBowlers)):
        print(IndianBowlers[i])

print("\n")

if(finals[0][1]=='Australia'):
    print('----------------Team Australia-----------------')
    cnt=0
    for i in range(len(AustraliaBatsman)):
        cnt+=1
        print(AustraliaBatsman[i])
        if(cnt>6):
            break
    
    for i in range(len(AustraliaBowlers)):
        print(AustraliaBowlers[i])

----------------Team India-----------------
Virat Kohli
Suryakumar Yadav
Shubman Gill
Shreyas Iyer
Rohit Sharma
Ravindra Jadeja
Shardul Thakur
Mohammad Shami
Mohammad Siraj
Jasprit Bumrah
Kuldeep Yadav


----------------Team Australia-----------------
Travis Head
Steven Smith
Sean Abbot
Mitchel Marsh
Marnus Labuschagne
Marcus Stoinis
Glenn Maxwell
Pat Cummins
Adam Zampa
Mitchell Starc
Josh Hazlewood
