# Final Notebook

Final notebook with the best model and features.

This notebook is the clean version of gomboski_notebok_features3.

In [1]:
import pandas as pd
import ast
import numpy as np

## Player Dataset

In [2]:
players_df = pd.read_csv('../raw_data/final_player_dataset.csv')

In [3]:
players_df.columns

Index(['player', 'bat_innings', 'total_runs', 'not_out', 'zero', '50s', '100s',
       'fours', 'sixes', 'high_score', 'balls_faced', 'average_score',
       'batting_average', 'batting_strike_rate', 'bowler_innings',
       'balls_bowled', 'runs_conceded', 'wickets', 'four_wickets',
       'five_wickets', 'bowling_average', 'bowling_economy_rate',
       'bowling_strike_rate', 'matches', 'wins', 'loses', 'win_ratio'],
      dtype='object')

## Team Players Columns

In [4]:
df = pd.read_csv('../raw_data/final_data.csv')

In [5]:
complete_df = pd.read_csv('../raw_data/complete_df.csv')

  complete_df = pd.read_csv('../raw_data/complete_df.csv')


In [6]:
complete_df_reduced = complete_df[['ID', 'Team1Players', 'Team2Players']]

In [7]:
# Convert strings in lists
complete_df['Team1Players'] = complete_df['Team1Players'].apply(lambda x: ast.literal_eval(x))
complete_df['Team2Players'] = complete_df['Team2Players'].apply(lambda x: ast.literal_eval(x))

In [8]:
# lower  case
complete_df['Team1Players'] = complete_df['Team1Players'].apply(lambda x: [name.lower() for name in x])
complete_df['Team2Players'] = complete_df['Team2Players'].apply(lambda x: [name.lower() for name in x])

In [9]:
complete_df_reduced = complete_df[['ID', 'Team1Players', 'Team2Players']]

In [10]:
complete_df_reduced = complete_df_reduced.drop_duplicates(subset='ID', keep='first')
complete_df_reduced = complete_df_reduced.reset_index(drop=True)

In [11]:
complete_df_reduced

Unnamed: 0,ID,Team1Players,Team2Players
0,1312200,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
1,1312199,"[v kohli, f du plessis, rm patidar, gj maxwell...","[ybk jaiswal, jc buttler, sv samson, d padikka..."
2,1312198,"[v kohli, f du plessis, rm patidar, gj maxwell...","[q de kock, kl rahul, m vohra, dj hooda, mp st..."
3,1312197,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
4,1304116,"[pk garg, abhishek sharma, ra tripathi, ak mar...","[jm bairstow, s dhawan, m shahrukh khan, ma ag..."
...,...,...,...
945,335986,"[wp saha, bb mccullum, rt ponting, sc ganguly,...","[ac gilchrist, y venugopal rao, vvs laxman, a ..."
946,335985,"[l ronchi, st jayasuriya, dj thornely, rv utha...","[s chanderpaul, r dravid, lrpl taylor, jh kall..."
947,335984,"[g gambhir, v sehwag, s dhawan, mk tiwary, kd ...","[t kohli, yk pathan, sr watson, m kaif, ds leh..."
948,335983,"[k goel, jr hopes, kc sangakkara, yuvraj singh...","[pa patel, ml hayden, mek hussey, ms dhoni, sk..."


## Merging Data

In [12]:
df = pd.merge(df, complete_df_reduced, on='ID', how='left')

In [13]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [14]:
df.head()

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,...,TeamA_batting_average,TeamB_batting_average,TeamA_innings_total,TeamB_innings_total,Team1_points_against_avg,Team2_points_against_avg,Team1_MVP_average,Team2_MVP_average,Team1Players,Team2Players
0,1312200,ahmedabad,2022-05-29,2022,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,...,155.397906,166.4375,130.0,133.0,151.561798,167.666667,0.5,0.6875,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
1,1312199,ahmedabad,2022-05-27,2022,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,...,155.707965,155.397906,157.0,161.0,146.976,162.184466,0.49115,0.5,"[v kohli, f du plessis, rm patidar, gj maxwell...","[ybk jaiswal, jc buttler, sv samson, d padikka..."
2,1312198,kolkata,2022-05-25,2022,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,...,155.707965,169.866667,207.0,193.0,146.976,176.285714,0.49115,0.6,"[v kohli, f du plessis, rm patidar, gj maxwell...","[q de kock, kl rahul, m vohra, dj hooda, mp st..."
3,1312197,kolkata,2022-05-24,2022,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,...,155.397906,166.4375,188.0,191.0,151.561798,167.666667,0.5,0.6875,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
4,1304116,mumbai,2022-05-22,2022,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,...,155.546053,158.518349,157.0,160.0,145.594203,164.537037,0.486842,0.428571,"[pk garg, abhishek sharma, ra tripathi, ak mar...","[jm bairstow, s dhawan, m shahrukh khan, ma ag..."


## Creation of New Features

### Data

In [15]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players'],
      dtype='object')

In [16]:
players_df.columns

Index(['player', 'bat_innings', 'total_runs', 'not_out', 'zero', '50s', '100s',
       'fours', 'sixes', 'high_score', 'balls_faced', 'average_score',
       'batting_average', 'batting_strike_rate', 'bowler_innings',
       'balls_bowled', 'runs_conceded', 'wickets', 'four_wickets',
       'five_wickets', 'bowling_average', 'bowling_economy_rate',
       'bowling_strike_rate', 'matches', 'wins', 'loses', 'win_ratio'],
      dtype='object')

In [17]:
df.Team1Players

0      [ybk jaiswal, jc buttler, sv samson, d padikka...
1      [v kohli, f du plessis, rm patidar, gj maxwell...
2      [v kohli, f du plessis, rm patidar, gj maxwell...
3      [ybk jaiswal, jc buttler, sv samson, d padikka...
4      [pk garg, abhishek sharma, ra tripathi, ak mar...
                             ...                        
945    [wp saha, bb mccullum, rt ponting, sc ganguly,...
946    [l ronchi, st jayasuriya, dj thornely, rv utha...
947    [g gambhir, v sehwag, s dhawan, mk tiwary, kd ...
948    [k goel, jr hopes, kc sangakkara, yuvraj singh...
949    [r dravid, w jaffer, v kohli, jh kallis, cl wh...
Name: Team1Players, Length: 950, dtype: object

In [18]:
df.Team2Players

0      [wp saha, shubman gill, ms wade, hh pandya, da...
1      [ybk jaiswal, jc buttler, sv samson, d padikka...
2      [q de kock, kl rahul, m vohra, dj hooda, mp st...
3      [wp saha, shubman gill, ms wade, hh pandya, da...
4      [jm bairstow, s dhawan, m shahrukh khan, ma ag...
                             ...                        
945    [ac gilchrist, y venugopal rao, vvs laxman, a ...
946    [s chanderpaul, r dravid, lrpl taylor, jh kall...
947    [t kohli, yk pathan, sr watson, m kaif, ds leh...
948    [pa patel, ml hayden, mek hussey, ms dhoni, sk...
949    [sc ganguly, bb mccullum, rt ponting, dj husse...
Name: Team2Players, Length: 950, dtype: object

### Batting Features

In [19]:
def weighted_average_score(players_list, players_df):
    # Filters the player information in the DataFrame based on the names of the players in the list
    player_info = players_df[players_df['player'].isin(players_list)]

    # Calculates the weight for each player based on the average_score (for example, using the square root function)

    #square root function: 
    player_info['weight'] = np.sqrt(player_info['average_score'])

    # Calculates the weighted average using the weights
    weighted_avg = (players_df['average_score'] * player_info['weight']).sum() / player_info['weight'].sum()
    
    return weighted_avg

# Applies the function to calculate Team1 and Team2 weighted average
df['Avg_Weighted_Score_Team1'] = df['Team1Players'].apply(lambda players: weighted_average_score(players, players_df))
df['Avg_Weighted_Score_Team2'] = df['Team2Players'].apply(lambda players: weighted_average_score(players, players_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying t

In [20]:
def weighted_batting_average(players_list, players_df):
    
    player_info = players_df[players_df['player'].isin(players_list)]

    player_info['weight'] = np.sqrt(player_info['average_score'])

    weighted_avg = (players_df['batting_average'] * player_info['weight']).sum() / player_info['weight'].sum()
    
    return weighted_avg

df['batting_average_PlayersTeam1_weighted'] = df['Team1Players'].apply(lambda players: weighted_batting_average(players, players_df))
df['batting_average_PlayersTeam2_weighted'] = df['Team2Players'].apply(lambda players: weighted_batting_average(players, players_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying t

In [21]:
def weighted_batting_strike_rate(players_list, players_df):
    
    player_info = players_df[players_df['player'].isin(players_list)]

    player_info['weight'] = np.sqrt(player_info['average_score'])

    weighted_avg = (players_df['batting_strike_rate'] * player_info['weight']).sum() / player_info['weight'].sum()
    
    return weighted_avg

df['batting_strike_rate_PlayersTeam1_weighted'] = df['Team1Players'].apply(lambda players: weighted_batting_strike_rate(players, players_df))
df['batting_strike_rate_PlayersTeam2_weighted'] = df['Team2Players'].apply(lambda players: weighted_batting_strike_rate(players, players_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_info['weight'] = np.sqrt(player_info['average_score'])
A value is trying t

### Bowling Features

In [22]:
df['bowling_average_PlayersTeam1'] = df['Team1Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['bowling_average'].mean())
df['bowling_average_PlayersTeam2'] = df['Team2Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['bowling_average'].mean())

In [23]:
df['bowling_economy_rate_PlayersTeam1'] = df['Team1Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['bowling_economy_rate'].mean())
df['bowling_economy_rate_PlayersTeam2'] = df['Team2Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['bowling_economy_rate'].mean())

In [24]:
df['bowling_strike_rate_PlayersTeam1'] = df['Team1Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['bowling_strike_rate'].mean())
df['bowling_strike_rate_PlayersTeam2'] = df['Team2Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['bowling_strike_rate'].mean())

In [25]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players', 'Avg_Weighted_Score_Team1',
       'Avg_Weighted_Score_Team2', 'batting_average_PlayersTeam1_weighted',
       'batting_average_PlayersTeam2_weighted',
       'batting_strike_rate_PlayersTeam1_weighted',
       'batting_strike_rate_PlayersTeam2_weighted',
       'bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2',
       'bowling_economy_rate_PlayersTeam1',
       'bowling_economy_rate_PlayersTeam2', 'bowling_strike_rate_PlayersTeam1',
       'bowling_strike_rate_PlayersTeam2'],
      dtype='object')

### Win Ratio Feature

In [26]:
df['win_ratio_PlayersTeam1'] = df['Team1Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['win_ratio'].mean())
df['win_ratio_PlayersTeam2'] = df['Team2Players'].apply(lambda players: players_df[players_df['player'].isin(players)]['win_ratio'].mean())

In [27]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players', 'Avg_Weighted_Score_Team1',
       'Avg_Weighted_Score_Team2', 'batting_average_PlayersTeam1_weighted',
       'batting_average_PlayersTeam2_weighted',
       'batting_strike_rate_PlayersTeam1_weighted',
       'batting_strike_rate_PlayersTeam2_weighted',
       'bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2',
       'bowling_economy_rate_PlayersTeam1',
       'bowling_economy_rate_PlayersTeam2', 'bowling_strike_rate_PlayersTeam1',
       'bowling_strike_rate_PlayersTeam2', 'win_ratio_PlayersTeam1',
       'win_ratio_PlayersTeam2'],
      dtype='object')

### Differences Team1 and Team2

In [28]:
df['Avg_Weighted_Score_diff'] = df['Avg_Weighted_Score_Team1'] - df['Avg_Weighted_Score_Team2']

In [29]:
df['batting_average_weighted_diff'] = df['batting_average_PlayersTeam1_weighted'] - df['batting_average_PlayersTeam2_weighted']

In [30]:
df['batting_strike_rate_weighted_diff'] = df['batting_strike_rate_PlayersTeam1_weighted'] - df['batting_strike_rate_PlayersTeam2_weighted']

In [31]:
df['bowling_average_diff'] = df['bowling_average_PlayersTeam1'] - df['bowling_average_PlayersTeam2']

In [32]:
df['bowling_economy_rate_diff'] = df['bowling_economy_rate_PlayersTeam1'] - df['bowling_economy_rate_PlayersTeam2']

In [33]:
df['bowling_strike_rate_diff'] = df['bowling_strike_rate_PlayersTeam1'] - df['bowling_strike_rate_PlayersTeam2']

In [34]:
df['win_ratio_diff'] = df['win_ratio_PlayersTeam1'] - df['win_ratio_PlayersTeam2']

## Preprocessing

In [35]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [36]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players', 'Avg_Weighted_Score_Team1',
       'Avg_Weighted_Score_Team2', 'batting_average_PlayersTeam1_weighted',
       'batting_average_PlayersTeam2_weighted',
       'batting_strike_rate_PlayersTeam1_weighted',
       'batting_strike_rate_PlayersTeam2_weighted',
       'bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2',
       'bowling_economy_rate_PlayersTeam1',
       'bowling_economy_rate_PlayersTeam2', 'bowling_strike_rate_PlayersTeam1',
       'bowling_strike_rate_PlayersTeam2', 'win_ratio_PlayersTeam1',
       'win_ratio_PlayersTeam2', 'Avg_Weighted_Score_diff',
       'batting_

### Rename Columns

In [37]:
# rename columns TeamA and TeamB to Team1 and Team2
df = df.rename({'TeamA_batting_average': 'Team1_batting_average',
           'TeamB_batting_average': 'Team2_batting_average',
           'TeamA_innings_total': 'Team1_innings_total',
           'TeamB_innings_total' : 'Team2_innings_total'
          }, axis=1)

In [38]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'Team1_batting_average', 'Team2_batting_average', 'Team1_innings_total',
       'Team2_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players', 'Avg_Weighted_Score_Team1',
       'Avg_Weighted_Score_Team2', 'batting_average_PlayersTeam1_weighted',
       'batting_average_PlayersTeam2_weighted',
       'batting_strike_rate_PlayersTeam1_weighted',
       'batting_strike_rate_PlayersTeam2_weighted',
       'bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2',
       'bowling_economy_rate_PlayersTeam1',
       'bowling_economy_rate_PlayersTeam2', 'bowling_strike_rate_PlayersTeam1',
       'bowling_strike_rate_PlayersTeam2', 'win_ratio_PlayersTeam1',
       'win_ratio_PlayersTeam2', 'Avg_Weighted_Score_diff',
       'batting_

### Drop Columns

In [39]:
columns_to_delete = ['ID', 'Date', 'innings_total', 'Team1_innings_total', 'Team2_innings_total', 'Team1Players', 'Team2Players',
                    'Team1_batting_average', 'Team2_batting_average', 'Team1_points_against_avg', 'Team2_points_against_avg',
                    'Team1_MVP_average', 'Team2_MVP_average', 'Avg_Weighted_Score_Team1', 'Avg_Weighted_Score_Team2',
                    'batting_average_PlayersTeam1_weighted', 'batting_average_PlayersTeam2_weighted',
                    'batting_strike_rate_PlayersTeam1_weighted', 'batting_strike_rate_PlayersTeam2_weighted',
                    'bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2', 'bowling_economy_rate_PlayersTeam1',
                    'bowling_economy_rate_PlayersTeam2', 'bowling_strike_rate_PlayersTeam1',
                    'bowling_strike_rate_PlayersTeam2', 'win_ratio_PlayersTeam1', 'win_ratio_PlayersTeam2', 'bowling_strike_rate_diff',
                    'MatchNumber']

In [40]:
df.drop(columns=columns_to_delete, axis=1, inplace=True)

In [41]:
df.columns

Index(['City', 'Season', 'Team1', 'Team2', 'Venue', 'TossWinner',
       'TossDecision', 'WinningTeam', 'Avg_Weighted_Score_diff',
       'batting_average_weighted_diff', 'batting_strike_rate_weighted_diff',
       'bowling_average_diff', 'bowling_economy_rate_diff', 'win_ratio_diff'],
      dtype='object')

### Encoding

#### Toss Decision

In [42]:
# model
ohe = OneHotEncoder(sparse_output=False)

In [43]:
ohe.fit(df[['TossDecision']])

In [44]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['TossDecision']])

In [45]:
df.drop(columns = ["TossDecision"], inplace = True)

#### City

In [46]:
# model
ohe = OneHotEncoder(sparse_output=False)

In [47]:
ohe.fit(df[['City']])

In [48]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['City']])

In [49]:
df.drop(columns = ["City"], inplace = True)

#### Winning Team

In [50]:
# WinningTeam = 1 --> Team1 Won

In [51]:
# function
def map_winning_team(row):
    if row['WinningTeam'] == row['Team1']:
        return 1
    elif row['WinningTeam'] == row['Team2']:
        return 0
    else:
        return -1

In [52]:
df['WinningTeam'] = df.apply(map_winning_team, axis=1)

In [53]:
# drop rows with Winning Team = -1
df = df.drop(df[df['WinningTeam'] == -1].index)

#### Venue

In [54]:
ohe = OneHotEncoder(sparse_output=False)

In [55]:
ohe.fit(df[['Venue']])

In [56]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['Venue']])

In [57]:
df.drop(columns = ["Venue"], inplace = True)

#### Toss Winner

In [58]:
# Toss Winner --> 0 = away team ; 1 = home team

In [59]:
def map_toss_winner(row):
    if row['TossWinner'] == row['Team1']:
        return 1
    elif row['TossWinner'] == row['Team2']:
        return 0
    else:
        return -1

In [60]:
df['TossWinner'] = df.apply(map_toss_winner, axis=1)

#### Team1 and Team2

In [61]:
#hot encoder -- team 1
ohe = OneHotEncoder(sparse_output=False)

In [62]:
ohe.fit(df[['Team1']])

In [63]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['Team1']])

In [64]:
df.drop(columns = ["Team1"], inplace = True)

In [65]:
# team2
ohe = OneHotEncoder(sparse_output=False)

In [66]:
ohe.fit(df[['Team2']])

In [67]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])

  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])


In [68]:
df.drop(columns = ["Team2"], inplace = True)

#### Season

In [69]:
label_encoder = LabelEncoder()

In [70]:
df['Season'] = label_encoder.fit_transform(df['Season'])

### Scaler

In [71]:
df.columns

Index(['Season', 'TossWinner', 'WinningTeam', 'Avg_Weighted_Score_diff',
       'batting_average_weighted_diff', 'batting_strike_rate_weighted_diff',
       'bowling_average_diff', 'bowling_economy_rate_diff', 'win_ratio_diff',
       'TossDecision_bat',
       ...
       'Team2_kochi tuskers kerala', 'Team2_kolkata knight riders',
       'Team2_lucknow super giants', 'Team2_mumbai indians',
       'Team2_pune warriors', 'Team2_punjab kings', 'Team2_rajasthan royals',
       'Team2_rising pune supergiant', 'Team2_royal challengers bangalore',
       'Team2_sunrisers hyderabad'],
      dtype='object', length=110)

In [72]:
columns_scaler = ['Avg_Weighted_Score_diff', 'batting_average_weighted_diff', 
                  'batting_strike_rate_weighted_diff', 'bowling_average_diff', 'bowling_economy_rate_diff',
                  'win_ratio_diff']

In [73]:
# Robust Scaler
scaler = RobustScaler()

In [74]:
# fit
scaler.fit(df[columns_scaler])

In [75]:
df[columns_scaler] = scaler.transform(df[columns_scaler]) 

## XG Boost Model

In [76]:
import xgboost as xgb

In [77]:
X = df.drop('WinningTeam', axis=1) 
y = df['WinningTeam']

In [78]:
# split data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
# model
grid = {'max_depth': 5,
       'base_score':0.5,
       'booster': 'gblinear',
        'gamma' : 0.1,
        'n_estimators': 150,
        'learning_rate': 0.1,
        'reg_lambda': 0.3,
       }


model = xgb.XGBClassifier(**grid)

In [80]:
# fit the model
model.fit(X_train, y_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Parameters: { "gamma", "max_depth" } are not used.



In [81]:
y_pred = model.predict(X_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [82]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Best model Accuracy: {accuracy}")

Best model Accuracy: 0.6736842105263158
