# Final Notebook

Final notebook with the best model and features.

This notebook is the clean version of gomboski_notebok_features3.

In [72]:
import pandas as pd
import ast
import numpy as np

## Player Dataset

In [73]:
# Per-player statistics table; later cells look players up by the 'player' column.
players_df = pd.read_csv(filepath_or_buffer='../raw_data/final_player_dataset.csv')

In [74]:
players_df.columns

Index(['player', 'bat_innings', 'total_runs', 'not_out', 'zero', '50s', '100s',
       'fours', 'sixes', 'high_score', 'balls_faced', 'average_score',
       'batting_average', 'batting_strike_rate', 'bowler_innings',
       'balls_bowled', 'runs_conceded', 'wickets', 'four_wickets',
       'five_wickets', 'bowling_average', 'bowling_economy_rate',
       'bowling_strike_rate', 'matches', 'wins', 'loses', 'win_ratio'],
      dtype='object')

## Team Players Columns

In [75]:
# Match-level table (one row per match ID) that the features are attached to.
df = pd.read_csv(filepath_or_buffer='../raw_data/final_data.csv')

In [76]:
df.columns

Index(['Unnamed: 0', 'ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1',
       'Team2', 'Venue', 'TossWinner', 'TossDecision', 'WinningTeam',
       'innings_total', 'TeamA_batting_average', 'TeamB_batting_average',
       'TeamA_innings_total', 'TeamB_innings_total',
       'Team1_points_against_avg', 'Team2_points_against_avg',
       'Team1_MVP_average', 'Team2_MVP_average'],
      dtype='object')

In [77]:
# Ball-by-ball table; only the per-match squad lists are taken from it below.
# low_memory=False makes pandas read the file in one pass instead of chunked type inference.
complete_df = pd.read_csv(filepath_or_buffer='../raw_data/complete_df.csv', low_memory=False)

In [78]:
complete_df.columns

Index(['ID', 'innings', 'overs', 'ballnumber', 'batter', 'bowler',
       'non-striker', 'extra_type', 'batsman_run', 'extras_run', 'total_run',
       'non_boundary', 'isWicketDelivery', 'player_out', 'kind',
       'fielders_involved', 'BattingTeam', 'innings_total', 'City', 'Date',
       'Season', 'MatchNumber', 'Team1', 'Team2', 'Venue', 'TossWinner',
       'TossDecision', 'SuperOver', 'WinningTeam', 'WonBy', 'Margin', 'method',
       'Player_of_Match', 'Team1Players', 'Team2Players', 'Umpire1',
       'Umpire2'],
      dtype='object')

In [80]:
df.City.unique()

array(['ahmedabad', 'kolkata', 'mumbai', 'navi mumbai', 'pune', 'dubai',
       'sharjah', 'abu dhabi', 'delhi', 'chennai', 'hyderabad',
       'visakhapatnam', 'chandigarh', 'bengaluru', 'jaipur', 'indore',
       'bangalore', 'kanpur', 'rajkot', 'raipur', 'ranchi', 'cuttack',
       'dharamsala', 'kochi', 'nagpur', 'johannesburg', 'centurion',
       'durban', 'bloemfontein', 'port elizabeth', 'kimberley',
       'east london', 'cape town'], dtype=object)

In [8]:
def _parse_player_list(raw):
    """Return a squad as a list of lower-cased player names.

    `raw` is normally the string form of a Python list as stored in the CSV
    (e.g. "['V Kohli', 'F du Plessis']"). Values that are already lists are
    accepted unchanged so this cell can be re-run safely.
    """
    names = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return [name.lower() for name in names]


# Build one row per match with the two parsed squads.
# Rows are de-duplicated on ID *before* parsing, so only one string per match
# is evaluated instead of one per delivery. `complete_df` itself is not
# modified, which keeps this cell idempotent.
complete_df_reduced = (
    complete_df[['ID', 'Team1Players', 'Team2Players']]
    .drop_duplicates(subset='ID', keep='first')
    .reset_index(drop=True)
    .assign(
        Team1Players=lambda frame: frame['Team1Players'].map(_parse_player_list),
        Team2Players=lambda frame: frame['Team2Players'].map(_parse_player_list),
    )
)

In [9]:
complete_df_reduced

Unnamed: 0,ID,Team1Players,Team2Players
0,1312200,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
1,1312199,"[v kohli, f du plessis, rm patidar, gj maxwell...","[ybk jaiswal, jc buttler, sv samson, d padikka..."
2,1312198,"[v kohli, f du plessis, rm patidar, gj maxwell...","[q de kock, kl rahul, m vohra, dj hooda, mp st..."
3,1312197,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
4,1304116,"[pk garg, abhishek sharma, ra tripathi, ak mar...","[jm bairstow, s dhawan, m shahrukh khan, ma ag..."
...,...,...,...
945,335986,"[wp saha, bb mccullum, rt ponting, sc ganguly,...","[ac gilchrist, y venugopal rao, vvs laxman, a ..."
946,335985,"[l ronchi, st jayasuriya, dj thornely, rv utha...","[s chanderpaul, r dravid, lrpl taylor, jh kall..."
947,335984,"[g gambhir, v sehwag, s dhawan, mk tiwary, kd ...","[t kohli, yk pathan, sr watson, m kaif, ds leh..."
948,335983,"[k goel, jr hopes, kc sangakkara, yuvraj singh...","[pa patel, ml hayden, mek hussey, ms dhoni, sk..."


## Merging Data

In [10]:
# Attach the two squad lists to every match; a left join keeps all match rows.
df = df.merge(complete_df_reduced, how='left', on='ID')

In [11]:
# 'Unnamed: 0' is the index column that was written into the CSV; it is not a feature.
df = df.drop(columns='Unnamed: 0')

In [12]:
df.shape

(950, 22)

In [13]:
df.head()

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,...,TeamA_batting_average,TeamB_batting_average,TeamA_innings_total,TeamB_innings_total,Team1_points_against_avg,Team2_points_against_avg,Team1_MVP_average,Team2_MVP_average,Team1Players,Team2Players
0,1312200,ahmedabad,2022-05-29,2022,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,...,155.397906,166.4375,130.0,133.0,151.561798,167.666667,0.5,0.6875,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
1,1312199,ahmedabad,2022-05-27,2022,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,...,155.707965,155.397906,157.0,161.0,146.976,162.184466,0.49115,0.5,"[v kohli, f du plessis, rm patidar, gj maxwell...","[ybk jaiswal, jc buttler, sv samson, d padikka..."
2,1312198,kolkata,2022-05-25,2022,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,...,155.707965,169.866667,207.0,193.0,146.976,176.285714,0.49115,0.6,"[v kohli, f du plessis, rm patidar, gj maxwell...","[q de kock, kl rahul, m vohra, dj hooda, mp st..."
3,1312197,kolkata,2022-05-24,2022,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,...,155.397906,166.4375,188.0,191.0,151.561798,167.666667,0.5,0.6875,"[ybk jaiswal, jc buttler, sv samson, d padikka...","[wp saha, shubman gill, ms wade, hh pandya, da..."
4,1304116,mumbai,2022-05-22,2022,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,...,155.546053,158.518349,157.0,160.0,145.594203,164.537037,0.486842,0.428571,"[pk garg, abhishek sharma, ra tripathi, ak mar...","[jm bairstow, s dhawan, m shahrukh khan, ma ag..."


## Creation of New Features

### Data

In [14]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players'],
      dtype='object')

In [15]:
players_df.columns

Index(['player', 'bat_innings', 'total_runs', 'not_out', 'zero', '50s', '100s',
       'fours', 'sixes', 'high_score', 'balls_faced', 'average_score',
       'batting_average', 'batting_strike_rate', 'bowler_innings',
       'balls_bowled', 'runs_conceded', 'wickets', 'four_wickets',
       'five_wickets', 'bowling_average', 'bowling_economy_rate',
       'bowling_strike_rate', 'matches', 'wins', 'loses', 'win_ratio'],
      dtype='object')

In [16]:
df.Team1Players

0      [ybk jaiswal, jc buttler, sv samson, d padikka...
1      [v kohli, f du plessis, rm patidar, gj maxwell...
2      [v kohli, f du plessis, rm patidar, gj maxwell...
3      [ybk jaiswal, jc buttler, sv samson, d padikka...
4      [pk garg, abhishek sharma, ra tripathi, ak mar...
                             ...                        
945    [wp saha, bb mccullum, rt ponting, sc ganguly,...
946    [l ronchi, st jayasuriya, dj thornely, rv utha...
947    [g gambhir, v sehwag, s dhawan, mk tiwary, kd ...
948    [k goel, jr hopes, kc sangakkara, yuvraj singh...
949    [r dravid, w jaffer, v kohli, jh kallis, cl wh...
Name: Team1Players, Length: 950, dtype: object

In [17]:
df.Team2Players

0      [wp saha, shubman gill, ms wade, hh pandya, da...
1      [ybk jaiswal, jc buttler, sv samson, d padikka...
2      [q de kock, kl rahul, m vohra, dj hooda, mp st...
3      [wp saha, shubman gill, ms wade, hh pandya, da...
4      [jm bairstow, s dhawan, m shahrukh khan, ma ag...
                             ...                        
945    [ac gilchrist, y venugopal rao, vvs laxman, a ...
946    [s chanderpaul, r dravid, lrpl taylor, jh kall...
947    [t kohli, yk pathan, sr watson, m kaif, ds leh...
948    [pa patel, ml hayden, mek hussey, ms dhoni, sk...
949    [sc ganguly, bb mccullum, rt ponting, dj husse...
Name: Team2Players, Length: 950, dtype: object

### Batting Features

In [18]:
def weighted_average_score(players_list, players_df):
    """Weighted mean of `average_score` over the players of one squad.

    Each player's weight is the square root of their own `average_score`.

    Parameters
    ----------
    players_list : list of str
        Player names to look up in ``players_df['player']``.
    players_df : pandas.DataFrame
        Table with at least the columns 'player' and 'average_score'.

    Returns
    -------
    float
        The weighted mean, or NaN when no listed player is found or the
        usable weights sum to zero.
    """
    # Work only on the squad's rows. Values and weights come from the same
    # filtered frame, so the result does not depend on index alignment with
    # the full table (which goes wrong when the index is not unique).
    player_info = players_df.loc[players_df['player'].isin(players_list), ['average_score']]
    values = player_info['average_score']

    # Square-root weighting
    weights = np.sqrt(values)

    # Use only players for whom both the value and the weight are defined
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan

    return (values[usable] * weights[usable]).sum() / total_weight

# Weighted average score of each squad, computed match by match
df['Avg_Weighted_Score_Team1'] = df['Team1Players'].apply(weighted_average_score, args=(players_df,))
df['Avg_Weighted_Score_Team2'] = df['Team2Players'].apply(weighted_average_score, args=(players_df,))

def weighted_batting_average(players_list, players_df):
    """Weighted mean of `batting_average` over the players of one squad.

    Each player's weight is the square root of their `average_score`.

    Parameters
    ----------
    players_list : list of str
        Player names to look up in ``players_df['player']``.
    players_df : pandas.DataFrame
        Table with at least the columns 'player', 'average_score' and
        'batting_average'.

    Returns
    -------
    float
        The weighted mean, or NaN when no listed player is found or the
        usable weights sum to zero.
    """
    # Values and weights are taken from the same filtered rows, so nothing
    # depends on index alignment with the full table.
    player_info = players_df.loc[
        players_df['player'].isin(players_list), ['average_score', 'batting_average']
    ]
    values = player_info['batting_average']
    weights = np.sqrt(player_info['average_score'])

    # A player with a missing batting average must not contribute weight to
    # the denominator either, otherwise the mean is biased towards zero.
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan

    return (values[usable] * weights[usable]).sum() / total_weight

# Weighted batting average of each squad, computed match by match
df['batting_average_PlayersTeam1_weighted'] = df['Team1Players'].apply(weighted_batting_average, args=(players_df,))
df['batting_average_PlayersTeam2_weighted'] = df['Team2Players'].apply(weighted_batting_average, args=(players_df,))


def weighted_batting_strike_rate(players_list, players_df):
    """Weighted mean of `batting_strike_rate` over the players of one squad.

    Each player's weight is the square root of their `average_score`.

    Parameters
    ----------
    players_list : list of str
        Player names to look up in ``players_df['player']``.
    players_df : pandas.DataFrame
        Table with at least the columns 'player', 'average_score' and
        'batting_strike_rate'.

    Returns
    -------
    float
        The weighted mean, or NaN when no listed player is found or the
        usable weights sum to zero.
    """
    # Values and weights are taken from the same filtered rows, so nothing
    # depends on index alignment with the full table.
    player_info = players_df.loc[
        players_df['player'].isin(players_list), ['average_score', 'batting_strike_rate']
    ]
    values = player_info['batting_strike_rate']
    weights = np.sqrt(player_info['average_score'])

    # A player with a missing strike rate must not contribute weight to the
    # denominator either, otherwise the mean is biased towards zero.
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan

    return (values[usable] * weights[usable]).sum() / total_weight

# Weighted batting strike rate of each squad, computed match by match
df['batting_strike_rate_PlayersTeam1_weighted'] = df['Team1Players'].apply(weighted_batting_strike_rate, args=(players_df,))
df['batting_strike_rate_PlayersTeam2_weighted'] = df['Team2Players'].apply(weighted_batting_strike_rate, args=(players_df,))
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players', 'Avg_Weighted_Score_Team1',
       'Avg_Weighted_Score_Team2', 'batting_average_PlayersTeam1_weighted',
       'batting_average_PlayersTeam2_weighted',
       'batting_strike_rate_PlayersTeam1_weighted',
       'batting_strike_rate_PlayersTeam2_weighted'],
      dtype='object')

### Bowling Features

In [19]:
def _squad_mean(squad, column):
    """Plain (unweighted) mean of `column` over the players_df rows whose 'player' is in `squad`."""
    return players_df.loc[players_df['player'].isin(squad), column].mean()


# One unweighted squad mean per bowling statistic and per team.
# Columns are created in the order: average, economy rate, strike rate (Team1 then Team2).
df['bowling_average_PlayersTeam1'] = df['Team1Players'].apply(_squad_mean, args=('bowling_average',))
df['bowling_average_PlayersTeam2'] = df['Team2Players'].apply(_squad_mean, args=('bowling_average',))

df['bowling_economy_rate_PlayersTeam1'] = df['Team1Players'].apply(_squad_mean, args=('bowling_economy_rate',))
df['bowling_economy_rate_PlayersTeam2'] = df['Team2Players'].apply(_squad_mean, args=('bowling_economy_rate',))

df['bowling_strike_rate_PlayersTeam1'] = df['Team1Players'].apply(_squad_mean, args=('bowling_strike_rate',))
df['bowling_strike_rate_PlayersTeam2'] = df['Team2Players'].apply(_squad_mean, args=('bowling_strike_rate',))



### Win Ratio Feature

In [20]:
def _squad_win_ratio(squad):
    """Unweighted mean of 'win_ratio' over the players_df rows whose 'player' is in `squad`."""
    squad_rows = players_df['player'].isin(squad)
    return players_df.loc[squad_rows, 'win_ratio'].mean()


df['win_ratio_PlayersTeam1'] = df['Team1Players'].apply(_squad_win_ratio)
df['win_ratio_PlayersTeam2'] = df['Team2Players'].apply(_squad_win_ratio)



### Differences Team1 and Team2

In [21]:
# Each difference feature is (Team1 value) - (Team2 value).
# The dict order is the order in which the new columns are appended.
_diff_sources = {
    'Avg_Weighted_Score_diff': ('Avg_Weighted_Score_Team1', 'Avg_Weighted_Score_Team2'),
    'batting_average_weighted_diff': ('batting_average_PlayersTeam1_weighted', 'batting_average_PlayersTeam2_weighted'),
    'batting_strike_rate_weighted_diff': ('batting_strike_rate_PlayersTeam1_weighted', 'batting_strike_rate_PlayersTeam2_weighted'),
    'bowling_average_diff': ('bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2'),
    'bowling_economy_rate_diff': ('bowling_economy_rate_PlayersTeam1', 'bowling_economy_rate_PlayersTeam2'),
    'bowling_strike_rate_diff': ('bowling_strike_rate_PlayersTeam1', 'bowling_strike_rate_PlayersTeam2'),
    'win_ratio_diff': ('win_ratio_PlayersTeam1', 'win_ratio_PlayersTeam2'),
}

for _diff_col, (_team1_col, _team2_col) in _diff_sources.items():
    df[_diff_col] = df[_team1_col] - df[_team2_col]

In [22]:
df.head()


Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,...,bowling_strike_rate_PlayersTeam2,win_ratio_PlayersTeam1,win_ratio_PlayersTeam2,Avg_Weighted_Score_diff,batting_average_weighted_diff,batting_strike_rate_weighted_diff,bowling_average_diff,bowling_economy_rate_diff,bowling_strike_rate_diff,win_ratio_diff
0,1312200,ahmedabad,2022-05-29,2022,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,...,12.818182,0.531818,0.613636,2.561112,-4.967742,-0.751674,1.17,2.803636,-0.272727,-0.081818
1,1312199,ahmedabad,2022-05-27,2022,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,...,12.545455,0.514545,0.531818,1.476839,1.747039,-3.646447,13.921818,-0.486364,10.986364,-0.017273
2,1312198,kolkata,2022-05-25,2022,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,...,15.052727,0.514545,0.530909,0.779066,-1.034351,-8.422789,12.004545,2.373636,8.479091,-0.016364
3,1312197,kolkata,2022-05-24,2022,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,...,12.603636,0.531818,0.624545,2.601865,-0.111153,2.842541,1.291818,2.742727,-0.058182,-0.092727
4,1304116,mumbai,2022-05-22,2022,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,...,11.501818,0.41,0.532727,-2.409256,-1.564914,-19.543552,12.986364,2.799091,8.746364,-0.122727


## Preprocessing

In [23]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [24]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players', 'Avg_Weighted_Score_Team1',
       'Avg_Weighted_Score_Team2', 'batting_average_PlayersTeam1_weighted',
       'batting_average_PlayersTeam2_weighted',
       'batting_strike_rate_PlayersTeam1_weighted',
       'batting_strike_rate_PlayersTeam2_weighted',
       'bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2',
       'bowling_economy_rate_PlayersTeam1',
       'bowling_economy_rate_PlayersTeam2', 'bowling_strike_rate_PlayersTeam1',
       'bowling_strike_rate_PlayersTeam2', 'win_ratio_PlayersTeam1',
       'win_ratio_PlayersTeam2', 'Avg_Weighted_Score_diff',
       'batting_

### Rename Columns

In [25]:
# Use the Team1/Team2 naming everywhere instead of the TeamA/TeamB prefixes
_team_letter_to_number = {
    'TeamA_batting_average': 'Team1_batting_average',
    'TeamB_batting_average': 'Team2_batting_average',
    'TeamA_innings_total': 'Team1_innings_total',
    'TeamB_innings_total': 'Team2_innings_total',
}
df = df.rename(columns=_team_letter_to_number)

In [26]:
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'Team1_batting_average', 'Team2_batting_average', 'Team1_innings_total',
       'Team2_innings_total', 'Team1_points_against_avg',
       'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average',
       'Team1Players', 'Team2Players', 'Avg_Weighted_Score_Team1',
       'Avg_Weighted_Score_Team2', 'batting_average_PlayersTeam1_weighted',
       'batting_average_PlayersTeam2_weighted',
       'batting_strike_rate_PlayersTeam1_weighted',
       'batting_strike_rate_PlayersTeam2_weighted',
       'bowling_average_PlayersTeam1', 'bowling_average_PlayersTeam2',
       'bowling_economy_rate_PlayersTeam1',
       'bowling_economy_rate_PlayersTeam2', 'bowling_strike_rate_PlayersTeam1',
       'bowling_strike_rate_PlayersTeam2', 'win_ratio_PlayersTeam1',
       'win_ratio_PlayersTeam2', 'Avg_Weighted_Score_diff',
       'batting_

### Drop Columns

In [27]:
# Columns removed before modelling: identifiers, per-match totals, the raw
# squad lists, and the per-team inputs of the *_diff features.
columns_to_delete = [
    'ID',
    'Date',
    'innings_total',
    'Team1_innings_total',
    'Team2_innings_total',
    'Team1Players',
    'Team2Players',
    'Team1_batting_average',
    'Team2_batting_average',
    'Team1_points_against_avg',
    'Team2_points_against_avg',
    'Team1_MVP_average',
    'Team2_MVP_average',
    'Avg_Weighted_Score_Team1',
    'Avg_Weighted_Score_Team2',
    'batting_average_PlayersTeam1_weighted',
    'batting_average_PlayersTeam2_weighted',
    'batting_strike_rate_PlayersTeam1_weighted',
    'batting_strike_rate_PlayersTeam2_weighted',
    'bowling_average_PlayersTeam1',
    'bowling_average_PlayersTeam2',
    'bowling_economy_rate_PlayersTeam1',
    'bowling_economy_rate_PlayersTeam2',
    'bowling_strike_rate_PlayersTeam1',
    'bowling_strike_rate_PlayersTeam2',
    'win_ratio_PlayersTeam1',
    'win_ratio_PlayersTeam2',
    'bowling_strike_rate_diff',
    'MatchNumber',
]

In [28]:
# Keep only the modelling columns (columns= already implies the column axis)
df = df.drop(columns=columns_to_delete)

In [29]:
df.columns

Index(['City', 'Season', 'Team1', 'Team2', 'Venue', 'TossWinner',
       'TossDecision', 'WinningTeam', 'Avg_Weighted_Score_diff',
       'batting_average_weighted_diff', 'batting_strike_rate_weighted_diff',
       'bowling_average_diff', 'bowling_economy_rate_diff', 'win_ratio_diff'],
      dtype='object')

In [30]:
df.WinningTeam

0                   gujarat titans
1                 rajasthan royals
2      royal challengers bangalore
3                   gujarat titans
4                     punjab kings
                  ...             
945          kolkata knight riders
946    royal challengers bangalore
947                 delhi capitals
948            chennai super kings
949          kolkata knight riders
Name: WinningTeam, Length: 950, dtype: object

### Encoding

#### Toss Decision

In [31]:
# model
ohe = OneHotEncoder(sparse_output=False)

In [32]:
ohe.fit(df[['TossDecision']])

In [33]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['TossDecision']])

In [34]:
df.drop(columns = ["TossDecision"], inplace = True)

#### City

In [35]:
# model
ohe = OneHotEncoder(sparse_output=False)

In [36]:
ohe.fit(df[['City']])

In [37]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['City']])

In [38]:
df.drop(columns = ["City"], inplace = True)

#### Winning Team

In [39]:
# WinningTeam = 1 --> Team1 Won

# function
def map_winning_team(row):
    """Encode the match outcome relative to Team1.

    Returns 1 when row['WinningTeam'] equals row['Team1'], 0 when it equals
    row['Team2'], and -1 when it matches neither.
    """
    winner = row['WinningTeam']
    if winner == row['Team1']:
        return 1
    if winner == row['Team2']:
        return 0
    return -1

# Replace the winner's name with its encoded outcome, row by row
df['WinningTeam'] = df.apply(map_winning_team, axis='columns')

# Remove matches whose winner is neither Team1 nor Team2 (encoded as -1)
_no_winner_rows = df.index[df['WinningTeam'] == -1]
df = df.drop(index=_no_winner_rows)

#### Venue

In [40]:
ohe = OneHotEncoder(sparse_output=False)

In [41]:
ohe.fit(df[['Venue']])

In [42]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['Venue']])

In [43]:
df.drop(columns = ["Venue"], inplace = True)

#### Toss Winner

In [44]:
# Toss Winner --> 1 = Team1 won the toss ; 0 = Team2 won the toss ; -1 = neither

def map_toss_winner(row):
    """Encode the toss outcome relative to Team1.

    Returns 1 when row['TossWinner'] equals row['Team1'], 0 when it equals
    row['Team2'], and -1 when it matches neither.
    """
    toss_winner = row['TossWinner']
    if toss_winner == row['Team1']:
        return 1
    if toss_winner == row['Team2']:
        return 0
    return -1

# Replace the toss winner's name with its encoded value, row by row
df['TossWinner'] = df.apply(map_toss_winner, axis='columns')

#### Team1 and Team2

In [45]:
#hot encoder -- team 1
ohe = OneHotEncoder(sparse_output=False)

In [46]:
ohe.fit(df[['Team1']])

In [47]:
df[ohe.get_feature_names_out()] = ohe.transform(df[['Team1']])

In [48]:
df.drop(columns = ["Team1"], inplace = True)

In [49]:
# team2
ohe = OneHotEncoder(sparse_output=False)

In [50]:
ohe.fit(df[['Team2']])

In [51]:
# Build all Team2 dummy columns as one frame and attach them in a single
# concat. Assigning them through df[...] inserts the columns one at a time,
# which is what triggers pandas' "DataFrame is highly fragmented" warning.
# index=df.index is required because rows were dropped earlier without
# resetting the index, and the encoder output is purely positional.
_team2_dummies = pd.DataFrame(
    ohe.transform(df[['Team2']]),
    columns=ohe.get_feature_names_out(),
    index=df.index,
)
# Dropping any same-named columns first keeps the cell safe to re-run.
df = pd.concat(
    [df.drop(columns=_team2_dummies.columns, errors='ignore'), _team2_dummies],
    axis=1,
)

  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])
  df[ohe.get_feature_names_out()] = ohe.transform(df[['Team2']])


In [52]:
df.drop(columns = ["Team2"], inplace = True)

#### Season

In [53]:
label_encoder = LabelEncoder()

In [54]:
df['Season'] = label_encoder.fit_transform(df['Season'])

### Scaler

In [55]:
df.columns

Index(['Season', 'TossWinner', 'WinningTeam', 'Avg_Weighted_Score_diff',
       'batting_average_weighted_diff', 'batting_strike_rate_weighted_diff',
       'bowling_average_diff', 'bowling_economy_rate_diff', 'win_ratio_diff',
       'TossDecision_bat',
       ...
       'Team2_kochi tuskers kerala', 'Team2_kolkata knight riders',
       'Team2_lucknow super giants', 'Team2_mumbai indians',
       'Team2_pune warriors', 'Team2_punjab kings', 'Team2_rajasthan royals',
       'Team2_rising pune supergiant', 'Team2_royal challengers bangalore',
       'Team2_sunrisers hyderabad'],
      dtype='object', length=110)

In [56]:
# Continuous difference features that are passed through the scaler
columns_scaler = [
    'Avg_Weighted_Score_diff',
    'batting_average_weighted_diff',
    'batting_strike_rate_weighted_diff',
    'bowling_average_diff',
    'bowling_economy_rate_diff',
    'win_ratio_diff',
]

In [57]:
# Robust Scaler
scaler = RobustScaler()

In [58]:
# Fit the scaler on the continuous difference features.
# NOTE(review): this fit sees every row of df, while the train/test split is
# only made further down in the notebook, so the scaling statistics include
# the test rows. Consider fitting on the training rows only.
scaler.fit(df[columns_scaler])

In [59]:
df[columns_scaler] = scaler.transform(df[columns_scaler]) 

## XG Boost Model

In [60]:
import xgboost as xgb

In [61]:
# Target: 1 when Team1 won, 0 when Team2 won; every other column is a feature
y = df['WinningTeam']
X = df.drop(columns=['WinningTeam'])

In [62]:
# split data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
# Model: XGBoost classifier with the linear booster.
# 'max_depth' and 'gamma' are intentionally not set: they are tree-booster
# options, and XGBoost reports them as "not used" when booster='gblinear',
# so leaving them out does not change the fitted model.
grid = {
    'base_score': 0.5,
    'booster': 'gblinear',
    'n_estimators': 150,
    'learning_rate': 0.1,
    'reg_lambda': 0.3,
}

model = xgb.XGBClassifier(**grid)

In [64]:
# fit the model
model.fit(X_train, y_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Parameters: { "gamma", "max_depth" } are not used.



In [68]:
df.columns

Index(['Season', 'TossWinner', 'WinningTeam', 'Avg_Weighted_Score_diff',
       'batting_average_weighted_diff', 'batting_strike_rate_weighted_diff',
       'bowling_average_diff', 'bowling_economy_rate_diff', 'win_ratio_diff',
       'TossDecision_bat',
       ...
       'Team2_kochi tuskers kerala', 'Team2_kolkata knight riders',
       'Team2_lucknow super giants', 'Team2_mumbai indians',
       'Team2_pune warriors', 'Team2_punjab kings', 'Team2_rajasthan royals',
       'Team2_rising pune supergiant', 'Team2_royal challengers bangalore',
       'Team2_sunrisers hyderabad'],
      dtype='object', length=110)

In [71]:
df.shape




(946, 110)

In [65]:
y_pred = model.predict(X_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [66]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Best model Accuracy: {accuracy}")

Best model Accuracy: 0.6736842105263158


In [67]:
import glob
import os
import time
import pickle
import xgboost as xgb
import pandas as pd

from typing import Any
from google.cloud import storage

from ipl_model.params import *

# All local artefacts live under ~/code/ipl_prediction_model
_PROJECT_ROOT = os.path.join(os.path.expanduser('~'), "code", "ipl_prediction_model")

LOCAL_DATA_PATH = os.path.join(_PROJECT_ROOT, "raw_data")
LOCAL_MODELS_PATH = os.path.join(_PROJECT_ROOT, "model")
LOCAL_PREPROCESSORS_PATH = os.path.join(_PROJECT_ROOT, "preprocessor")

def save_model(model) -> None:
    """Write `model` to a timestamped local file and upload that file to GCS.

    Parameters
    ----------
    model
        A fitted estimator exposing ``save_model(path)``, such as an
        ``xgboost.XGBClassifier``.

    The local file is ``<LOCAL_MODELS_PATH>/<YYYYmmdd-HHMMSS>.xgbmodel`` and
    the blob is stored as ``models/<YYYYmmdd-HHMMSS>.xgbmodel``.
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_filename = f"{timestamp}.xgbmodel"
    model_path = os.path.join(LOCAL_MODELS_PATH, model_filename)

    # Create the directory if it doesn't exist
    os.makedirs(LOCAL_MODELS_PATH, exist_ok=True)

    # The file has to exist on disk before it can be uploaded:
    # upload_from_filename() reads from model_path.
    model.save_model(model_path)

    # Upload the saved file to GCS
    client = storage.Client()
    # NOTE(review): the bucket is referenced through a bare identifier, so it
    # must be defined by the star-import from ipl_model.params; if the bucket
    # name itself was intended, this should be a string or a named constant.
    bucket = client.bucket(ipl_prediction_model_patrickevans29)
    blob = bucket.blob(f"models/{model_filename}")
    blob.upload_from_filename(model_path)

    print("✅ Model saved to GCS")

    return None



ModuleNotFoundError: No module named 'ipl_model'