This notebook explores the data encoding needed to build our baseline model.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
# Read the cleaned match results into a dataframe and preview the first rows
df = pd.read_csv(filepath_or_buffer='../raw_data/match_result_averages.csv')
df.head(n=5)

Unnamed: 0,ID,Team_x,Team_batting_average,WinningTeam,Team_y,Team_batting_average_B
0,1312200,Rajasthan Royals,155.397906,Gujarat Titans,Gujarat Titans,166.4375
1,1312199,Royal Challengers Bangalore,155.707965,Rajasthan Royals,Rajasthan Royals,155.397906
2,1312198,Royal Challengers Bangalore,155.707965,Royal Challengers Bangalore,Lucknow Super Giants,169.866667
3,1312197,Rajasthan Royals,155.397906,Gujarat Titans,Gujarat Titans,166.4375
4,1304116,Sunrisers Hyderabad,155.546053,Punjab Kings,Punjab Kings,160.464286


In [3]:
# Give both batting-average columns matching team_x / team_y style names
df.rename(
    columns={
        'Team_batting_average': 'team_x_batting_average',
        'Team_batting_average_B': 'team_y_batting_average',
    },
    inplace=True,
)

In [4]:
# Binary target column: 1 when Team_x is the match winner, 0 otherwise
df['Team_x_Win'] = df['Team_x'].eq(df['WinningTeam']).astype(int)
df.head(n=5)

Unnamed: 0,ID,Team_x,team_x_batting_average,WinningTeam,Team_y,team_y_batting_average,Team_x_Win
0,1312200,Rajasthan Royals,155.397906,Gujarat Titans,Gujarat Titans,166.4375,0
1,1312199,Royal Challengers Bangalore,155.707965,Rajasthan Royals,Rajasthan Royals,155.397906,0
2,1312198,Royal Challengers Bangalore,155.707965,Royal Challengers Bangalore,Lucknow Super Giants,169.866667,1
3,1312197,Rajasthan Royals,155.397906,Gujarat Titans,Gujarat Titans,166.4375,0
4,1304116,Sunrisers Hyderabad,155.546053,Punjab Kings,Punjab Kings,160.464286,0


In [5]:
# Build a OneHotEncoder that returns a dense array rather than a sparse matrix
encoder = OneHotEncoder(
    sparse_output=False,
)

In [6]:
# Learn the team categories and one-hot encode both team columns in one pass
encoded_columns = encoder.fit_transform(df.loc[:, ['Team_x', 'Team_y']])

In [7]:
# Wrap the encoded array in a DataFrame, labelling columns with the encoder's generated names
encoded_df = pd.DataFrame(
    data=encoded_columns,
    columns=encoder.get_feature_names_out(['Team_x', 'Team_y']),
)

In [8]:
# Append the one-hot columns to the right of the original columns
df_encoded = pd.concat(objs=[df, encoded_df], axis='columns')

In [9]:
# Remove the raw team-name columns, then check the resulting dimensions
df_encoded.drop(columns=['Team_x', 'Team_y', 'WinningTeam'], inplace=True)
df_encoded.shape

(948, 40)

In [10]:
# Peek at the first encoded row
df_encoded.iloc[:1]

Unnamed: 0,ID,team_x_batting_average,team_y_batting_average,Team_x_Win,Team_x_Chennai Super Kings,Team_x_Deccan Chargers,Team_x_Delhi Capitals,Team_x_Delhi Daredevils,Team_x_Gujarat Lions,Team_x_Gujarat Titans,...,Team_y_Kolkata Knight Riders,Team_y_Lucknow Super Giants,Team_y_Mumbai Indians,Team_y_Pune Warriors,Team_y_Punjab Kings,Team_y_Rajasthan Royals,Team_y_Rising Pune Supergiant,Team_y_Rising Pune Supergiants,Team_y_Royal Challengers Bangalore,Team_y_Sunrisers Hyderabad
0,1312200,155.397906,166.4375,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Standardise the dataframe using various scalers
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

# Save one copy without any scaling.
# `index=False` must be passed as a keyword so the row index is left out of the
# CSV; written as `index-False` it is a subtraction on an undefined name and
# raises NameError.
df_encoded.to_csv("../raw_data/no_scaling_encoded_baseline_data.csv", index=False)

In [13]:
# Build one scaled copy of the encoded data per scaler, keyed by scaler name
scalers = [
    ('RobustScaler', RobustScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('StandardScaler', StandardScaler()),
]
scaled_datasets = {}

for scaler_name, scaler in scalers:
    # Only the two batting-average columns are scaled; all other columns are copied unchanged
    scaled_averages = scaler.fit_transform(
        df_encoded[['team_x_batting_average', 'team_y_batting_average']]
    )
    scaled_df = df_encoded.copy()
    scaled_df[['team_x_batting_average', 'team_y_batting_average']] = scaled_averages
    scaled_datasets[scaler_name] = scaled_df

In [15]:
# Pull each scaled dataset out of the dict; "simple" is the StandardScaler version
robust_baseline_data, simple_baseline_data, minmax_baseline_data = (
    scaled_datasets[name] for name in ('RobustScaler', 'StandardScaler', 'MinMaxScaler')
)

# Write each dataset to its own CSV, leaving out the row index, for use by different models
robust_baseline_data.to_csv(path_or_buf="../raw_data/robust_baseline_data.csv", index=False)
simple_baseline_data.to_csv(path_or_buf="../raw_data/simple_baseline_data.csv", index=False)
minmax_baseline_data.to_csv(path_or_buf="../raw_data/minmax_baseline_data.csv", index=False)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Define features (X) and the encoded target variable (y).
# Besides ID and the batting averages, two more columns must be kept out of X:
#   - 'Team_x_Win' is the target itself; leaving it in X leaks the answer to
#     the model and makes the accuracy meaningless.
#   - 'Unnamed: 0' looks like a row index saved in the source CSV, not a
#     match attribute.
X = df_encoded.drop(
    columns=[
        'Unnamed: 0',
        'ID',
        'team_x_batting_average',
        'team_y_batting_average',
        'Team_x_Win',
    ]
)
y = df_encoded['Team_x_Win']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Fit a logistic regression on the training split (iteration cap raised to 2000)
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)
model

In [None]:
# Predict the outcome for every row of the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Score the predictions against the held-out labels
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

## Train a second model on the encoded teams only (winning team as the target), ignoring the batting averages

In [None]:
# Create a second OneHotEncoder instance.
# A LabelEncoder cannot be used here: it only accepts a single 1-D column and
# has no get_feature_names_out(), and the following cells both pass it three
# columns at once and expect one-hot column names such as
# 'WinningTeam_Gujarat Titans'.
encoder2 = OneHotEncoder(sparse_output=False)

In [None]:
# Fit the second encoder on both team columns plus the winner and encode them together
encoded_columns2 = encoder2.fit_transform(df.loc[:, ['Team_x', 'Team_y', 'WinningTeam']])

In [None]:
# Wrap the encoded array in a DataFrame, labelling columns with the encoder's generated names
encoded_df2 = pd.DataFrame(
    data=encoded_columns2,
    columns=encoder2.get_feature_names_out(['Team_x', 'Team_y', 'WinningTeam']),
)

In [None]:
# Append the encoded columns to the right of the original columns
df_encoded2 = pd.concat(objs=[df, encoded_df2], axis='columns')

In [None]:
# Remove the raw Team_x / Team_y name columns ('WinningTeam' is left in place)
df_encoded2.drop(columns=['Team_x', 'Team_y'], inplace=True)

In [None]:
# List every column name so the encoded feature and target columns can be identified
df_encoded2.columns

In [None]:
# Define features (X2) and the one-hot encoded target (y2).
# Columns are selected by prefix instead of being typed out by hand: the
# hand-written target list had no 'WinningTeam_Chennai Super Kings' entry even
# though Chennai Super Kings appears among the teams, so any Chennai win would
# have been an all-zero target row.
# 'Team_x_Win' shares the 'Team_x_' prefix but is derived from the winner, so
# it is excluded explicitly to keep the target out of the features.
feature_columns = [
    column for column in df_encoded2.columns
    if column.startswith(('Team_x_', 'Team_y_')) and column != 'Team_x_Win'
]
X2 = df_encoded2[feature_columns]

# The trailing underscore in the prefix skips the raw 'WinningTeam' text
# column, which is still present in df_encoded2.
target_columns = [
    column for column in df_encoded2.columns if column.startswith('WinningTeam_')
]
y2 = df_encoded2[target_columns]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, random_state=42, test_size=0.2
)

In [None]:
# Initialize and train the logistic regression model.
# LogisticRegression.fit needs a 1-D vector of class labels and rejects a 2-D
# one-hot frame, so each one-hot row is collapsed to the name of its hot column.
# The collapse is only valid when every row has exactly one winner column set,
# so that is checked first instead of silently mislabelling rows.
rows_are_one_hot = y_train2.sum(axis=1).eq(1)
if not rows_are_one_hot.all():
    raise ValueError(
        f'{(~rows_are_one_hot).sum()} training rows do not have exactly one winner '
        'column set; check that y2 includes every WinningTeam_* column'
    )
y_train2_labels = y_train2.idxmax(axis=1)

# NOTE: this rebinds `model`, replacing the first baseline model fitted earlier.
model = LogisticRegression(max_iter=2000)
model.fit(X_train2, y_train2_labels)