In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Loading data


In [3]:
df = pd.read_csv("/home/thibault/code/patrickevans29/ipl_prediction_model/raw_data/final_to_model_df.csv")

df = df.copy()

In [4]:
df.head()


Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,TeamA_batting_average,TeamB_batting_average,TeamA_innings_total,TeamB_innings_total
0,1312200,ahmedabad,2022-05-29,2022,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,155.397906,166.4375,130.0,133.0
1,1312199,ahmedabad,2022-05-27,2022,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,155.707965,155.397906,157.0,161.0
2,1312198,kolkata,2022-05-25,2022,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,155.707965,169.866667,207.0,193.0
3,1312197,kolkata,2022-05-24,2022,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,155.397906,166.4375,188.0,191.0
4,1304116,mumbai,2022-05-22,2022,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,155.546053,158.518349,157.0,160.0


# removing useless columns

In [5]:
df = df.drop(columns=["Date", "innings_total", "TeamA_innings_total", "TeamB_innings_total"])

In [6]:
df.head()

Unnamed: 0,ID,City,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,TeamA_batting_average,TeamB_batting_average
0,1312200,ahmedabad,2022,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,155.397906,166.4375
1,1312199,ahmedabad,2022,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,155.707965,155.397906
2,1312198,kolkata,2022,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,155.707965,169.866667
3,1312197,kolkata,2022,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,155.397906,166.4375
4,1304116,mumbai,2022,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,155.546053,158.518349


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     950 non-null    int64  
 1   City                   950 non-null    object 
 2   Season                 950 non-null    int64  
 3   MatchNumber            950 non-null    object 
 4   Team1                  950 non-null    object 
 5   Team2                  950 non-null    object 
 6   Venue                  950 non-null    object 
 7   TossWinner             950 non-null    object 
 8   TossDecision           950 non-null    object 
 9   WinningTeam            950 non-null    object 
 10  TeamA_batting_average  950 non-null    float64
 11  TeamB_batting_average  948 non-null    float64
dtypes: float64(2), int64(2), object(8)
memory usage: 89.2+ KB


# create the model

### define the features and the output

In [8]:
X = df.drop(columns=['WinningTeam'])
y = df['WinningTeam']

### encoding

In [9]:
## defining the list of categorical and numerical features 
categorical_columns = ['City', 'MatchNumber', 'Season', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision']
numerical_columns = ['TeamA_batting_average','TeamB_batting_average']
## apply the encoder, OneHotEncoder for categorical and the RobustScaler for numerical features
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

numerical_transformer = Pipeline([
    ('scaler', RobustScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns),
        ('num', numerical_transformer, numerical_columns)
    ])
## create the pipeline 
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

In [10]:
## creation of X_encoded with the pipeline
X_encoded = pipeline.fit_transform(X)



In [11]:
## encoding the y for the Classifier
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

### dividing the dataset train/test

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2)

### Creating the model with a XGBClassifier

In [13]:
model = XGBClassifier()
model.fit(X_train, y_train)

### predicting y with the test set 

In [14]:
y_pred = model.predict(X_test)

In [15]:
y_pred

array([ 6, 12, 10,  1, 11,  4, 11, 14, 11,  2,  8,  2, 14,  6, 14,  3, 11,
       13, 11,  2, 12,  8, 12, 15,  2, 15,  6,  6,  6,  8,  0, 11, 12, 11,
       12,  0, 12,  3,  2,  0,  2, 15, 14, 14, 12,  0, 14, 15, 12, 14,  1,
        8, 11,  8,  2,  8, 11, 11,  2, 12,  8, 12,  6, 14, 13, 14,  6,  8,
        2,  8,  2,  2,  8, 11, 14, 11, 12, 11,  0, 12, 15,  8, 15,  6, 15,
       11,  2, 12,  2,  0, 12, 11,  0,  6,  0,  0,  0,  2,  8, 15,  2, 14,
        6,  6, 14, 14,  8,  1,  8, 12, 12, 15, 11,  2, 15, 14,  2, 15, 12,
       10,  0,  2, 14,  2,  2,  1,  6,  4, 12,  6, 14,  0,  8,  6,  7,  2,
        6,  8,  6, 14, 11,  2, 14, 11,  6,  8, 12, 15,  6,  8,  2, 12,  0,
       14,  6,  0,  0,  8,  8, 12,  0,  8,  8, 14,  6,  2,  2,  0,  6,  2,
       15, 11, 14, 11,  2, 12, 12,  2, 11, 14,  2, 14, 15, 14,  2,  0, 14,
        2, 14, 14])

In [16]:
## inverse the encoding to have the name of the winner
y_pred_names = label_encoder.inverse_transform(y_pred)

In [17]:
result_df = pd.DataFrame({'True Labels': y_test, 'Predicted Labels': y_pred})

# Ajoutez une colonne pour indiquer si les prédictions sont correctes
result_df['Correct Prediction'] = result_df['True Labels'] == result_df['Predicted Labels']

# Affichez la DataFrame pour vérifier les résultats
print(result_df)

     True Labels  Predicted Labels  Correct Prediction
0              5                 6               False
1             12                12                True
2              0                10               False
3              1                 1                True
4             11                11                True
..           ...               ...                 ...
185            0                 0                True
186           14                14                True
187           14                 2               False
188           12                14               False
189            8                14               False

[190 rows x 3 columns]


In [18]:
y_pred_series = pd.Series(y_pred_names)
y_pred_series.value_counts()

delhi capitals                 31
royal challengers bangalore    27
rajasthan royals               23
mumbai indians                 22
kolkata knight riders          21
punjab kings                   21
chennai super kings            18
sunrisers hyderabad            14
deccan chargers                 4
pune warriors                   2
gujarat titans                  2
gujarat lions                   2
rising pune supergiant          2
lucknow super giants            1
Name: count, dtype: int64

In [23]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_encoded, y_encoded, cv=5, scoring='accuracy')




In [24]:
scores.mean()

0.4726315789473684

## accuracy of the model 

In [25]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5473684210526316