<a href="https://colab.research.google.com/github/mizzony/AFL/blob/main/AFL_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the cleaned dataset
data = pd.read_csv('https://raw.githubusercontent.com/mizzony/AFL/refs/heads/main/afl_data.csv')
# Print the first few rows to verify the change
data.head()

#Explore the dataset
data.describe()
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2447 entries, 0 to 2446
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   GameId            2447 non-null   object 
 1   Year              2447 non-null   int64  
 2   Round             2447 non-null   object 
 3   Date              2447 non-null   object 
 4   MaxTemp           2440 non-null   float64
 5   MinTemp           2440 non-null   float64
 6   Rainfall          2425 non-null   float64
 7   Venue             2447 non-null   object 
 8   StartTime         2447 non-null   object 
 9   Attendance        2447 non-null   object 
 10  HomeTeam          2447 non-null   object 
 11  HomeTeamScoreQT   2447 non-null   float64
 12  HomeTeamScoreHT   2447 non-null   float64
 13  HomeTeamScore3QT  2447 non-null   float64
 14  HomeTeamScoreFT   2447 non-null   float64
 15  HomeTeamScore     2447 non-null   int64  
 16  AwayTeam          2447 non-null   object 


In [24]:

# Features and target
X = data.drop(columns=['Win'])  # Features
y = data['Win']  # Target (0 or 1)

# Convert 'object' type columns to categorical
# Select object columns
object_cols = X.select_dtypes(include=['object']).columns

# Convert to categorical
for col in object_cols:
    X[col] = X[col].astype('category')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to DMatrix format for XGBoost
# Set enable_categorical=True to handle categorical features
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Set up XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 0.1,
    'seed': 42
}

# Train the XGBoost model
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predict on the test set
y_pred_probs = bst.predict(dtest)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_probs]

In [25]:


# Create a new dataset with predictions and 'Win' column
# Reset index of X_test and y_test to ensure alignment with y_pred
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Create a copy of the test set and add predictions and the original 'Win' column
afl_data2 = X_test.copy()
afl_data2['Win'] = y_test
afl_data2['Predicted_Win'] = y_pred

# Print the new dataset with original and predicted values
print(afl_data2.head())

# Export the new dataset to a CSV file
afl_data2.to_csv('afl_data_with_predictions.csv', index=False)



      GameId  Year Round        Date  MaxTemp  MinTemp  Rainfall  \
0   2022EF01  2022    EF  2022-09-01     22.8     13.3       0.0   
1  2017R2306  2017   R23  2017-08-26     14.3      4.1       0.0   
2  2017R1509  2017   R15  2017-07-02     14.6      6.0      21.8   
3  2021R1706  2021   R17  2021-07-11     10.4     -1.0       0.2   
4  2016R0608  2016    R6  2016-05-01     16.8     14.0       9.8   

            Venue StartTime Attendance  ... HomeTeamScoreFT  HomeTeamScore  \
0           Gabba   7:20 PM     35,013  ...           16.10            106   
1   Adelaide Oval   7:10 PM     34,288  ...           20.15            135   
2         Subiaco   2:40 PM     30,541  ...           12.80             80   
3  Eureka Stadium  12:40 PM      2,952  ...            9.10             64   
4          M.C.G.   3:20 PM     43,827  ...           10.12             72   

     AwayTeam  AwayTeamScoreQT  AwayTeamScoreHT  AwayTeamScore3QT  \
0    Richmond              4.3              9.5      

In [26]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(report)

Accuracy: 0.98
Confusion Matrix:
[[213   6]
 [  2 269]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       219
           1       0.98      0.99      0.99       271

    accuracy                           0.98       490
   macro avg       0.98      0.98      0.98       490
weighted avg       0.98      0.98      0.98       490

