# Assignment 1: Predicting Football Match Outcomes
This notebook includes all necessary steps from data preprocessing to model evaluation.

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, make_scorer

# Load dataset
df = pd.read_csv("international_matches.csv")
df.head()

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,1993-08-08,Bolivia,Uruguay,South America,South America,59,22,0,0,3,...,No,Win,,,,,,,,
1,1993-08-08,Brazil,Mexico,South America,North America,8,14,0,0,1,...,No,Draw,,,,,,,,
2,1993-08-08,Ecuador,Venezuela,South America,South America,35,94,0,0,5,...,No,Win,,,,,,,,
3,1993-08-08,Guinea,Sierra Leone,Africa,Africa,65,86,0,0,1,...,No,Win,,,,,,,,
4,1993-08-08,Paraguay,Argentina,South America,South America,67,5,0,0,1,...,No,Lose,,,,,,,,


In [5]:
import pandas as pd

df = pd.read_csv("international_matches.csv")
df['date'] = pd.to_datetime(df['date'])  # Make sure the date is in datetime format

In [7]:
df['tournament'].unique()

array(['FIFA World Cup qualification', 'Friendly',
       'African Cup of Nations qualification', 'Amílcar Cabral Cup',
       'CFU Caribbean Cup qualification',
       'United Arab Emirates Friendship Tournament',
       'Malta International Tournament', 'Lunar New Year Cup',
       'African Cup of Nations', 'CFU Caribbean Cup',
       'UEFA Euro qualification', 'Kirin Cup', 'FIFA World Cup',
       'Oceania Nations Cup qualification', 'Baltic Cup', 'Gulf Cup',
       'Simba Tournament', 'CECAFA Cup', 'Confederations Cup',
       'Dynasty Cup', "King's Cup", 'Nehru Cup', 'SAFF Cup',
       'Copa Paz del Chaco', 'Korea Cup', 'USA Cup', 'Copa América',
       'Merdeka Tournament', 'South Pacific Games', 'UNCAF Cup',
       'Oceania Nations Cup', 'Windward Islands Tournament', 'Gold Cup',
       'AFC Asian Cup qualification', 'UEFA Euro', 'AFF Championship',
       'AFC Asian Cup', 'King Hassan II Tournament',
       'Cyprus International Tournament', 'Dunhill Cup',
       'COSAFA Cup qu

In [4]:
wc_finals = df[df['tournament'] == 'FIFA World Cup']
print(wc_finals.shape)  # Check number of matches
print(wc_finals['date'].dt.year.value_counts().sort_index())  # See which years

(432, 25)
date
1994    52
1998    60
2002    64
2006    64
2010    64
2014    64
2018    64
Name: count, dtype: int64


In [8]:
wc_qualifiers = df[df['tournament'] == 'FIFA World Cup qualification']
print(wc_qualifiers.shape)  # Check number of matches
print(wc_qualifiers['date'].dt.year.value_counts().sort_index())  # See which years

(5528, 25)
date
1993     94
1996    209
1997    396
2000    274
2001    480
2003     71
2004    400
2005    347
2007     70
2008    414
2009    321
2011    203
2012    241
2013    346
2015    233
2016    256
2017    310
2019    136
2020     21
2021    607
2022     99
Name: count, dtype: int64


## Step 2: Preprocessing

In [12]:
import pandas as pd

# Step 1: Load the dataset
df = pd.read_csv("international_matches.csv")
df['date'] = pd.to_datetime(df['date'])

In [14]:
# Step 2: Filter FIFA World Cup qualification matches from 2019 to 2022
wcq_recent = df[
    (df['tournament'] == 'FIFA World Cup qualification') &
    (df['date'].dt.year >= 2019) & (df['date'].dt.year <= 2022)
].copy()

In [16]:
# Step 3: Drop columns with >30% missing values
threshold = len(wcq_recent) * 0.7
wcq_cleaned = wcq_recent.dropna(thresh=threshold, axis=1).copy()

In [18]:
# Step 4: Extract year
wcq_cleaned['year'] = wcq_cleaned['date'].dt.year

In [20]:
# Step 5: Drop all categorical columns (text and boolean)
categorical_columns = wcq_cleaned.select_dtypes(include=['object', 'bool']).columns
wcq_numeric = wcq_cleaned.drop(columns=categorical_columns)

In [22]:
# Step 6: Define the label (target) BEFORE dropping it from X
y = wcq_cleaned['home_team_result']  # still from original cleaned data

In [24]:
# Step 7: Drop 'home_team_result' from features IF it's still there
if 'home_team_result' in wcq_numeric.columns:
    X = wcq_numeric.drop(columns='home_team_result')
else:
    X = wcq_numeric

In [26]:
# Step 8: Show shape
print("Feature shape (X):", X.shape)
print("Target shape (y):", y.shape)

Feature shape (X): (863, 8)
Target shape (y): (863,)


## Step 3: Model Training and Evaluation

In [28]:
from sklearn.metrics import accuracy_score

In [30]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, make_scorer

In [34]:
# Drop datetime column from numeric data if it still exists
if 'date' in wcq_numeric.columns:
    wcq_numeric = wcq_numeric.drop(columns='date')

# Ensure correct features and labels
if 'home_team_result' in wcq_numeric.columns:
    X = wcq_numeric.drop(columns='home_team_result')
else:
    X = wcq_numeric

# Re-define target
y = wcq_cleaned['home_team_result']

# Re-scale numeric features for kNN
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified 5-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_macro = make_scorer(f1_score, average='macro')
acc_score = make_scorer(accuracy_score)

# Models
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Evaluate kNN
knn_f1 = cross_val_score(knn, X_scaled, y, cv=skf, scoring=f1_macro)
knn_acc = cross_val_score(knn, X_scaled, y, cv=skf, scoring=acc_score)

# Evaluate Random Forest
rf_f1 = cross_val_score(rf, X, y, cv=skf, scoring=f1_macro)
rf_acc = cross_val_score(rf, X, y, cv=skf, scoring=acc_score)

# Final results
{
    "kNN F1 Macro": knn_f1.mean(),
    "kNN Accuracy": knn_acc.mean(),
    "Random Forest F1 Macro": rf_f1.mean(),
    "Random Forest Accuracy": rf_acc.mean()
}

{'kNN F1 Macro': 0.8316316501129934,
 'kNN Accuracy': 0.8574405161984139,
 'Random Forest F1 Macro': 0.9831827130487207,
 'Random Forest Accuracy': 0.9860935609624949}