# Assignment 1: Predicting Football Match Outcomes
This notebook includes all necessary steps from data preprocessing to model evaluation.

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, make_scorer

# Load dataset
df = pd.read_csv("international_matches.csv")
df.head()

## Step 2: Preprocessing

In [None]:
# Drop columns with more than 30% missing values
threshold = len(df) * 0.7
df_processed = df.dropna(thresh=threshold, axis=1)

# Extract year from date
df_processed['date'] = pd.to_datetime(df_processed['date'])
df_processed['year'] = df_processed['date'].dt.year

# Drop redundant columns
df_processed = df_processed.drop(columns=['city', 'date'])

# Categorical columns
categorical_columns = ['home_team', 'away_team', 'home_team_continent', 'away_team_continent',
                       'tournament', 'country', 'shoot_out', 'home_team_result']

# One-hot encoding
df_encoded = pd.get_dummies(df_processed, columns=[col for col in categorical_columns if col != 'home_team_result'])

# Split into features and target
X = df_encoded.drop(columns='home_team_result')
y = df_encoded['home_team_result']

# Scale features for kNN
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Step 3: Model Training and Evaluation

In [None]:
# Setup cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_macro = make_scorer(f1_score, average='macro')

# kNN
knn = KNeighborsClassifier(n_neighbors=5)
knn_scores = cross_val_score(knn, X_scaled, y, cv=skf, scoring=f1_macro)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = cross_val_score(rf, X, y, cv=skf, scoring=f1_macro)

print(f"Mean F1 Macro Score (kNN): {knn_scores.mean():.3f}")
print(f"Mean F1 Macro Score (Random Forest): {rf_scores.mean():.3f}")

### Summary:
Random Forest outperforms kNN on this dataset using macro-averaged F1 score.