In [None]:
# Name : Dhihan Ahmed, Luke Bianchi, Jacob Gurevich
# Assignment Number & Name : Naive Bayes - NHL Win Classification
# I pledge my honor that I have abided by the Stevens Honor System.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load dataset
df = pd.read_csv("merged_with_event_features.csv")

# Drop exact duplicates of team-game combinations
df = df.drop_duplicates(subset=['game_id', 'team_id'])

# Drop identifier columns that shouldn't be used as features
df = df.drop(columns=['game_id', 'team_id'], errors='ignore')
# Convert 'HoA' to binary: 1 = Home, 0 = Away
df['is_home'] = df['HoA'].map({'home': 1, 'away': 0})

# Define feature list
features = ['shots', 'hits', 'pim', 'powerPlayGoals', 'faceOffWinPercentage',
            'giveaways', 'takeaways', 'blocked', 'avg_shot_x', 'avg_shot_y',
            'close_range', 'num_slap_shot', 'num_wrist_shot', 'won', 'num_backhand','num_slap_shot','num_tip-in', 'is_home']
# Filter and convert types
df = df[features].dropna()
df['won'] = df['won'].astype(int)
print("Final row count after filtering:", len(df))
print("Win class distribution:")
print(df['won'].value_counts())

df.head()

Final row count after filtering: 24924
Win class distribution:
won
0    12462
1    12462
Name: count, dtype: int64


Unnamed: 0,shots,hits,pim,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,blocked,avg_shot_x,avg_shot_y,close_range,num_slap_shot,num_wrist_shot,won,num_backhand,num_slap_shot.1,num_tip-in
0,27.0,30.0,6.0,2.0,50.9,12.0,9.0,11.0,17.962963,-5.296296,0.0,3.0,18.0,0,3.0,3.0,2.0
1,28.0,20.0,8.0,2.0,49.1,16.0,8.0,9.0,-10.392857,-2.857143,0.0,7.0,16.0,1,3.0,7.0,0.0
2,34.0,16.0,6.0,1.0,43.8,7.0,4.0,14.0,-9.647059,-4.088235,1.0,6.0,16.0,1,6.0,6.0,3.0
3,33.0,17.0,8.0,1.0,56.2,5.0,6.0,14.0,46.0,0.636364,0.0,3.0,20.0,0,3.0,3.0,2.0
4,29.0,17.0,9.0,1.0,45.7,13.0,5.0,20.0,30.0,-0.413793,2.0,3.0,16.0,1,3.0,3.0,0.0


In [None]:
# Define X and y
X = df.drop("won", axis=1)
y = df["won"]

# Stratified train-test split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [31]:
# Run KNN for different k values
for k in [3, 5, 10]:
    print(f"--- Results for k = {k} ---")
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

--- Results for k = 3 ---
Accuracy: 0.5616273471352913
Confusion Matrix:
[[3494 2737]
 [2726 3505]]
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56      6231
           1       0.56      0.56      0.56      6231

    accuracy                           0.56     12462
   macro avg       0.56      0.56      0.56     12462
weighted avg       0.56      0.56      0.56     12462

--- Results for k = 5 ---
Accuracy: 0.573503450489488
Confusion Matrix:
[[3638 2593]
 [2722 3509]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.58      0.58      6231
           1       0.58      0.56      0.57      6231

    accuracy                           0.57     12462
   macro avg       0.57      0.57      0.57     12462
weighted avg       0.57      0.57      0.57     12462

--- Results for k = 10 ---
Accuracy: 0.5816883325308939
Confusion Matrix:
[[4263 1968]
 [3245 2986]]
