In [286]:
import pandas as pd
nhl_df = pd.read_csv('C:/Users/Martin/Downloads/mvp_nhl.csv')

In [287]:
name_mapping = {
    'Buffalo Sabres': 'Sabres',
    'Colorado Avalanche': 'Avalanche',
    'Columbus Blue Jackets': 'Blue Jackets',
    'Detroit Red Wings': 'Red Wings',
    'Nashville Predators': 'Predators',
    'Ottawa Senators': 'Senators'
}

nhl_df['home_team'] = nhl_df['home_team'].replace(name_mapping)
nhl_df['away_team'] = nhl_df['away_team'].replace(name_mapping)

In [288]:
home_data = nhl_df[['home_team', 'on_ice_shots_differential', 'average_shot_distance', 'on_ice_shots_for', 'corsi_for',
                      'corsi_total']]
away_data = nhl_df[['away_team', 'on_ice_shots_differential', 'average_shot_distance', 'on_ice_shots_against',
                      'corsi_total']]

In [289]:
home_data.columns = ['team', 'on_ice_shots_differential', 'average_shot_distance', 'on_ice_shots_for', 'corsi_total', 'corsi_for']
away_data.columns = ['team', 'on_ice_shots_differential', 'average_shot_distance', 'on_ice_shots_against', 'corsi_total']

In [290]:
combined_data = pd.concat([home_data, away_data])

In [291]:
team_averages_correct = combined_data.groupby('team').mean().reset_index()

In [292]:
weight_by_season_end_standings = {
    'Rangers': 8, 'Bruins': 8, 'Stars': 8, 'Canucks': 8, 'Avalanche': 7, 'Oilers': 7, 'Panthers': 7,
    'Hurricanes': 7, 'Islanders': 6, 'Maple Leafs': 6, 'Jets': 6, 'Golden Knights': 6, 'Kings': 5,
    'Predators': 5, 'Flyers': 5, 'Lightning': 5, 'Blues': 4, 'Kraken': 4, 'Capitals': 4, 'Red Wings': 4,
    'Penguins': 3, 'Sabres': 3, 'Wild': 3, 'Flames': 3, 'Coyotes': 2, 'Ducks': 2, 'Devils': 2,
    'Senators': 2, 'Blackhawks': 1, 'Sharks': 1, 'Blue Jackets': 1, 'Canadiens': 1
}

weights_df = pd.DataFrame(list(weight_by_season_end_standings.items()), columns=['team', 'team_weight'])

In [293]:
home_attendance = nhl_df[['home_team', 'attendance']]
home_attendance.columns = ['team', 'attendance']
average_attendance = home_attendance.groupby('team').attendance.mean().reset_index()

In [294]:
full_team_data = pd.merge(team_averages_correct, weights_df, on='team', how='left')

In [295]:
team_data = pd.merge(full_team_data, average_attendance, on='team', how='left')

In [296]:
team_data.head()

Unnamed: 0,team,on_ice_shots_differential,average_shot_distance,on_ice_shots_for,corsi_total,corsi_for,on_ice_shots_against,team_weight,attendance
0,Avalanche,0.924528,36.524528,26.28,22.188679,8.36,28.071429,7,17698.24
1,Blackhawks,1.181818,36.047273,21.1,30.109091,-10.033333,19.84,1,16131.866667
2,Blue Jackets,-1.0,36.444643,24.90625,31.803571,-7.28125,24.375,1,15988.40625
3,Blues,3.019608,35.1,25.173913,26.45098,-2.695652,23.571429,4,18019.826087
4,Bruins,2.125,35.466071,27.6,26.785714,6.533333,26.846154,8,18563.1


In [297]:
team_data.shape

(32, 9)

In [298]:
pd.set_option('display.max_rows', None)

In [309]:
print(team_data)

              team  on_ice_shots_differential  average_shot_distance  \
0        Avalanche                   0.924528              36.524528   
1       Blackhawks                   1.181818              36.047273   
2     Blue Jackets                  -1.000000              36.444643   
3            Blues                   3.019608              35.100000   
4           Bruins                   2.125000              35.466071   
5        Canadiens                   1.859649              32.661404   
6          Canucks                   0.814815              34.903704   
7         Capitals                   2.033333              35.631667   
8          Coyotes                   2.961538              33.907692   
9           Devils                   1.740741              35.011111   
10           Ducks                   1.839286              34.014286   
11          Flames                   0.433962              36.184906   
12          Flyers                   1.571429              37.23

In [300]:
import pickle

In [301]:
with open('rf_classifier.pkl', 'rb') as file:
    rf_classifier = pickle.load(file)


In [302]:
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [303]:
import numpy as np

def predict_game_winner(team1, team2, team_data, model, scaler, home_status):
    if home_status == 'home':
        features = prepare_features(team1, team2, team_data, scaler)
        print(f"Game setting: {team1} at home vs {team2}")
    else:
        features = prepare_features(team2, team1, team_data, scaler)
        print(f"Game setting: {team2} at home vs {team1}")
    
    print(f"Features used for prediction: {features}")
    prediction = model.predict(features)
    predicted_winner = team1 if prediction == 1 else team2
    print(f"Predicted winner: {predicted_winner}")
    return predicted_winner

In [308]:
def prepare_features(team1, team2, team_data, scaler):
    feature_columns = ['pdo', 'on_ice_shots_differential', 'average_shot_distance', 'on_ice_shots_for', 'corsi_for', 'corsi_total', 'attendance', 'home_team_weight', 'away_team_weight']
    
    team1_features = team_data.loc[team_data['team'] == team1, feature_columns]
    team2_features = team_data.loc[team_data['team'] == team2, feature_columns]

    if not team1_features.empty and not team2_features.empty:
        combined_features = np.concatenate([team1_features.values, team2_features.values], axis=1)
        
        if combined_features.shape[1] != len(feature_columns) * 2:
            raise ValueError("Feature dimension mismatch.")
        
        scaled_features = scaler.transform(combined_features.reshape(1, -18))
        return scaled_features
    else:
        raise ValueError("Data for one or both teams is missing.")

Error in preparing features: "['pdo', 'home_team_weight', 'away_team_weight'] not in index"


In [307]:
prepare_features('Avalanche','Panthers', team_data, scaler)



ValueError: X has 16 features, but StandardScaler is expecting 9 features as input.

In [262]:
predict_game_winner('Avalanche','Panthers', team_data, rf_classifier, scaler, home_status)

NameError: name 'home_status' is not defined

In [282]:
def simulate_best_of_seven(team1, team2, team_data, model, scaler):
    home_away_sequence = ['home', 'home', 'away', 'away', 'home', 'away', 'home']
    team1_wins = 0
    team2_wins = 0
    game_count = 0

    for setting in home_away_sequence:
        if team1_wins == 4 or team2_wins == 4:
            break

        if setting == 'home':
            winner = predict_game_winner(team1, team2, team_data, model, scaler, 'home')
        else:
            winner = predict_game_winner(team1, team2, team_data, model, scaler, 'away')

        if winner == team1:
            team1_wins += 1
        else:
            team2_wins += 1

        game_count += 1
        print(f"Current Score - {team1}: {team1_wins}, {team2}: {team2_wins}")

    return {
        'winner': team1 if team1_wins == 4 else team2,
        'team1_wins': team1_wins,
        'team2_wins': team2_wins,
        'games_played': game_count
    }

In [283]:
def simulate_playoff_round(matchups, team_data, model, scaler):
    results = []
    for team1, team2 in matchups:
        series_result = simulate_best_of_seven(team1, team2, team_data, model, scaler)
        results.append(series_result)
    return results

In [284]:
playoff_matchups = [
    ('Bruins', 'Lightning'),
    ('Panthers', 'Maple Leafs'),
    ('Rangers', 'Capitals'),
    ('Islanders', 'Hurricanes'),
    ('Stars', 'Golden Knights'),
    ('Avalanche', 'Jets'),
    ('Canucks','Predators'),
    ('Oilers', 'Kings')
]

In [285]:
playoff_results = simulate_playoff_round(playoff_matchups, team_data, rf_classifier, scaler)
for result in playoff_results:
    print(f"{result['winner']} wins series {result['team1_wins']}-{result['team2_wins']} in {result['games_played']} games.")



Game setting: Bruins at home vs Lightning
Features used for prediction: [[ 0.61684067  2.64185845 -5.50996191 -2.55547021 -0.3490617   0.12989634
  -4.29387645 13.39618521  9.80342662]]
Predicted winner: Lightning
Current Score - Bruins: 0, Lightning: 1
Game setting: Bruins at home vs Lightning
Features used for prediction: [[ 0.61684067  2.64185845 -5.50996191 -2.55547021 -0.3490617   0.12989634
  -4.29387645 13.39618521  9.80342662]]
Predicted winner: Lightning
Current Score - Bruins: 0, Lightning: 2
Game setting: Lightning at home vs Bruins
Features used for prediction: [[ 0.80718137  2.60943869 -6.43508165 -3.01658303  0.10173593  0.31240892
  -4.3939864  12.29603788  7.29719783]]
Predicted winner: Lightning
Current Score - Bruins: 0, Lightning: 3
Game setting: Lightning at home vs Bruins
Features used for prediction: [[ 0.80718137  2.60943869 -6.43508165 -3.01658303  0.10173593  0.31240892
  -4.3939864  12.29603788  7.29719783]]
Predicted winner: Lightning
Current Score - Bruins: 



In [234]:
importances = rf_classifier.feature_importances_
print("Feature importances:", importances)

Feature importances: [0.03665626 0.33999106 0.05208181 0.04376388 0.04690843 0.02682716
 0.03591272 0.03147343 0.33394265 0.02603247 0.02641014]
