In [190]:
# set matplotlib backend to inline
%matplotlib inline

# import modules
from sklearn import datasets
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# load data
# wine = datasets.load_wine()
#print(wine.DESCR)

# this dataset has 13 features, we will only choose a subset of these
# df_wine = pd.DataFrame(wine.data, columns=wine.feature_names)
# selected_features = ['alcohol', 'flavanoids', 'color_intensity', 'ash']

# extract the data as numpy arrays of features, X, and target, y
# X = df_wine[selected_features].values
# y = wine.target


In [191]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- Historical game-level data ----------------------------------------------
# Descriptive / identifier columns are dropped so that only `team`, `season`
# and the per-game statistics remain for aggregation.
raw_nhl_data = pd.read_csv('moneypack/all_teams.csv')
columns_to_drop_from_total = ['name', 'gameId', 'playerTeam', 'opposingTeam', 'home_or_away', 'gameDate', 'position',
                              'situation', 'iceTime']
raw_nhl_data = raw_nhl_data.drop(columns=columns_to_drop_from_total)

# Exclude the 2023 season (it is loaded from its own file below).
# The comparison is done numerically: read_csv parses a numeric `season`
# column as integers, and comparing integers against the string '2023'
# is always "not equal", which would silently keep every row.
season_as_number = pd.to_numeric(raw_nhl_data['season'], errors='coerce')
raw_nhl_data = raw_nhl_data[season_as_number != 2023]

# --- 2023 season data ----------------------------------------------------------
raw_nhl_data_2023 = pd.read_csv('moneypack/all_teams_2023.csv')
# This file gets a constant `playoffGame` column of zeros so that its columns
# line up with the historical table.
raw_nhl_data_2023['playoffGame'] = 0
columns_to_drop_from_2023 = ['games_played', 'name', 'position', 'situation', 'team.1', 'iceTime']
raw_nhl_data_2023 = raw_nhl_data_2023.drop(columns=columns_to_drop_from_2023)

# Column sets of both tables, kept around for schema comparison.
columns_all = raw_nhl_data.columns
columns_2023 = raw_nhl_data_2023.columns

# --- One row per (team, season) -------------------------------------------------
# numeric_only=True keeps the aggregation working on pandas >= 2.0, where
# averaging a leftover non-numeric column raises instead of being skipped.
by_team_season_all = raw_nhl_data.groupby(['team', 'season']).mean(numeric_only=True)

# Target: 1 when the season's mean `playoffGame` is positive, else 0.
# NOTE(review): assumes `playoffGame` is a non-negative per-game indicator, so a
# positive mean means at least one playoff game was played - confirm.
by_team_season_all['playoff_qualified'] = (by_team_season_all['playoffGame'] > 0).astype(int)

# Features are every aggregated column except the target and the column it
# was derived from (keeping `playoffGame` would leak the label).
X = by_team_season_all.drop(columns=['playoff_qualified', 'playoffGame'])
y = by_team_season_all['playoff_qualified']

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
# Standardize the numerical features using StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
# print(pd.DataFrame(X_train_scaled, columns=X.columns).head())


In [None]:
# Absolute pairwise correlation between all feature columns of X.
corr_matrix = X.corr().abs()

# Render the matrix as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(100, 100))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='GnBu',
    linewidths=0.2,
    vmin=0,
    vmax=1,
    ax=ax,
)
ax.set_xlabel('Features')
ax.set_ylabel('Features')
ax.set_title('Feature Importances using Correlation Matrix Heatmap')
plt.show()

In [None]:

# Correlation of every feature column with the target, ordered from the most
# positive value downwards.
corr_with_target = X.corrwith(y).sort_values(ascending=False)

# Keep only the features whose correlation with the target exceeds 0.1.
# NOTE(review): this also discards strongly *negative* correlations; if those
# should count as informative, filter on the absolute value instead - confirm.
corr_with_target = corr_with_target[corr_with_target > 0.1]

# Single-column heatmap of the retained correlations.
plt.figure(figsize=(4, 14))
sns.heatmap(corr_with_target.to_frame(), cmap='GnBu', annot=True)
plt.title('Correlation with Target Variable')
# plt.show()

# Two-column table (feature name, correlation score) of the retained features.
# reset_index returns a new object, so its result has to be assigned.
corr_with_target_frame = corr_with_target.rename_axis('feature').reset_index(name='score')


In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Fit a chi-square univariate selector (keeping 5 features) on the full table.
best_features = SelectKBest(chi2, k=5)
best_features.fit(X, y)

# Per-feature chi-square scores, and the names of the features the selector kept.
scores = best_features.scores_
selected_features = X.columns[best_features.get_support()]

# Rank every feature from highest to lowest score.
sorted_idxs = np.flip(np.argsort(scores))
sorted_scores = scores[sorted_idxs]
sorted_feature_names = X.columns.to_numpy()[sorted_idxs]

# Ranked table, restricted to the features scoring above 1.0.
ranked_features = pd.DataFrame({"feature": sorted_feature_names, "score": sorted_scores})
score_above_one = ranked_features['score'] > 1.0
sorted_features_with_scores = ranked_features[score_above_one]

# Horizontal bar chart of all ranked scores (not only the filtered ones).
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(x=sorted_scores, y=sorted_feature_names, ax=ax)
ax.set_xlabel('Scores')
ax.set_ylabel('Features')
ax.set_title('Feature Importances using Univariate Feature Selection (Chi-square test)')
plt.show()

In [None]:
# The chi-square ranking alone defines the important features; the left merge
# with the correlation scores is currently disabled:
# selected_important_features_data = pd.merge(sorted_features_with_scores, corr_with_target_frame, how ='left', on =['feature'])
selected_important_features_data = sorted_features_with_scores

# Names of the retained features as a NumPy array.
important_features = selected_important_features_data["feature"].to_numpy()