In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from datetime import datetime  

# Load the skater dataset from the project's data directory.
df = pd.read_csv('../data/current.csv')

# Exclude every row for Taylor Hall, discard the columns that are not used
# further down, and keep only the rows that have no missing values.
df = df.loc[df['skaterFullName'] != 'Taylor Hall']
df = df.drop(columns=['skaterFullName', 'teamAbbrevs', 'vs', 'shoots', 'row', 'date'])
df = df.dropna()


def _mmss_to_seconds(value):
    """Convert a "MM:SS" value into a whole number of seconds.

    The value is passed through ``str`` first, so non-string cells are
    accepted as long as their text form matches ``%M:%S``.

    Raises:
        ValueError: if the text does not match the ``%M:%S`` format.
    """
    parsed = datetime.strptime(str(value), "%M:%S")
    return parsed.minute * 60 + parsed.second


# Store average power-play time on ice as seconds instead of "MM:SS" text.
df['avgPowerplayToi'] = df['avgPowerplayToi'].apply(_mmss_to_seconds)

# Encode position as a number (C -> 3, R/L -> 2, D -> 1). The result is assigned
# back to the column: calling replace(..., inplace=True) on df['position'] is a
# chained in-place update, which does not modify df when pandas Copy-on-Write
# is active. Values outside the mapping are left unchanged.
df['position'] = df['position'].replace({'C': 3, 'R': 2, 'L': 2, 'D': 1})

# Columns fed to the model; the prediction target is the 'scored' column.
features = [
    'ppPoints',
    'gamesPlayed',
    'avgPowerplayToi',
    'team_PEN/GP',
    'team_PP%',
    'team_PK%',
    'team_GA',
    'team_G',
    'enemy_PEN/GP',
    'enemy_PP%',
    'enemy_PK%',
    'enemy_GA',
    'enemy_G',
]

y = df['scored'].values
X = df[features].values

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y. Since the classes are being
# rebalanced below, passing stratify=y may be worth considering - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Both resamplers are constructed, but only the oversampler is applied.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# Resample the training portion only; the held-out test portion stays exactly
# as it was split. To under-sample instead, call undersampler.fit_resample here.
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# Three random forests that differ only in their number of trees.
# NOTE(review): all three use random_state=42, so their trees are presumably
# heavily correlated and the vote may add little diversity - confirm.
clf1, clf2, clf3 = (
    RandomForestClassifier(n_estimators=n_trees, random_state=42)
    for n_trees in (100, 50, 150)
)

# Combine the forests with hard voting (the voting='hard' option).
ensemble_clf = VotingClassifier(
    estimators=[
        ('clf1', clf1),
        ('clf2', clf2),
        ('clf3', clf3),
    ],
    voting='hard',
)

# Fit on the resampled training data, then predict the held-out test rows.
y_pred = ensemble_clf.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test labels.
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Accuracy: 0.78
Precision: 0.47
Recall: 0.41
