In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Loading the datasets
# The four CSV exports share one folder; naming it once means the notebook can
# be pointed at another copy of the data by editing a single line.
DATA_DIR = 'C:/Users/patron/Desktop/MATH 748/NBA Data'

games = pd.read_csv(f'{DATA_DIR}/games.csv')
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the cell output echoes this read_csv line, which looks like a
# mixed-dtype warning for games_details.csv - confirm against the full output.
games_details = pd.read_csv(f'{DATA_DIR}/games_details.csv', low_memory=False)
players = pd.read_csv(f'{DATA_DIR}/players.csv')
ranking = pd.read_csv(f'{DATA_DIR}/ranking.csv')

# Label -> frame, in the order the reports below are printed.
datasets = {
    'Games': games,
    'Games Details': games_details,
    'Players': players,
    'Ranking': ranking,
}

# Basic exploration to check the shape and first few rows of each dataset
for position, (label, frame) in enumerate(datasets.items()):
    leading_newline = "" if position == 0 else "\n"
    print(f"{leading_newline}{label} Dataset:")
    print(frame.shape)
    print(frame.head())

# Checking for missing values in each dataset
for label, frame in datasets.items():
    print(f"\nMissing Values in {label} Dataset:")
    print(frame.isnull().sum())

# Handling any missing values
# Numeric gaps are filled with the column mean. numeric_only=True restricts the
# mean to numeric columns (recent pandas raises when asked to average text),
# so text columns keep their NaN values.
# The previous dropna() result was overwritten on the very next line and never
# used, so it is no longer computed.
games_details_cleaned = games_details.fillna(games_details.mean(numeric_only=True))

# Feature selection
# NOTE(review): both frames below are derived from the raw tables, not from
# games_details_cleaned - confirm that is intended.
games_selected = games.drop(columns=['GAME_ID', 'GAME_STATUS_TEXT'])
games_details_selected = games_details.drop(columns=['PLAYER_ID', 'NICKNAME', 'COMMENT'])

# Encoding the categorical variables
# cat.codes maps each distinct abbreviation to an integer (missing values become -1).
games_details_selected['TEAM_ABBREVIATION'] = (
    games_details_selected['TEAM_ABBREVIATION'].astype('category').cat.codes
)

# Scaling some of the features
# The four columns are standardised in place, so from here on games_selected
# holds z-scores (not raw points / rebounds) in these columns.
SCALED_COLUMNS = ['PTS_home', 'PTS_away', 'REB_home', 'REB_away']
scaler = StandardScaler()
games_selected[SCALED_COLUMNS] = scaler.fit_transform(games_selected[SCALED_COLUMNS])

  games_details = pd.read_csv('C:/Users/patron/Desktop/MATH 748/NBA Data/games_details.csv')


Games Dataset:
(26651, 21)
  GAME_DATE_EST   GAME_ID GAME_STATUS_TEXT  HOME_TEAM_ID  VISITOR_TEAM_ID  \
0    2022-12-22  22200477            Final    1610612740       1610612759   
1    2022-12-22  22200478            Final    1610612762       1610612764   
2    2022-12-21  22200466            Final    1610612739       1610612749   
3    2022-12-21  22200467            Final    1610612755       1610612765   
4    2022-12-21  22200468            Final    1610612737       1610612741   

   SEASON  TEAM_ID_home  PTS_home  FG_PCT_home  FT_PCT_home  ...  AST_home  \
0    2022    1610612740     126.0        0.484        0.926  ...      25.0   
1    2022    1610612762     120.0        0.488        0.952  ...      16.0   
2    2022    1610612739     114.0        0.482        0.786  ...      22.0   
3    2022    1610612755     113.0        0.441        0.909  ...      27.0   
4    2022    1610612737     108.0        0.429        1.000  ...      22.0   

   REB_home  TEAM_ID_away  PTS_away  FG_P

In [None]:
# Summary statistics for the headline columns of the games table
key_columns = ['PTS_home', 'PTS_away', 'FG_PCT_home', 'FG_PCT_away', 'REB_home', 'REB_away']
key_stats = games_selected[key_columns]
print("\nSummary statistics for Games dataset:")
print(key_stats.describe())

# Side-by-side histograms (with KDE overlay) of the home and away points columns
plt.figure(figsize=(14, 6))
histogram_panels = [
    ('PTS_home', 'blue', 'Distribution of Points for Home Teams'),
    ('PTS_away', 'green', 'Distribution of Points for Away Teams'),
]
for panel_number, (column, colour, heading) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.histplot(games_selected[column], kde=True, color=colour)
    plt.title(heading)

plt.show()

# One boxplot per points column, home next to away
plt.figure(figsize=(12, 6))

points_only = games_selected[['PTS_home', 'PTS_away']]
sns.boxplot(data=points_only)
plt.title('Boxplot Comparison of Home and Away Points')
plt.ylabel('Points')
plt.xticks([0, 1], ['Home Points', 'Away Points'])

plt.show()

# Pairwise correlations between the headline columns, drawn as an annotated heatmap
plt.figure(figsize=(10, 8))
corr_matrix = key_stats.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Key Game Features')
plt.show()

# How many games carry each value of HOME_TEAM_WINS
win_counts = games_selected['HOME_TEAM_WINS'].value_counts()

plt.figure(figsize=(6, 6))
sns.barplot(x=win_counts.index, y=win_counts.values, palette='Set2')
plt.title('Home vs Away Wins')
plt.xticks([0, 1], ['Away Wins', 'Home Wins'])
plt.ylabel('Number of Games')
plt.show()

# Per-season mean of the two points columns
seasonal_points = games_selected.groupby('SEASON')[['PTS_home', 'PTS_away']].mean()

# One line per side, seasons on the x axis
plt.figure(figsize=(10, 6))
trend_lines = [
    ('PTS_home', 'Home Points', 'blue'),
    ('PTS_away', 'Away Points', 'green'),
]
for column, legend_label, colour in trend_lines:
    plt.plot(seasonal_points.index, seasonal_points[column], label=legend_label, marker='o', color=colour)
plt.title('Average Points per Game for Home and Away Teams Over Seasons')
plt.xlabel('Season')
plt.ylabel('Average Points')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Importing necessary libraries
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

# Load preprocessed dataset
# NOTE(review): `games_cleaned` is not created by any cell shown above this one;
# it has to exist in the kernel already - confirm where it is built.
#
# PTS_home / PTS_away are excluded from the features: the recent-games cell
# drops them from its feature frame too, and (presumably) HOME_TEAM_WINS is
# simply PTS_home > PTS_away, so keeping them would hand the answer to the model.
# Only numeric / boolean columns are kept because VarianceThreshold and
# LogisticRegression cannot consume dates or text.
SCORE_COLUMNS = ['PTS_home', 'PTS_away']
X = (
    games_cleaned
    .drop(columns=['HOME_TEAM_WINS'])
    .drop(columns=SCORE_COLUMNS, errors='ignore')
    .select_dtypes(include=['number', 'bool'])
)  # Features
y = games_cleaned['HOME_TEAM_WINS']  # Target variable (win/loss)

# Step 1: Correlation Analysis
plt.figure(figsize=(10, 8))
corr_matrix = X.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()

# Step 2: Variance Thresholding
# Remove low-variance features
variance_selector = VarianceThreshold(threshold=0.01)
X_high_variance = variance_selector.fit_transform(X)

print("Number of features after variance thresholding:", X_high_variance.shape[1])

# Step 3: Recursive Feature Elimination (RFE)
# Use Logistic Regression for RFE
N_FEATURES_TO_SELECT = 5
logreg = LogisticRegression(max_iter=1000, random_state=42)
rfe_selector = RFE(logreg, n_features_to_select=N_FEATURES_TO_SELECT, step=1)
X_rfe = rfe_selector.fit_transform(X_high_variance, y)

# Print selected features
# Two masks are chained: the first maps X's columns to those that survived the
# variance filter, the second picks the RFE winners among the survivors.
selected_features = X.columns[variance_selector.get_support()][rfe_selector.get_support()]
# The count comes from the selection itself so the message cannot drift from
# n_features_to_select (it used to say "Top 10" while selecting 5).
print(f"Top {len(selected_features)} selected features:", list(selected_features))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import numpy as np

# Data splitting
# X_rfe / y come from the feature-selection cell; 30% of the rows are held out.
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42)

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))

# Random Forest
# 5-fold grid search over forest size and depth; the refit best estimator is
# then scored on the held-out rows.
rf = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train, y_train)
y_pred_rf = rf_grid.best_estimator_.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
gb_grid = GridSearchCV(gb, gb_params, cv=5, scoring='accuracy')
gb_grid.fit(X_train, y_train)
y_pred_gb = gb_grid.best_estimator_.predict(X_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))

# Linear Regression for Point Margin Prediction
# NOTE(review): if X_rfe contains PTS_home / PTS_away, the margin below is an
# exact linear function of the features - check the selected feature list.
y_margin = games_cleaned['PTS_home'] - games_cleaned['PTS_away']
X_train_margin, X_test_margin, y_train_margin, y_test_margin = train_test_split(
    X_rfe, y_margin, test_size=0.3, random_state=42)
linreg = LinearRegression()
linreg.fit(X_train_margin, y_train_margin)
y_pred_margin = linreg.predict(X_test_margin)
print("Linear Regression MAE:", mean_absolute_error(y_test_margin, y_pred_margin))
# The closing parenthesis of print() was missing here, which made the whole cell a SyntaxError.
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test_margin, y_pred_margin)))

In [None]:
import pandas as pd
import numpy as np

# Assume `games_cleaned` is the preprocessed dataset from earlier steps
# NOTE(review): no visible cell creates `games_cleaned`; confirm where it is built.
# The index of `games_cleaned` is assumed to be unique, because the new columns
# are computed on a date-sorted copy and aligned back by index label.


def _streak_before_each_game(won):
    """Return, for each game, the number of consecutive wins immediately before it.

    `won` is a 0/1 (or boolean) Series for a single team in chronological order.
    The result has the same index as `won`; the first game gets 0.
    """
    won = won.astype(int)
    # Every loss opens a new run, so the running count of losses labels the runs.
    run_id = (won == 0).cumsum()
    streak_after_game = won.groupby(run_id).cumsum()
    # Shift by one so each game only sees results from earlier games.
    return streak_after_game.shift(1).fillna(0)


# Chronological copy: streaks and date gaps are only meaningful oldest-first.
chrono = games_cleaned.assign(
    GAME_DATE_EST=pd.to_datetime(games_cleaned['GAME_DATE_EST'])
).sort_values('GAME_DATE_EST', kind='stable')
home_won = chrono['HOME_TEAM_WINS'].astype(int)
# The visitor wins when the home team does not. (`~` on integer flags is a
# bitwise NOT and yields -1 / -2, so it is not used.)
visitor_won = 1 - home_won

# Adding contextual features
# Streaks are computed inside each team's group (the shift no longer leaks the
# previous row of a different team). As before, the home streak only looks at a
# team's home games and the away streak only at its away games.
games_cleaned['Home_Win_Streak'] = (
    home_won.groupby(chrono['HOME_TEAM_ID']).transform(_streak_before_each_game)
)
games_cleaned['Away_Win_Streak'] = (
    visitor_won.groupby(chrono['VISITOR_TEAM_ID']).transform(_streak_before_each_game)
)

# Rest days: days since the same team's previous game in the same role, taken
# from GAME_DATE_EST. A team's first game in the data has no predecessor and
# gets the placeholder value 7.
for side, team_column in (('Home', 'HOME_TEAM_ID'), ('Away', 'VISITOR_TEAM_ID')):
    days_since_previous = chrono.groupby(team_column)['GAME_DATE_EST'].diff().dt.days
    games_cleaned[f'Rest_Days_{side}'] = days_since_previous.fillna(7)

# Adding advanced basketball metrics
# Effective Field Goal Percentage (eFG%)
# NOTE(review): this is FG% + 0.5 * 3P%, a proxy only. The usual definition is
# (FGM + 0.5 * 3PM) / FGA, which needs makes/attempts columns.
for side in ('home', 'away'):
    games_cleaned[f'eFG_PCT_{side}'] = (
        games_cleaned[f'FG_PCT_{side}'] + 0.5 * games_cleaned[f'FG3_PCT_{side}']
    ).fillna(0)

# Player Efficiency Rating (PER)
# Using a simplified formula: PER = (PTS + AST + REB + STL + BLK) - (FGA - FGM) - TO
# NOTE(review): STL / BLK / FGA / FGM / TO columns do not appear in the visible
# part of the games.csv preview - confirm `games_cleaned` really has them.
for side in ('home', 'away'):
    games_cleaned[f'PER_{side}'] = (
        games_cleaned[f'PTS_{side}'] + games_cleaned[f'AST_{side}'] + games_cleaned[f'REB_{side}'] +
        games_cleaned[f'STL_{side}'] + games_cleaned[f'BLK_{side}'] -
        (games_cleaned[f'FGA_{side}'] - games_cleaned[f'FGM_{side}']) - games_cleaned[f'TO_{side}']
    ).fillna(0)

# Display the updated dataset's head to verify changes
print(games_cleaned[['Home_Win_Streak', 'Away_Win_Streak', 'Rest_Days_Home', 'Rest_Days_Away', 'eFG_PCT_home', 'eFG_PCT_away', 'PER_home', 'PER_away']].head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn look for every figure in this cell
sns.set(style="whitegrid", palette="muted", color_codes=True)

# 1. Overlaid histograms of the two win-streak columns
plt.figure(figsize=(12, 6))
streak_layers = [
    ('Home_Win_Streak', 'blue', 'Home Win Streak'),
    ('Away_Win_Streak', 'green', 'Away Win Streak'),
]
for column, colour, legend_label in streak_layers:
    sns.histplot(games_cleaned[column], kde=True, color=colour, label=legend_label)
plt.title('Distribution of Home and Away Win Streaks')
plt.xlabel('Win Streak')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# 2. Horizontal boxplots of the two rest-day columns
plt.figure(figsize=(10, 6))
rest_days = games_cleaned[['Rest_Days_Home', 'Rest_Days_Away']]
sns.boxplot(data=rest_days, orient='h')
plt.title('Boxplot of Rest Days for Home and Away Teams')
plt.yticks([0, 1], ['Home Rest Days', 'Away Rest Days'])
plt.xlabel('Days')
plt.show()

# 3. eFG% against points, one scatter layer per side
plt.figure(figsize=(12, 6))
scatter_layers = [
    ('home', 'Home', 'blue'),
    ('away', 'Away', 'green'),
]
for side, legend_label, colour in scatter_layers:
    sns.scatterplot(
        x=games_cleaned[f'eFG_PCT_{side}'],
        y=games_cleaned[f'PTS_{side}'],
        label=legend_label,
        color=colour,
    )
plt.title('Effective Field Goal Percentage (eFG%) vs. Points Scored')
plt.xlabel('eFG%')
plt.ylabel('Points Scored')
plt.legend()
plt.show()

# 4. Distribution of PER_home split by the value of HOME_TEAM_WINS
plt.figure(figsize=(10, 6))
home_result = games_cleaned['HOME_TEAM_WINS']
home_per = games_cleaned['PER_home']
sns.violinplot(x=home_result, y=home_per, palette='Set2')
plt.title('Player Efficiency Rating (PER) by Home Team Wins')
plt.xlabel('Home Team Wins (0 = Loss, 1 = Win)')
plt.ylabel('PER (Home Team)')
plt.show()

# 5. Correlation heatmap of the engineered columns together with the outcome flag
plt.figure(figsize=(10, 8))
new_features = [
    'Home_Win_Streak', 'Away_Win_Streak',
    'Rest_Days_Home', 'Rest_Days_Away',
    'eFG_PCT_home', 'eFG_PCT_away',
    'PER_home', 'PER_away',
    'HOME_TEAM_WINS',
]
corr_matrix = games_cleaned[new_features].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of New Features with Game Outcomes')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Splitting the data
# The final scores are dropped together with the target: the recent-games cell
# builds its feature frame without PTS_home / PTS_away, and (presumably)
# HOME_TEAM_WINS is just PTS_home > PTS_away, so keeping them leaks the answer.
# Only numeric / boolean columns are kept; the feature-engineering cell uses
# `.dt` on GAME_DATE_EST, so `games_cleaned` holds at least one datetime column
# that the boosted models cannot consume.
# NOTE(review): the remaining box-score columns (and PER_*, which includes PTS)
# are still same-game statistics - decide whether they are legitimate inputs.
TARGET_AND_SCORE_COLUMNS = ['HOME_TEAM_WINS', 'PTS_home', 'PTS_away']
X = (
    games_cleaned
    .drop(columns=TARGET_AND_SCORE_COLUMNS)
    .select_dtypes(include=['number', 'bool'])
)  # Features
y = games_cleaned['HOME_TEAM_WINS']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1. XGBoost Classifier
# Each model below is tuned with a 5-fold grid search on accuracy; the refit
# best estimator is then evaluated on the 30% hold-out.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, scoring='accuracy', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# 2. LightGBM Classifier
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
lgb_grid = GridSearchCV(lgb_model, lgb_params, cv=5, scoring='accuracy', n_jobs=-1)
lgb_grid.fit(X_train, y_train)
lgb_best = lgb_grid.best_estimator_
y_pred_lgb = lgb_best.predict(X_test)

print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb))

# 3. CatBoost Classifier
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_params = {
    'iterations': [50, 100, 200],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
cat_grid = GridSearchCV(cat_model, cat_params, cv=5, scoring='accuracy', n_jobs=-1)
cat_grid.fit(X_train, y_train)
cat_best = cat_grid.best_estimator_
y_pred_cat = cat_best.predict(X_test)

print("CatBoost Classification Report:")
print(classification_report(y_test, y_pred_cat))

# Comparing Accuracy Scores
xgb_acc = accuracy_score(y_test, y_pred_xgb)
lgb_acc = accuracy_score(y_test, y_pred_lgb)
cat_acc = accuracy_score(y_test, y_pred_cat)

print(f"XGBoost Accuracy: {xgb_acc:.4f}")
print(f"LightGBM Accuracy: {lgb_acc:.4f}")
print(f"CatBoost Accuracy: {cat_acc:.4f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Target variable: Point margin
y_margin = games_cleaned['PTS_home'] - games_cleaned['PTS_away']

# The margin is PTS_home - PTS_away, so neither score (nor the win flag) may be
# a feature; errors='ignore' keeps this working if `X` already lacks them.
# Non-numeric columns are removed because the regressors cannot consume them.
# NOTE(review): PER_home / PER_away are built from PTS as well - consider
# excluding them too.
X_margin = (
    X
    .drop(columns=['HOME_TEAM_WINS', 'PTS_home', 'PTS_away'], errors='ignore')
    .select_dtypes(include=['number', 'bool'])
)

# NOTE: this cell rebinds X_train / y_train and the rf_/gb_/xgb_ names, so after
# it runs `gb_best` and `xgb_best` refer to regressors, not classifiers.
X_train, X_test, y_train, y_test = train_test_split(X_margin, y_margin, test_size=0.3, random_state=42)

# 1. Random Forest Regression
# Grid searches below use 5 folds and pick the lowest mean absolute error.
rf_reg = RandomForestRegressor(random_state=42)
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20]
}
rf_grid = GridSearchCV(rf_reg, rf_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test)

# Random Forest Metrics
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print(f"Random Forest MAE: {rf_mae:.4f}, RMSE: {rf_rmse:.4f}")

# 2. Gradient Boosting Regression
gb_reg = GradientBoostingRegressor(random_state=42)
gb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
gb_grid = GridSearchCV(gb_reg, gb_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
gb_grid.fit(X_train, y_train)
gb_best = gb_grid.best_estimator_
y_pred_gb = gb_best.predict(X_test)

# Gradient Boosting Metrics
gb_mae = mean_absolute_error(y_test, y_pred_gb)
gb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_gb))
print(f"Gradient Boosting MAE: {gb_mae:.4f}, RMSE: {gb_rmse:.4f}")

# 3. XGBoost Regression
xgb_reg = XGBRegressor(random_state=42)
xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
xgb_grid = GridSearchCV(xgb_reg, xgb_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)

# XGBoost Metrics
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
xgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f"XGBoost MAE: {xgb_mae:.4f}, RMSE: {xgb_rmse:.4f}")

# Comparing Regression Models
print("\nRegression Model Performance:")
print(f"Random Forest: MAE = {rf_mae:.4f}, RMSE = {rf_rmse:.4f}")
print(f"Gradient Boosting: MAE = {gb_mae:.4f}, RMSE = {gb_rmse:.4f}")
print(f"XGBoost: MAE = {xgb_mae:.4f}, RMSE = {xgb_rmse:.4f}")

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Splitting the dataset
# The final scores are dropped along with the target (the recent-games cell
# builds its features without PTS_home / PTS_away, and the scores presumably
# determine HOME_TEAM_WINS outright). Only numeric / boolean columns are kept
# because SMOTE interpolates between rows and the models need numeric input.
TARGET_AND_SCORE_COLUMNS = ['HOME_TEAM_WINS', 'PTS_home', 'PTS_away']
X = (
    games_cleaned
    .drop(columns=TARGET_AND_SCORE_COLUMNS)
    .select_dtypes(include=['number', 'bool'])
)  # Features
y = games_cleaned['HOME_TEAM_WINS']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Applying SMOTE
# Only the training split is resampled; the test split keeps its original
# class balance so the reports below reflect real data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:")
print(y_train_smote.value_counts())

# Retraining Models on Balanced Data

# 1. Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_smote, y_train_smote)
y_pred_logreg = logreg.predict(X_test)
print("\nLogistic Regression After SMOTE:")
print(classification_report(y_test, y_pred_logreg))

# 2. Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)
y_pred_rf = rf.predict(X_test)
print("\nRandom Forest After SMOTE:")
print(classification_report(y_test, y_pred_rf))

# 3. Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train_smote, y_train_smote)
y_pred_gb = gb.predict(X_test)
print("\nGradient Boosting After SMOTE:")
print(classification_report(y_test, y_pred_gb))

# Comparing Accuracy Scores
logreg_acc = accuracy_score(y_test, y_pred_logreg)
rf_acc = accuracy_score(y_test, y_pred_rf)
gb_acc = accuracy_score(y_test, y_pred_gb)

print("\nModel Accuracy After SMOTE:")
print(f"Logistic Regression: {logreg_acc:.4f}")
print(f"Random Forest: {rf_acc:.4f}")
print(f"Gradient Boosting: {gb_acc:.4f}")

In [None]:
import pandas as pd

# Assuming `recent_games` contains recent game data for testing
# Ensure recent_games is preprocessed the same way as the training dataset
# NOTE(review): `recent_games` is not created by any visible cell.


def _columns_seen_in_fit(model, frame, default):
    """Return `frame` restricted to the columns `model` was fitted on.

    Fitted estimators that were trained on a DataFrame expose the training
    column names as `feature_names_in_`; using them keeps the prediction input
    in the same columns and order as the training input. When the attribute is
    absent, `default` is returned unchanged.
    """
    fitted_columns = getattr(model, 'feature_names_in_', None)
    if fitted_columns is None:
        return default
    return frame[list(fitted_columns)]


# Prepare the recent games dataset
X_recent = recent_games.drop(columns=['HOME_TEAM_WINS', 'PTS_home', 'PTS_away'])  # Features
y_recent_class = recent_games['HOME_TEAM_WINS']  # Actual outcomes for classification
y_recent_margin = recent_games['PTS_home'] - recent_games['PTS_away']  # Actual margins for regression

# Classification Predictions
# `gb` is the GradientBoostingClassifier fitted in the SMOTE cell. `gb_best`
# cannot be used here: the regression cell rebinds it to a
# GradientBoostingRegressor, whose continuous output classification_report rejects.
y_pred_class = gb.predict(_columns_seen_in_fit(gb, recent_games, X_recent))

# Regression Predictions
# Using the best regression model (e.g., XGBoost Regression)
# `xgb_best` is the XGBRegressor after the regression cell has run.
y_pred_margin = xgb_best.predict(_columns_seen_in_fit(xgb_best, recent_games, X_recent))

# Evaluate Classification Predictions
from sklearn.metrics import classification_report, accuracy_score

print("Classification Report on Recent Games:")
print(classification_report(y_recent_class, y_pred_class))

accuracy_recent = accuracy_score(y_recent_class, y_pred_class)
print(f"Accuracy on Recent Games: {accuracy_recent:.4f}")

# Evaluate Regression Predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae_recent = mean_absolute_error(y_recent_margin, y_pred_margin)
rmse_recent = np.sqrt(mean_squared_error(y_recent_margin, y_pred_margin))

print("\nRegression Metrics on Recent Games:")
print(f"Mean Absolute Error: {mae_recent:.4f}")
print(f"Root Mean Square Error: {rmse_recent:.4f}")

# Compare Actual vs Predicted
comparison_df = pd.DataFrame({
    'Actual_Outcome': y_recent_class,
    'Predicted_Outcome': y_pred_class,
    'Actual_Margin': y_recent_margin,
    'Predicted_Margin': y_pred_margin
})
print("\nComparison of Actual vs Predicted:")
print(comparison_df.head(10))  # Display first 10 comparisons

In [None]:
# Adding win streak features
# This cell recomputes the same engineered columns as the earlier
# feature-engineering cell; every column is overwritten, so re-running is safe.
# The index of `games_cleaned` is assumed to be unique, because the new columns
# are computed on a date-sorted copy and aligned back by index label.


def _streak_before_each_game(won):
    """Return, for each game, the number of consecutive wins immediately before it.

    `won` is a 0/1 (or boolean) Series for a single team in chronological order.
    The result has the same index as `won`; the first game gets 0.
    """
    won = won.astype(int)
    # Every loss opens a new run, so the running count of losses labels the runs.
    run_id = (won == 0).cumsum()
    streak_after_game = won.groupby(run_id).cumsum()
    # Shift by one so each game only sees results from earlier games.
    return streak_after_game.shift(1).fillna(0)


# Chronological copy: streaks and date gaps are only meaningful oldest-first.
chrono = games_cleaned.assign(
    GAME_DATE_EST=pd.to_datetime(games_cleaned['GAME_DATE_EST'])
).sort_values('GAME_DATE_EST', kind='stable')
home_won = chrono['HOME_TEAM_WINS'].astype(int)
# The visitor wins when the home team does not. (`~` on integer flags is a
# bitwise NOT and yields -1 / -2, so it is not used.)
visitor_won = 1 - home_won

# Streaks are computed inside each team's group, so the shift cannot pull in
# the previous row of a different team.
games_cleaned['Home_Win_Streak'] = (
    home_won.groupby(chrono['HOME_TEAM_ID']).transform(_streak_before_each_game)
)
games_cleaned['Away_Win_Streak'] = (
    visitor_won.groupby(chrono['VISITOR_TEAM_ID']).transform(_streak_before_each_game)
)

# Adding rest days features
# Days since the same team's previous game in the same role; a team's first
# game in the data has no predecessor and gets the placeholder value 7.
for side, team_column in (('Home', 'HOME_TEAM_ID'), ('Away', 'VISITOR_TEAM_ID')):
    days_since_previous = chrono.groupby(team_column)['GAME_DATE_EST'].diff().dt.days
    games_cleaned[f'Rest_Days_{side}'] = days_since_previous.fillna(7)

# Effective Field Goal Percentage (eFG%)
# NOTE(review): this is FG% + 0.5 * 3P%, a proxy only. The usual definition is
# (FGM + 0.5 * 3PM) / FGA, which needs makes/attempts columns.
for side in ('home', 'away'):
    games_cleaned[f'eFG_PCT_{side}'] = (
        games_cleaned[f'FG_PCT_{side}'] + 0.5 * games_cleaned[f'FG3_PCT_{side}']
    ).fillna(0)

# Player Efficiency Rating (PER)
# Simplified: (PTS + AST + REB + STL + BLK) - (FGA - FGM) - TO
# NOTE(review): STL / BLK / FGA / FGM / TO columns do not appear in the visible
# part of the games.csv preview - confirm `games_cleaned` really has them.
for side in ('home', 'away'):
    games_cleaned[f'PER_{side}'] = (
        games_cleaned[f'PTS_{side}'] + games_cleaned[f'AST_{side}'] + games_cleaned[f'REB_{side}'] +
        games_cleaned[f'STL_{side}'] + games_cleaned[f'BLK_{side}'] -
        (games_cleaned[f'FGA_{side}'] - games_cleaned[f'FGM_{side}']) - games_cleaned[f'TO_{side}']
    ).fillna(0)

In [None]:
# Predicting outcomes and margins for recent games


def _recent_features_for(model):
    """Return the recent-games features in the columns `model` was fitted on.

    Falls back to `X_recent` when the fitted model does not expose
    `feature_names_in_`.
    """
    fitted_columns = getattr(model, 'feature_names_in_', None)
    if fitted_columns is None:
        return X_recent
    return recent_games[list(fitted_columns)]


# Classification uses `gb`, the GradientBoostingClassifier fitted in the SMOTE
# cell. `xgb_best` is an XGBRegressor once the regression cell has run, so using
# it for both lines put the predicted margin into the outcome column as well.
y_pred_class_recent = gb.predict(_recent_features_for(gb))  # Classification
y_pred_margin_recent = xgb_best.predict(_recent_features_for(xgb_best))  # Regression

# Comparing predictions with actual results
comparison = pd.DataFrame({
    'Actual Outcome': y_recent_class,
    'Predicted Outcome': y_pred_class_recent,
    'Actual Margin': y_recent_margin,
    'Predicted Margin': y_pred_margin_recent
})
print(comparison.head())