In [62]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load all data from the provided CSV files
def load_all_data(csv_files):
    """Load every readable CSV in *csv_files* and stack them into one frame.

    A file that fails to load is reported on stdout and skipped, so a single
    bad path does not abort the whole run.

    Args:
        csv_files: Iterable of paths (or anything ``pd.read_csv`` accepts).

    Returns:
        One DataFrame holding the rows of all successfully loaded files,
        re-indexed from 0.

    Raises:
        ValueError: If not a single file could be loaded.
    """
    data_frames = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
        except Exception as e:  # best-effort load: report and keep going
            print(f"Error loading file {file}: {e}")
        else:
            data_frames.append(df)

    if not data_frames:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # raise the same exception type with a message naming the real cause.
        raise ValueError("No CSV files could be loaded; nothing to concatenate")

    return pd.concat(data_frames, ignore_index=True)

# Prepare data for training and testing
def prepare_data(data):
    """Build the feature matrix and target from the raw standings frame.

    The ``Year`` column is cast to ``str`` and rows missing any feature or the
    ``W/L%`` target are dropped. The caller's DataFrame is left untouched.

    Args:
        data: DataFrame containing ``Year``, ``W/L%`` and the five feature
            columns listed in ``feature_columns`` below.

    Returns:
        Tuple ``(X, y, data)``: the feature columns, the ``W/L%`` target
        (win percentage), and the cleaned frame they were taken from.
    """
    feature_columns = ['NET_RATING', 'OFF_RATING', 'DEF_RATING', 'AST_PCT', 'REB_PCT']

    # assign() returns a new frame, so the input is not mutated in place.
    data = data.assign(Year=data['Year'].astype(str))

    # Keep only rows where every feature and the target are present.
    data = data.dropna(subset=feature_columns + ['W/L%'])

    X = data[feature_columns]
    y = data['W/L%']  # win percentage is the regression target

    return X, y, data

# Train the model and evaluate its performance
def train_model(X, y):
    """Fit a gradient boosting regressor and score it on a held-out split.

    Args:
        X: Feature matrix.
        y: Regression target aligned with ``X``.

    Returns:
        Tuple ``(model, accuracy)``: the fitted ``GradientBoostingRegressor``
        and its R-squared on the 20% test split. Note that ``accuracy`` is an
        R-squared value, not a classification accuracy.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Gradient boosting combines several team stats (net rating, offensive
    # rating, ...) into one regression estimate of the target; the predicted
    # values can then be used as scores to rank teams.
    # The estimator is seeded like the split so repeated runs are reproducible.
    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = r2_score(y_test, y_pred)  # R-squared as a measure of fit

    return model, accuracy

# Predict the champion using the trained model
def predict_champion(data, model):
    """Rank the most recent season's teams by the model's predicted score.

    Args:
        data: DataFrame with a ``Year`` column and the five feature columns
            listed in ``feature_columns`` below.
        model: Fitted estimator exposing ``predict(features)``.

    Returns:
        Tuple ``(predicted_champion, recent_data)``: the top-ranked row as a
        Series, and the latest season's rows sorted by ``Predicted_Score``
        in descending order.

    Raises:
        ValueError: If ``data`` is empty or has no rows for the latest season.
    """
    feature_columns = ['NET_RATING', 'OFF_RATING', 'DEF_RATING', 'AST_PCT', 'REB_PCT']

    if data.empty:
        raise ValueError("Cannot predict a champion from empty data")

    # The latest season is the maximum of the 'Year' column.
    recent_year = data['Year'].max()
    # Copy so the new score column is not written back into the caller's frame.
    recent_data = data[data['Year'] == recent_year].copy()
    if recent_data.empty:
        # Without this guard the iloc[0] below fails with an opaque IndexError.
        raise ValueError("No rows found for the most recent season")

    # Predict success scores
    recent_data['Predicted_Score'] = model.predict(recent_data[feature_columns])

    # Highest predicted score first
    recent_data = recent_data.sort_values(by='Predicted_Score', ascending=False)

    predicted_champion = recent_data.iloc[0]
    return predicted_champion, recent_data

# Main execution
# One standings file per conference (Eastern first, then Western) for each
# season from 2020 through 2024.
csv_files = [
    f"last5seasonsrecords/NBA_{season}_{conference}_Conference_StandingsN_updated.csv"
    for season in range(2020, 2025)
    for conference in ("Eastern", "Western")
]

# Load, clean, and fit in sequence.
all_data = load_all_data(csv_files)
X, y, processed_data = prepare_data(all_data)
model, accuracy = train_model(X, y)

# Report fit quality: R-squared measures how well the regression predicts y,
# which here is the win percentage ('W/L%').
print(f"Prediction Accuracy (R-squared): {accuracy:.2%}")

# Rank the latest season's teams and take the top one as the predicted champion.
champion, ranked_teams = predict_champion(processed_data, model)

print("\nPredicted NBA Champion:")
print(champion[['Team', 'Year', 'Predicted_Score']])

print("\nTop 5 Contenders:")
print(ranked_teams.head(5)[['Team', 'Year', 'Predicted_Score']])


Prediction Accuracy (R-squared): 94.63%

Predicted NBA Champion:
Team               Boston Celtics*
Year                       2023-24
Predicted_Score           0.778299
Name: 120, dtype: object

Top 5 Contenders:
                        Team     Year  Predicted_Score
120          Boston Celtics*  2023-24         0.778299
135   Oklahoma City Thunder*  2023-24         0.692819
137  Minnesota Timberwolves*  2023-24         0.684645
136          Denver Nuggets*  2023-24         0.684015
138    Los Angeles Clippers*  2023-24         0.633531
