<a href="https://colab.research.google.com/github/nikhilcoding0013/Basketball-Game-Predictor/blob/main/1B_WhartonDataSciComp_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Data Loading and Cleaning**

In [3]:
import pandas as pd
import numpy as np
import csv
df = pd.read_csv('Copy of games_2022 - games_2022.csv')
df['OT_length_min_tot'] = df['OT_length_min_tot'].map(lambda x: 0 if pd.isna(x) else x)
df['notD1_incomplete']=df['notD1_incomplete'].map({False:0, True:1})
df['win'] = (df['opponent_team_score'] < df['team_score']).astype(int)
print(df.head())
print(df['notD1_incomplete'][2232])

          game_id   game_date                       team  FGA_2  FGM_2  FGA_3  \
0  game_2022_2011  2021-12-30      georgia_lady_bulldogs     50     22     11   
1  game_2022_2011  2021-12-30                 lsu_tigers     50     24     11   
2  game_2022_2012  2021-12-30            missouri_tigers     43     18     15   
3  game_2022_2012  2021-12-30   south_carolina_gamecocks     55     23     21   
4  game_2022_2013  2021-12-30  tennessee_lady_volunteers     41     20     15   

   FGM_3  FTA  FTM  AST  ...  notD1_incomplete  OT_length_min_tot  rest_days  \
0      5    6    3   14  ...                 0                0.0        9.0   
1      4   15    8   15  ...                 0                0.0        3.0   
2      7   16   13   10  ...                 0                5.0        8.0   
3      6    9    5   15  ...                 0                5.0        9.0   
4      4   15   10   16  ...                 0                0.0        3.0   

   attendance  tz_dif_H_E  prev_

In [4]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

def report_nans(df):
    nan_report = df.isna().sum()
    nan_instances = nan_report[nan_report > 0]

    if nan_instances.empty:
        print("No NaN values found in the DataFrame.")
    else:
        print("NaN values found in the following columns:")
        for col, count in nan_instances.items():
            print(f"Column: {col}, NaN Count: {count}")

df.fillna(0, inplace=True)
df.columns

report_nans(df)

No NaN values found in the DataFrame.


## **Collection of ELO ratings**

In [5]:
elo_ratings = {}  # This will store each team's current Elo rating

def get_elo(team):
    if team not in elo_ratings:
        elo_ratings[team] = 1500  # Default starting Elo
    return elo_ratings[team]

# Function which calculates expected win probability for team_A
def expected_win_prob(team_A, team_B):
    """Calculate the probability of Team A winning against Team B based on Elo ratings."""
    R_A = get_elo(team_A)
    R_B = get_elo(team_B)
    return 1 / (1 + 10 ** ((R_B - R_A) / 400))

In [6]:
# Predict necessary stat weights for the updating of ELO

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Merge dataset on game_id to pair teams
df_games = df.merge(df, on="game_id", suffixes=("_A", "_B"))
# Keep only valid matchups (team_A ≠ team_B)
df_games = df_games[df_games["team_A"] != df_games["team_B"]]

# Select necessary columns
columns_needed = [
    "team_A", "team_B", "team_score_A", "team_score_B",
    "FGA_3_A", "FGM_3_A", "FGA_2_A", "FGM_2_A", "FTA_A", "FTM_A",
    "AST_A", "BLK_A", "STL_A", "TOV_A", "DREB_A", "OREB_A",
    "FGA_3_B", "FGM_3_B", "FGA_2_B", "FGM_2_B", "FTA_B", "FTM_B",
    "AST_B", "BLK_B", "STL_B", "TOV_B", "DREB_B", "OREB_B"
]
df_games = df_games[columns_needed].drop_duplicates()

# Calculate differences for each advanced stat
stat_list = ['AST', 'BLK', 'STL', 'TOV', 'DREB', 'OREB']
for stat in stat_list:
    df_games[f'{stat}_diff'] = df_games[f'{stat}_A'] - df_games[f'{stat}_B']
# Determine the result column (1 if Team A won, 0 otherwise)
df_games['result'] = (df_games['team_score_A'] > df_games['team_score_B']).astype(int)
# Select features and target
X = df_games[[f'{stat}_diff' for stat in stat_list]]
y = df_games['result']  # Binary outcome: 1 if Team A wins, 0 if not

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Retrieve the learned coefficients
learned_coeffs = model.coef_[0]
# Map coefficients to the stat_weighting dictionary
predicted_stat_weighting = dict(zip(stat_list, learned_coeffs))
print("Predicted advanced stat weights:", predicted_stat_weighting)

Predicted advanced stat weights: {'AST': np.float64(1.209689781375624), 'BLK': np.float64(0.11042661188571071), 'STL': np.float64(0.1753022616798368), 'TOV': np.float64(-4.651538446135684), 'DREB': np.float64(5.467526587946813), 'OREB': np.float64(-0.16406959836102702)}


In [7]:
# ELO model

# ELO Updater: Uses expected win probability and

def update_elo_with_advanced_stats(team_A, team_B, score_A, score_B,
                                   FGM_3_A, FGA_3_A, FGM_2_A, FGA_2_A, FTM_A, FTA_A, AST_A, BLK_A, STL_A, TOV_A, DREB_A, OREB_A,
                                   FGM_3_B, FGA_3_B, FGM_2_B, FGA_2_B, FTM_B, FTA_B, AST_B, BLK_B, STL_B, TOV_B, DREB_B, OREB_B):
    """Update Elo ratings with shooting efficiency and advanced stats like AST, BLK, STL, TOV, REB."""

    K = 40 # controls rate of change for elo

    P_A = expected_win_prob(team_A, team_B)  # Expected probability of Team A winning
    P_B = 1 - P_A  # Expected probability of Team B winning

    # Determine actual result
    S_A, S_B = (1, 0) if score_A > score_B else (0, 1)  # 1 if win, 0 if loss

    # Calculate shooting efficiencies
    eff_3_A = FGM_3_A / FGA_3_A if FGA_3_A > 0 else 0  # 3PT efficiency
    eff_2_A = FGM_2_A / FGA_2_A if FGA_2_A > 0 else 0  # 2PT efficiency
    eff_ft_A = FTM_A / FTA_A if FTA_A > 0 else 0       # Free throw efficiency

    eff_3_B = FGM_3_B / FGA_3_B if FGA_3_B > 0 else 0
    eff_2_B = FGM_2_B / FGA_2_B if FGA_2_B > 0 else 0
    eff_ft_B = FTM_B / FTA_B if FTA_B > 0 else 0

    # Performance-based adjustments
    performance_factor = 10
    shooting_adjustment_A = performance_factor * (
        (eff_3_A - 0.35) + (eff_2_A - 0.50) + (eff_ft_A - 0.75)
    )
    shooting_adjustment_B = performance_factor * (
        (eff_3_B - 0.35) + (eff_2_B - 0.50) + (eff_ft_B - 0.75)
    )

    # Advanced stat impact
    stat_weighting = predicted_stat_weighting
    advanced_stat_adjustment_A = (
        AST_A * stat_weighting["AST"] +
        BLK_A * stat_weighting["BLK"] +
        STL_A * stat_weighting["STL"] +
        TOV_A * stat_weighting["TOV"] +
        DREB_A * stat_weighting["DREB"] +
        OREB_A * stat_weighting["OREB"]
    )
    advanced_stat_adjustment_B = (
        AST_B * stat_weighting["AST"] +
        BLK_B * stat_weighting["BLK"] +
        STL_B * stat_weighting["STL"] +
        TOV_B * stat_weighting["TOV"] +
        DREB_B * stat_weighting["DREB"] +
        OREB_B * stat_weighting["OREB"]
    )

    # Final Elo update
    elo_ratings[team_A] = get_elo(team_A) + K * (S_A - P_A) + shooting_adjustment_A + advanced_stat_adjustment_A
    elo_ratings[team_B] = get_elo(team_B) + K * (S_B - P_B) + shooting_adjustment_B + advanced_stat_adjustment_B
    return elo_ratings[team_A], elo_ratings[team_B]

In [8]:
# Create dfELO dataframe for Update function
dfELO = df.drop(columns=['game_date', 'F_tech', 'F_personal', 'largest_lead', 'OT_length_min_tot', 'attendance', 'tz_dif_H_E', 'prev_game_dist', 'rest_days', 'home_away', 'home_away_NS',
       'travel_dist', 'TOV_team'])

dfELO.columns
print(dfELO.head())

          game_id                       team  FGA_2  FGM_2  FGA_3  FGM_3  FTA  \
0  game_2022_2011      georgia_lady_bulldogs     50     22     11      5    6   
1  game_2022_2011                 lsu_tigers     50     24     11      4   15   
2  game_2022_2012            missouri_tigers     43     18     15      7   16   
3  game_2022_2012   south_carolina_gamecocks     55     23     21      6    9   
4  game_2022_2013  tennessee_lady_volunteers     41     20     15      4   15   

   FTM  AST  BLK  STL  TOV  DREB  OREB  team_score  opponent_team_score  \
0    3   14    7    7   18    25    11          62                   68   
1    8   15    2   15   14    25    11          68                   62   
2   13   10    1    4    8    31     6          70                   69   
3    5   15    8    3    8    27    20          69                   70   
4   10   16    8    5   15    34    12          62                   44   

   notD1_incomplete  win  
0                 0    0  
1       

In [9]:
final_elo_ratings = {}

# Loop through every two rows (each game has two rows)
for i in range(0, len(df), 2):
    team_A = dfELO.iloc[i]  # First row (Team 1)
    team_B = dfELO.iloc[i+1]  # Second row (Team 2)

    # Call the Elo update function
    new_elo_A, new_elo_B = update_elo_with_advanced_stats(
        team_A['team'], team_B['team'], team_A['team_score'], team_B['team_score'],
        team_A['FGM_3'], team_A['FGA_3'], team_A['FGM_2'], team_A['FGA_2'], team_A['FTM'], team_A['FTA'],
        team_A['AST'], team_A['BLK'], team_A['STL'], team_A['TOV'], team_A['DREB'], team_A['OREB'],
        team_B['FGM_3'], team_B['FGA_3'], team_B['FGM_2'], team_B['FGA_2'], team_B['FTM'], team_B['FTA'],
        team_B['AST'], team_B['BLK'], team_B['STL'], team_B['TOV'], team_B['DREB'], team_B['OREB']
        )
    final_elo_ratings[team_A['team']] = new_elo_A
    final_elo_ratings[team_B['team']] = new_elo_B

# Convert to DataFrame for better visualization
final_elo_df = pd.DataFrame(list(final_elo_ratings.items()), columns=['Team', 'Final_Elo'])
final_elo_df = final_elo_df.sort_values(by='Team', ascending=False)

# Display final Elo ratings
print(final_elo_df)

                                    Team    Final_Elo
339            youngstown_state_penguins  4235.378107
522  young_harris_college_mountain_lions  1575.086879
262                        yale_bulldogs  3663.885995
340                    xavier_musketeers  3747.915453
371                  xavier_la_gold_rush  1499.978866
..                                   ...          ...
64                            akron_zips  3928.109213
34                     air_force_falcons  3762.104896
448         agnes_scott_college_scotties  1366.570866
513         academy_of_art_urban_knights  1637.228798
319           abilene_christian_wildcats  3675.141568

[564 rows x 2 columns]


In [10]:
# Function to get final Elo rating of a given team
def get_team_elo(team_name):
    if team_name in final_elo_ratings:
        return f"Final Elo rating for {team_name}: {final_elo_ratings[team_name]}"
    else:
        return f"Team {team_name} not found in the final Elo ratings."

print(get_team_elo(input("Enter team name: ")))

Enter team name: nc_state_wolfpack
Final Elo rating for nc_state_wolfpack: 5679.124256017891


## **Modeling**

In [12]:
# Create expected win probability column for model
dfn = df.drop(columns=['F_tech', 'F_personal', 'game_id', 'attendance', 'tz_dif_H_E', 'prev_game_dist', 'TOV_team', 'home_away', 'game_date', 'FGA_2', 'FGM_2', 'FGA_3', 'FGM_3', 'FTA', 'FTM',  'AST',  'BLK',  'STL',  'TOV',  'DREB',  'OREB',  'team_score',  'opponent_team_score', 'largest_lead', 'OT_length_min_tot'])
dfn.loc[dfn['home_away_NS'] == -1, 'travel_dist'] = 0
dfn['notD1_incomplete'] = dfn['notD1_incomplete'].astype(int)

expected_probs = []
for i in range(0, len(dfn), 2):
    team_A = dfn.iloc[i]  # First row (Team 1)
    team_B = dfn.iloc[i+1]  # Second row (Team 2)
    P_A = expected_win_prob(team_A['team'], team_B['team'])
    P_B = 1 - P_A
    expected_probs.append(P_A)
    expected_probs.append(P_B)
dfn['expected_win_prob'] = expected_probs


# Modify dataframe for Multiple Linear Regression Model
dfn.drop(columns=['team'], axis=1, inplace=True)
dfn.head()

Unnamed: 0,notD1_incomplete,rest_days,home_away_NS,travel_dist,win,expected_win_prob
0,0,9.0,1,0.0,0,0.023525
1,0,3.0,-1,0.0,1,0.976475
2,0,8.0,1,0.0,1,0.010118
3,0,9.0,-1,0.0,0,0.989882
4,0,3.0,1,0.0,1,0.991286


In [13]:
X = dfn[['notD1_incomplete', 'rest_days', 'home_away_NS', 'travel_dist', 'expected_win_prob']]
y = dfn['win']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_poly, y_train)

In [14]:
import numpy as np
'notD1_incomplete', 'rest_days', 'home_away_NS', 'travel_dist', 'expected_win_prob'

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

## **Metrics and Model Data**

In [28]:
print(lr.n_features_in_)
print(X_train_poly.shape)
print(new_data_poly.shape)

21
(7306, 21)
(1, 21)


In [24]:
# Metrics
print(lr.score(X_test_poly, y_test))
print(lr.score(X_train_poly, y_train))
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = lr.predict(X_test_poly)

print("MSE: " + str(mean_squared_error(y_test, y_pred)))
print("R2: " + str(r2_score(y_test, y_pred)))

0.3320610336267277
0.325716974214301
MSE: 0.16695471321609934
R2: 0.3320610336267277


In [23]:
lr.coef_

array([ 0.00000000e+00, -1.39185457e-01, -7.13398321e-03,  9.91257839e-02,
       -3.86452513e-05,  6.12483662e-01, -1.39185453e-01,  1.61559655e-12,
       -1.28646876e-01, -1.04899422e-12,  3.20845362e+01,  2.30020427e-04,
       -4.94274958e-03, -2.80637243e-06,  4.13572967e-03, -6.64062508e-02,
        1.31567072e-03, -4.79213075e-03,  1.20205166e-08, -4.80909145e-05,
       -1.25926430e-02])

## **Predictions**

In [45]:
df_pred = pd.read_csv('East_Games_Pred.csv')
df_pred.head()

Unnamed: 0,team_home,team_away,home_away_NS,rest_days,travel_dist
0,rhode_island_rams,north_carolina_tar_heels,1,6,0
1,nc_state_wolfpack,rhode_island_rams,-1,7,1440
2,nc_state_wolfpack,north_carolina_tar_heels,-1,7,1440
3,liberty_flames,bucknell_bison,-1,7,255
4,drexel_dragons,delaware_blue_hens,1,11,0


In [46]:
# fix prediction dataframe
df_pred['notD1_incomplete'] = 0
df_pred['expected_win_prob'] = df_pred.apply(
    lambda row: expected_win_prob(row['team_home'], row['team_away']),
    axis=1)

# organize for "new_data_poly"
X_new = df_pred[[
    'notD1_incomplete',
    'rest_days',
    'home_away_NS',
    'travel_dist',
    'expected_win_prob'
]]

In [47]:
# Loop over rows and predict
for idx, row in X_new.iterrows():
    # Make a single-row 2D array
    new_data = np.array([[
        row['notD1_incomplete'],
        row['rest_days'],
        row['home_away_NS'],
        row['travel_dist'],
        row['expected_win_prob']
    ]])

    # Transform to polynomial features
    new_data_poly = poly.transform(new_data)

    # Predict
    predicted_win_percentage = lr.predict(new_data_poly)
    win_probability = sigmoid(predicted_win_percentage)

    print(f"Game {df_pred.loc[idx, 'team_home']}: Predicted Win Probability: {win_probability[0]:.4f}")

Game rhode_island_rams: Predicted Win Probability: 0.5970
Game nc_state_wolfpack: Predicted Win Probability: 0.2183
Game nc_state_wolfpack: Predicted Win Probability: 0.2181
Game liberty_flames: Predicted Win Probability: 0.4620
Game drexel_dragons: Predicted Win Probability: 0.6642
Game massachusetts_minutewomen: Predicted Win Probability: 0.6353
Game buffalo_bulls: Predicted Win Probability: 0.6601
Game fairfield_stags: Predicted Win Probability: 0.5980
Game uconn_huskies: Predicted Win Probability: 0.6763
Game american_university_eagles: Predicted Win Probability: 0.4315


