In [82]:
import pandas as pd
import numpy as np

### Preprocess Data

In [83]:
df = pd.read_csv('./game_results.csv')
df

Unnamed: 0.1,Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win
0,0,2014,1,South Carolina,28,Texas A&M,52,2014-08-28 18:00:00,False
1,1,2014,1,Texas A&M,52,South Carolina,28,2014-08-28 18:00:00,True
2,2,2014,1,Akron,41,Howard,0,2014-08-28 19:00:00,True
3,3,2014,1,Presbyterian,3,Northern Illinois,55,2014-08-28 19:00:00,False
4,4,2014,1,Eastern Illinois,20,Minnesota,42,2014-08-28 19:00:00,False
...,...,...,...,...,...,...,...,...,...
16527,16527,2024,14,Air Force,22,Nevada,19,2024-11-23 22:30:00,True
16528,16528,2024,15,Buffalo,43,Kent State,7,2024-11-26 19:00:00,True
16529,16529,2024,15,Akron,21,Toledo,14,2024-11-26 19:00:00,True
16530,16530,2024,15,Toledo,14,Akron,21,2024-11-26 19:00:00,False


In [84]:
df = df.drop(['Unnamed: 0'], axis=1)

In [85]:
# Add a Target column (shows if they win the next week)
def add_target(team):
    team['Target'] = team["Win"].shift(-1)
    return team

df = df.groupby("Team", group_keys=False).apply(add_target)

In [86]:
# Check a certain team
df[df['Team'] == "Kennesaw State"]

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target
6804,2018,2,Kennesaw State,20,Georgia State,24,2018-08-30 19:00:00,False,False
9765,2021,3,Kennesaw State,17,Georgia Tech,45,2021-09-11 12:00:00,False,False
11568,2022,3,Kennesaw State,10,Cincinnati,63,2022-09-10 15:30:00,False,False
14345,2023,11,Kennesaw State,21,Sam Houston,24,2023-11-04 13:00:00,False,False
15026,2024,2,Kennesaw State,16,Texas-San Antonio,28,2024-08-31 15:30:00,False,False
15226,2024,3,Kennesaw State,10,Louisiana,34,2024-09-07 19:00:00,False,False
15383,2024,4,Kennesaw State,10,San Jose State,31,2024-09-14 19:00:00,False,False
15621,2024,6,Kennesaw State,13,Tennessee-Martin,24,2024-09-28 18:00:00,False,False
15661,2024,7,Kennesaw State,24,Jacksonville State,63,2024-10-04 19:00:00,False,False
15861,2024,9,Kennesaw State,5,Middle Tennessee State,14,2024-10-15 20:00:00,False,True


In [87]:
# Replace NaN values with 2 instead of Nan
df['Target'][pd.isnull(df['Target'])] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Target'][pd.isnull(df['Target'])] = 2


In [88]:
# Convert Target values to numbers (0 or 1) 
df['Target'] = df['Target'].astype(int, errors='ignore')
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target
0,2014,1,South Carolina,28,Texas A&M,52,2014-08-28 18:00:00,False,1
1,2014,1,Texas A&M,52,South Carolina,28,2014-08-28 18:00:00,True,1
2,2014,1,Akron,41,Howard,0,2014-08-28 19:00:00,True,0
3,2014,1,Presbyterian,3,Northern Illinois,55,2014-08-28 19:00:00,False,0
4,2014,1,Eastern Illinois,20,Minnesota,42,2014-08-28 19:00:00,False,0
...,...,...,...,...,...,...,...,...,...
16527,2024,14,Air Force,22,Nevada,19,2024-11-23 22:30:00,True,2
16528,2024,15,Buffalo,43,Kent State,7,2024-11-26 19:00:00,True,2
16529,2024,15,Akron,21,Toledo,14,2024-11-26 19:00:00,True,2
16530,2024,15,Toledo,14,Akron,21,2024-11-26 19:00:00,False,2


In [89]:
# Check the value counts of target
df['Target'].value_counts()

1    8173
0    8112
2     247
Name: Target, dtype: int64

In [90]:
# Check for nulls
nulls = pd.isnull(df)
nulls = nulls.sum()
nulls

Season            0
Wk                0
Team              0
Points Scored     0
Opponent          0
Points Allowed    0
DateTime          0
Win               0
Target            0
dtype: int64

### Predicting Winner
#### Ridge Regression Model

In [91]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score


rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=5, direction='forward', cv=split)

In [92]:
# Columns we do not scale 
removed_columns = ['Season', 'Wk', 'Day', 'Team', 'Opponent', 'DateTime', 'Win', 'Target']
# Columns we do scale (points scored and points allowed)
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [93]:
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target
0,2014,1,South Carolina,0.321839,Texas A&M,0.597701,2014-08-28 18:00:00,False,1
1,2014,1,Texas A&M,0.597701,South Carolina,0.321839,2014-08-28 18:00:00,True,1
2,2014,1,Akron,0.471264,Howard,0.000000,2014-08-28 19:00:00,True,0
3,2014,1,Presbyterian,0.034483,Northern Illinois,0.632184,2014-08-28 19:00:00,False,0
4,2014,1,Eastern Illinois,0.229885,Minnesota,0.482759,2014-08-28 19:00:00,False,0
...,...,...,...,...,...,...,...,...,...
16527,2024,14,Air Force,0.252874,Nevada,0.218391,2024-11-23 22:30:00,True,2
16528,2024,15,Buffalo,0.494253,Kent State,0.080460,2024-11-26 19:00:00,True,2
16529,2024,15,Akron,0.241379,Toledo,0.160920,2024-11-26 19:00:00,True,2
16530,2024,15,Toledo,0.160920,Akron,0.241379,2024-11-26 19:00:00,False,2


In [94]:
# Select predictors 
predictors = list(df[['Points Scored', 'Points Allowed']])
predictors

['Points Scored', 'Points Allowed']

In [95]:
# Create a function to make predictions
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    # All seasons in our dataset
    seasons = sorted(data["Season"].unique()) 

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # Train on all data before our current season
        train = data[data["Season"] < season]
        # Test on our current season data
        test = data[data["Season"] == season]

        # Fit model 
        model.fit(train[predictors], train['Target'])

        # Generate predictions
        preds = model.predict(test[predictors])
        # Convert to pandas series instead of numpy array
        preds = pd.Series(preds, index=test.index)

        # Combine the Target and Prediction values
        combined = pd.concat([test['Target'], preds], axis=1)
        combined.columns = ['Actual', 'Prediction']
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [96]:
predictions = backtest(df, rr, predictors)
predictions

Unnamed: 0,Actual,Prediction
3362,0,1
3363,0,0
3364,1,1
3365,1,0
3366,1,0
...,...,...
16527,2,1
16528,2,1
16529,2,1
16530,2,0


In [97]:
# Accuracy of our model
predictions = predictions[predictions["Actual"] != 2]
accuracy_score(predictions['Actual'], predictions['Prediction'])

0.5795419374806562

### Feature Engineering to Improve Model

In [98]:
# Use rolling averages to improve model
df_rolling = df[['Season', 'Team', 'Points Scored', 'Points Allowed', 'Win']]
df_rolling

Unnamed: 0,Season,Team,Points Scored,Points Allowed,Win
0,2014,South Carolina,0.321839,0.597701,False
1,2014,Texas A&M,0.597701,0.321839,True
2,2014,Akron,0.471264,0.000000,True
3,2014,Presbyterian,0.034483,0.632184,False
4,2014,Eastern Illinois,0.229885,0.482759,False
...,...,...,...,...,...
16527,2024,Air Force,0.252874,0.218391,True
16528,2024,Buffalo,0.494253,0.080460,True
16529,2024,Akron,0.241379,0.160920,True
16530,2024,Toledo,0.160920,0.241379,False


In [99]:
def find_team_averages(team):
    # Group rows by previous 2 rows + current row averages
    rolling = team[['Points Scored', 'Points Allowed']].rolling(3).mean()
    return rolling

df_rolling = df_rolling.groupby(['Season', 'Team'], group_keys=False).apply(find_team_averages)
df_rolling

Unnamed: 0,Points Scored,Points Allowed
0,,
1,,
2,,
3,,
4,,
...,...,...
16527,0.329502,0.180077
16528,0.501916,0.287356
16529,0.287356,0.229885
16530,0.222222,0.210728


In [100]:
# Rename the rolling cols so we can merge with original df
rolling_cols = [f'{col}_3' for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Concatenate
df = pd.concat([df, df_rolling], axis=1)
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3,Points Allowed_3
0,2014,1,South Carolina,0.321839,Texas A&M,0.597701,2014-08-28 18:00:00,False,1,,
1,2014,1,Texas A&M,0.597701,South Carolina,0.321839,2014-08-28 18:00:00,True,1,,
2,2014,1,Akron,0.471264,Howard,0.000000,2014-08-28 19:00:00,True,0,,
3,2014,1,Presbyterian,0.034483,Northern Illinois,0.632184,2014-08-28 19:00:00,False,0,,
4,2014,1,Eastern Illinois,0.229885,Minnesota,0.482759,2014-08-28 19:00:00,False,0,,
...,...,...,...,...,...,...,...,...,...,...,...
16527,2024,14,Air Force,0.252874,Nevada,0.218391,2024-11-23 22:30:00,True,2,0.329502,0.180077
16528,2024,15,Buffalo,0.494253,Kent State,0.080460,2024-11-26 19:00:00,True,2,0.501916,0.287356
16529,2024,15,Akron,0.241379,Toledo,0.160920,2024-11-26 19:00:00,True,2,0.287356,0.229885
16530,2024,15,Toledo,0.160920,Akron,0.241379,2024-11-26 19:00:00,False,2,0.222222,0.210728


In [101]:
# Drop rows with missing rows
df = df.dropna()
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3,Points Allowed_3
300,2014,3,Louisiana Tech,0.482759,North Texas,0.241379,2014-09-11 20:00:00,True,0,0.406130,0.340996
302,2014,3,Houston,0.287356,Brigham Young,0.379310,2014-09-11 21:00:00,False,1,0.302682,0.229885
303,2014,3,Brigham Young,0.379310,Houston,0.287356,2014-09-11 21:00:00,True,1,0.417625,0.160920
306,2014,3,Buffalo,0.241379,Baylor,0.724138,2014-09-12 20:00:00,False,1,0.375479,0.528736
308,2014,3,Boise State,0.436782,Connecticut,0.241379,2014-09-13 12:00:00,True,1,0.337165,0.306513
...,...,...,...,...,...,...,...,...,...,...,...
16527,2024,14,Air Force,0.252874,Nevada,0.218391,2024-11-23 22:30:00,True,2,0.329502,0.180077
16528,2024,15,Buffalo,0.494253,Kent State,0.080460,2024-11-26 19:00:00,True,2,0.501916,0.287356
16529,2024,15,Akron,0.241379,Toledo,0.160920,2024-11-26 19:00:00,True,2,0.287356,0.229885
16530,2024,15,Toledo,0.160920,Akron,0.241379,2024-11-26 19:00:00,False,2,0.222222,0.210728


In [102]:
# Add who next opponent is to improve algorithm
# Shift to the next value given the team and column name
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

# Add a column to the dataframe applying the shift column function
def add_col(df, col_name):
    return df.groupby('Team', group_keys=False).apply(lambda x: shift_col(x, col_name))

# Add a next opponent and next date column
df['Next Opponent'] = add_col(df, 'Opponent')
df['Next Date'] = add_col(df, 'DateTime')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Opponent'] = add_col(df, 'Opponent')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Date'] = add_col(df, 'DateTime')


In [103]:
# Make a copy of the dataframe to avoid issues
df = df.copy()
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3,Points Allowed_3,Next Opponent,Next Date
300,2014,3,Louisiana Tech,0.482759,North Texas,0.241379,2014-09-11 20:00:00,True,0,0.406130,0.340996,Northwestern State,2014-09-20 19:00:00
302,2014,3,Houston,0.287356,Brigham Young,0.379310,2014-09-11 21:00:00,False,1,0.302682,0.229885,Nevada-Las Vegas,2014-09-20 20:00:00
303,2014,3,Brigham Young,0.379310,Houston,0.287356,2014-09-11 21:00:00,True,1,0.417625,0.160920,Virginia,2014-09-20 15:30:00
306,2014,3,Buffalo,0.241379,Baylor,0.724138,2014-09-12 20:00:00,False,1,0.375479,0.528736,Norfolk State,2014-09-20 15:30:00
308,2014,3,Boise State,0.436782,Connecticut,0.241379,2014-09-13 12:00:00,True,1,0.337165,0.306513,Louisiana,2014-09-20 22:40:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16527,2024,14,Air Force,0.252874,Nevada,0.218391,2024-11-23 22:30:00,True,2,0.329502,0.180077,,
16528,2024,15,Buffalo,0.494253,Kent State,0.080460,2024-11-26 19:00:00,True,2,0.501916,0.287356,,
16529,2024,15,Akron,0.241379,Toledo,0.160920,2024-11-26 19:00:00,True,2,0.287356,0.229885,,
16530,2024,15,Toledo,0.160920,Akron,0.241379,2024-11-26 19:00:00,False,2,0.222222,0.210728,,


In [104]:
# Get rolling data for opponent 
full = df.merge(df[rolling_cols + ['Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )
full

Unnamed: 0,Season,Wk,Team_x,Points Scored,Opponent,Points Allowed,DateTime,Win,Target,Points Scored_3_x,Points Allowed_3_x,Next Opponent_x,Next Date,Points Scored_3_y,Points Allowed_3_y,Next Opponent_y,Team_y
0,2014,3,Houston,0.287356,Brigham Young,0.379310,2014-09-11 21:00:00,False,1,0.302682,0.229885,Nevada-Las Vegas,2014-09-20 20:00:00,0.229885,0.452107,Houston,Nevada-Las Vegas
1,2014,3,Boise State,0.436782,Connecticut,0.241379,2014-09-13 12:00:00,True,1,0.337165,0.306513,Louisiana,2014-09-20 22:40:00,0.306513,0.421456,Boise State,Louisiana
2,2014,3,Massachusetts,0.356322,Vanderbilt,0.390805,2014-09-13 12:00:00,False,0,0.291188,0.402299,Penn State,2014-09-20 16:00:00,0.229885,0.141762,Massachusetts,Penn State
3,2014,3,Kent State,0.000000,Ohio State,0.758621,2014-09-13 12:00:00,False,0,0.103448,0.406130,Virginia,2014-09-27 15:30:00,0.291188,0.344828,Kent State,Virginia
4,2014,3,Virginia Tech,0.241379,East Carolina,0.321839,2014-09-13 12:00:00,False,0,0.344828,0.222222,Georgia Tech,2014-09-20 12:00:00,0.452107,0.298851,Virginia Tech,Georgia Tech
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12225,2024,13,Nevada-Las Vegas,0.471264,San Diego State,0.229885,2024-11-16 22:30:00,True,1,0.360153,0.291188,San Jose State,2024-11-22 22:00:00,0.210728,0.337165,Nevada-Las Vegas,San Jose State
12226,2024,14,Akron,0.436782,Kent State,0.195402,2024-11-19 19:00:00,True,1,0.321839,0.333333,Toledo,2024-11-26 19:00:00,0.279693,0.237548,Akron,Toledo
12227,2024,14,Kent State,0.195402,Akron,0.436782,2024-11-19 19:00:00,False,0,0.091954,0.432950,Buffalo,2024-11-26 19:00:00,0.494253,0.375479,Kent State,Buffalo
12228,2024,14,Toledo,0.080460,Ohio,0.275862,2024-11-20 19:00:00,False,0,0.279693,0.237548,Akron,2024-11-26 19:00:00,0.321839,0.333333,Toledo,Akron


In [105]:
# Visualize the merge
full[['Team_x', 'Next Opponent_x', 'Team_y', 'Next Opponent_y', 'Next Date']]

Unnamed: 0,Team_x,Next Opponent_x,Team_y,Next Opponent_y,Next Date
0,Houston,Nevada-Las Vegas,Nevada-Las Vegas,Houston,2014-09-20 20:00:00
1,Boise State,Louisiana,Louisiana,Boise State,2014-09-20 22:40:00
2,Massachusetts,Penn State,Penn State,Massachusetts,2014-09-20 16:00:00
3,Kent State,Virginia,Virginia,Kent State,2014-09-27 15:30:00
4,Virginia Tech,Georgia Tech,Georgia Tech,Virginia Tech,2014-09-20 12:00:00
...,...,...,...,...,...
12225,Nevada-Las Vegas,San Jose State,San Jose State,Nevada-Las Vegas,2024-11-22 22:00:00
12226,Akron,Toledo,Toledo,Akron,2024-11-26 19:00:00
12227,Kent State,Buffalo,Buffalo,Kent State,2024-11-26 19:00:00
12228,Toledo,Akron,Akron,Toledo,2024-11-26 19:00:00


In [106]:
# Use sequential feature selector to find features
# Get columns that have the object datatype (our model cannot use them)
removed_columns = list(full.columns[full.dtypes == 'object']) + removed_columns
removed_columns

['Team_x',
 'Opponent',
 'DateTime',
 'Next Opponent_x',
 'Next Date',
 'Next Opponent_y',
 'Team_y',
 'Season',
 'Wk',
 'Day',
 'Team',
 'Opponent',
 'DateTime',
 'Win',
 'Target']

In [107]:
# Get columns that are not in our removed_columns list
selected_columns = full.columns[~full.columns.isin(removed_columns)]
selected_columns

Index(['Points Scored', 'Points Allowed', 'Points Scored_3_x',
       'Points Allowed_3_x', 'Points Scored_3_y', 'Points Allowed_3_y'],
      dtype='object')

In [108]:
# Predictions
predictions = backtest(full, rr, selected_columns)
accuracy_score(predictions['Actual'], predictions['Prediction'])
predictions

Unnamed: 0,Actual,Prediction
2530,0,0
2531,1,1
2532,1,1
2533,0,0
2534,1,1
...,...,...
12225,1,1
12226,1,0
12227,0,0
12228,0,1


In [109]:
# Accuracy of our model
predictions = predictions[predictions["Actual"] != 2]
accuracy_score(predictions['Actual'], predictions['Prediction'])

0.6408247422680412

In [110]:
# Predict future winners function
def predict_winner(model, data, team1, team2, date, predictors):
    # Get latest statistics for each team
    team1_stats = data[data['Team'] == team1].tail(1)
    team2_stats = data[data['Team'] == team2].tail(1)

    # Combine datasets
    combined = pd.concat([team1_stats, team2_stats], ignore_index=True)
    combined = combined.copy()

    # Add next opponent and next date to the dataset
    combined['Next Opponent'][0] = team2
    combined['Next Opponent'][1] = team1
    combined['Next Date'] = date

    predict_data = combined.merge(combined[['Points Scored_3', 'Points Allowed_3', 'Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )

    # Make the prediction
    prediction = model.predict(predict_data[predictors])
    
    # Get predicted winner and loser
    winner = team1 if prediction[0] == 1 else team2
    loser = team2 if prediction[0] == 1 else team1
    return {'Winner': winner, 'Loser': loser}

# Change team names and then display predicted winner and loser
winner_loser = predict_winner(rr, df, 'Virginia Tech', 'Virginia', '2024-12-30 20:00:00', selected_columns)
winner_loser

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][0] = team2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][1] = team1


{'Winner': 'Virginia Tech', 'Loser': 'Virginia'}

### Predicting Scores

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [112]:
# Import dataset
df = pd.read_csv('./game_results.csv')
df = df.drop(['Unnamed: 0'], axis=1)
# Convert Win Boolean to a numeric (0 or 1) 
df['Win'] = df['Win'].astype(int, errors='ignore')
df.tail()

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win
16527,2024,14,Air Force,22,Nevada,19,2024-11-23 22:30:00,1
16528,2024,15,Buffalo,43,Kent State,7,2024-11-26 19:00:00,1
16529,2024,15,Akron,21,Toledo,14,2024-11-26 19:00:00,1
16530,2024,15,Toledo,14,Akron,21,2024-11-26 19:00:00,0
16531,2024,15,Kent State,7,Buffalo,43,2024-11-26 19:00:00,0


In [113]:
# Add a Next Score column (shows if the score of the next week)
def add_next_score(team):
    team['Next Score'] = team['Points Scored'].shift(-1)
    return team

df = df.groupby("Team", group_keys=False).apply(add_next_score)

# Check a certain team
df[df['Team'] == "Virginia Tech"]

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score
104,2014,1,Virginia Tech,34,William & Mary,9,2014-08-30 16:00:00,1,35.0
281,2014,2,Virginia Tech,35,Ohio State,21,2014-09-06 20:00:00,1,21.0
315,2014,3,Virginia Tech,21,East Carolina,28,2014-09-13 12:00:00,0,24.0
425,2014,4,Virginia Tech,24,Georgia Tech,27,2014-09-20 12:00:00,0,35.0
554,2014,5,Virginia Tech,35,Western Michigan,17,2014-09-27 12:30:00,1,34.0
...,...,...,...,...,...,...,...,...,...
15871,2024,9,Virginia Tech,42,Boston College,21,2024-10-17 19:30:00,1,21.0
16009,2024,10,Virginia Tech,21,Georgia Tech,6,2024-10-26 12:00:00,1,31.0
16109,2024,11,Virginia Tech,31,Syracuse,38,2024-11-02 12:00:00,0,14.0
16242,2024,12,Virginia Tech,14,Clemson,24,2024-11-09 15:30:00,0,28.0


In [114]:
# Replace NaN values with 2 instead of Nan
df['Next Score'][pd.isnull(df['Next Score'])] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Score'][pd.isnull(df['Next Score'])] = -1


In [115]:
# Use rolling averages to improve model
df_rolling = df[['Season', 'Team', 'Points Scored', 'Points Allowed', 'Win']]

def find_team_averages(team):
    # Group rows by previous 2 rows + current row averages
    rolling = team[['Points Scored', 'Points Allowed']].rolling(3).mean()
    return rolling

df_rolling = df_rolling.groupby(['Season', 'Team'], group_keys=False).apply(find_team_averages)

# Rename the rolling cols so we can merge with original df
rolling_cols = [f'{col}_3' for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Concatenate
df = pd.concat([df, df_rolling], axis=1)
# Drop rows with missing rows
df = df.dropna()
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Points Scored_3,Points Allowed_3
300,2014,3,Louisiana Tech,42,North Texas,21,2014-09-11 20:00:00,1,27.0,35.333333,29.666667
302,2014,3,Houston,25,Brigham Young,33,2014-09-11 21:00:00,0,47.0,26.333333,20.000000
303,2014,3,Brigham Young,33,Houston,25,2014-09-11 21:00:00,1,41.0,36.333333,14.000000
306,2014,3,Buffalo,21,Baylor,63,2014-09-12 20:00:00,0,36.0,32.666667,46.000000
308,2014,3,Boise State,38,Connecticut,21,2014-09-13 12:00:00,1,34.0,29.333333,26.666667
...,...,...,...,...,...,...,...,...,...,...,...
16527,2024,14,Air Force,22,Nevada,19,2024-11-23 22:30:00,1,-1.0,28.666667,15.666667
16528,2024,15,Buffalo,43,Kent State,7,2024-11-26 19:00:00,1,-1.0,43.666667,25.000000
16529,2024,15,Akron,21,Toledo,14,2024-11-26 19:00:00,1,-1.0,25.000000,20.000000
16530,2024,15,Toledo,14,Akron,21,2024-11-26 19:00:00,0,-1.0,19.333333,18.333333


In [116]:
# Shift to the next value given the team and column name
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

# Add a column to the dataframe applying the shift column function
def add_col(df, col_name):
    return df.groupby('Team', group_keys=False).apply(lambda x: shift_col(x, col_name))

# Add a next opponent and next date column
df['Next Opponent'] = add_col(df, 'Opponent')
df['Next Date'] = add_col(df, 'DateTime')
df

Unnamed: 0,Season,Wk,Team,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Points Scored_3,Points Allowed_3,Next Opponent,Next Date
300,2014,3,Louisiana Tech,42,North Texas,21,2014-09-11 20:00:00,1,27.0,35.333333,29.666667,Northwestern State,2014-09-20 19:00:00
302,2014,3,Houston,25,Brigham Young,33,2014-09-11 21:00:00,0,47.0,26.333333,20.000000,Nevada-Las Vegas,2014-09-20 20:00:00
303,2014,3,Brigham Young,33,Houston,25,2014-09-11 21:00:00,1,41.0,36.333333,14.000000,Virginia,2014-09-20 15:30:00
306,2014,3,Buffalo,21,Baylor,63,2014-09-12 20:00:00,0,36.0,32.666667,46.000000,Norfolk State,2014-09-20 15:30:00
308,2014,3,Boise State,38,Connecticut,21,2014-09-13 12:00:00,1,34.0,29.333333,26.666667,Louisiana,2014-09-20 22:40:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16527,2024,14,Air Force,22,Nevada,19,2024-11-23 22:30:00,1,-1.0,28.666667,15.666667,,
16528,2024,15,Buffalo,43,Kent State,7,2024-11-26 19:00:00,1,-1.0,43.666667,25.000000,,
16529,2024,15,Akron,21,Toledo,14,2024-11-26 19:00:00,1,-1.0,25.000000,20.000000,,
16530,2024,15,Toledo,14,Akron,21,2024-11-26 19:00:00,0,-1.0,19.333333,18.333333,,


In [117]:
# Get rolling data for opponent 
full = df.merge(df[rolling_cols + ['Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )
full

Unnamed: 0,Season,Wk,Team_x,Points Scored,Opponent,Points Allowed,DateTime,Win,Next Score,Points Scored_3_x,Points Allowed_3_x,Next Opponent_x,Next Date,Points Scored_3_y,Points Allowed_3_y,Next Opponent_y,Team_y
0,2014,3,Houston,25,Brigham Young,33,2014-09-11 21:00:00,0,47.0,26.333333,20.000000,Nevada-Las Vegas,2014-09-20 20:00:00,20.000000,39.333333,Houston,Nevada-Las Vegas
1,2014,3,Boise State,38,Connecticut,21,2014-09-13 12:00:00,1,34.0,29.333333,26.666667,Louisiana,2014-09-20 22:40:00,26.666667,36.666667,Boise State,Louisiana
2,2014,3,Massachusetts,31,Vanderbilt,34,2014-09-13 12:00:00,0,7.0,25.333333,35.000000,Penn State,2014-09-20 16:00:00,20.000000,12.333333,Massachusetts,Penn State
3,2014,3,Kent State,0,Ohio State,66,2014-09-13 12:00:00,0,13.0,9.000000,35.333333,Virginia,2014-09-27 15:30:00,25.333333,30.000000,Kent State,Virginia
4,2014,3,Virginia Tech,21,East Carolina,28,2014-09-13 12:00:00,0,24.0,30.000000,19.333333,Georgia Tech,2014-09-20 12:00:00,39.333333,26.000000,Virginia Tech,Georgia Tech
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12225,2024,13,Nevada-Las Vegas,41,San Diego State,20,2024-11-16 22:30:00,1,27.0,31.333333,25.333333,San Jose State,2024-11-22 22:00:00,18.333333,29.333333,Nevada-Las Vegas,San Jose State
12226,2024,14,Akron,38,Kent State,17,2024-11-19 19:00:00,1,21.0,28.000000,29.000000,Toledo,2024-11-26 19:00:00,24.333333,20.666667,Akron,Toledo
12227,2024,14,Kent State,17,Akron,38,2024-11-19 19:00:00,0,7.0,8.000000,37.666667,Buffalo,2024-11-26 19:00:00,43.000000,32.666667,Kent State,Buffalo
12228,2024,14,Toledo,7,Ohio,24,2024-11-20 19:00:00,0,14.0,24.333333,20.666667,Akron,2024-11-26 19:00:00,28.000000,29.000000,Toledo,Akron


In [118]:
# Get columns with object datatype (model can't use them)
removed_columns = list(full.columns[full.dtypes == 'object'])
removed_columns

['Team_x',
 'Opponent',
 'DateTime',
 'Next Opponent_x',
 'Next Date',
 'Next Opponent_y',
 'Team_y']

In [119]:
# Features and labels 
X = full[['Points Scored', 'Points Allowed','Win','Points Scored_3_x', 'Points Allowed_3_x', 'Points Scored_3_y', 'Points Allowed_3_y']]
y = full['Next Score']
X

Unnamed: 0,Points Scored,Points Allowed,Win,Points Scored_3_x,Points Allowed_3_x,Points Scored_3_y,Points Allowed_3_y
0,25,33,0,26.333333,20.000000,20.000000,39.333333
1,38,21,1,29.333333,26.666667,26.666667,36.666667
2,31,34,0,25.333333,35.000000,20.000000,12.333333
3,0,66,0,9.000000,35.333333,25.333333,30.000000
4,21,28,0,30.000000,19.333333,39.333333,26.000000
...,...,...,...,...,...,...,...
12225,41,20,1,31.333333,25.333333,18.333333,29.333333
12226,38,17,1,28.000000,29.000000,24.333333,20.666667
12227,17,38,0,8.000000,37.666667,43.000000,32.666667
12228,7,24,0,24.333333,20.666667,28.000000,29.000000


In [120]:
# Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set hyperparameters
params = {
    'objective': 'reg:squarederror', # Regression task
    'eval_metric': 'rmse', # Use root mean squared error to evaluate
    'eta': 0.1, # Learning rate
    'max_depth': 3,  # Maximum depth of trees
    'subsample': 0.8,  # Fraction of samples used for training each tree
    'colsample_bytree': 0.8,  # Fraction of features used for training each tree
    'seed': 42
}

In [122]:
# Train model
xgb_model = xgb.train(params, dtrain, 100)

In [123]:
# Make predictions on test set
predictions = xgb_model.predict(dtest)
predictions

array([32.590675, 30.08016 , 27.332724, ..., 32.779552, 23.984514,
       21.321405], dtype=float32)

In [124]:
# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 13.045175634172432


In [130]:
score_predictors = ['Points Scored', 'Points Allowed', 'Win', 'Points Scored_3_x', 'Points Allowed_3_x', 'Points Scored_3_y', 'Points Allowed_3_y']
# Predict future scores function
def predict_score(model, data, team1, team2, date, predictors):
    # Get latest statistics for each team
    team1_stats = data[data['Team'] == team1].tail(1)
    team2_stats = data[data['Team'] == team2].tail(1)

    # Combine datasets
    combined = pd.concat([team1_stats, team2_stats], ignore_index=True)
    combined = combined.copy()

    # Add next opponent and next date to the dataset
    combined['Next Opponent'][0] = team2
    combined['Next Opponent'][1] = team1
    combined['Next Date'] = date

    predict_data = combined.merge(combined[['Points Scored_3', 'Points Allowed_3', 'Next Opponent', 'Next Date', 'Team']], 
                left_on=['Team', 'Next Date'], 
                right_on=['Next Opponent', 'Next Date']
                )

    # Create DMatrix for XGBoost
    dmatrix = xgb.DMatrix(predict_data[predictors])
    # Make the prediction
    prediction = model.predict(dmatrix)
    
    # Get predicted score
    team1_score = prediction[0]
    team2_score = prediction[1]

    return {'Team1': {'Name': team1, 'Predicted_Score': round(team1_score)},
        'Team2': {'Name': team2, 'Predicted_Score': round(team2_score)}}

# Change team names and then display predicted winner and loser
score_prediction = predict_score(xgb_model, df, 'Georgia', 'Kennesaw State', '2024-12-30 20:00:00', score_predictors)
score_prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][0] = team2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined['Next Opponent'][1] = team1


{'Team1': {'Name': 'Georgia', 'Predicted_Score': 32},
 'Team2': {'Name': 'Kennesaw State', 'Predicted_Score': 24}}