In [7]:
import numpy as np
import pandas as pd
from scipy import stats
from warnings import filterwarnings

filterwarnings('ignore')

In [8]:
from api import DataAPI
# Shared data-access client; load_player_data() calls api.get_player_data(player) on it.
api = DataAPI()

def load_player_data(player, target, windows=(1, 3, 5, 10, 20), nan_thresh=.9, data_thresh=100):
    '''
    Function: loads game statistics for a specified player and calculates trailing window
    statistics (mean, std, min, max) for every column except 'date', 'opponent' and 'team'.
    It also constructs a feature set X and a target y: the target of each row is the value of
    the `target` column in the NEXT game, so the features of a row only describe games that
    were already played when that target was produced.
    The function then filters out columns with excessive missing values and short datasets.

    Parameters:
        player: identifier handed to api.get_player_data.
        target: name of the column to predict.
        windows: rolling window lengths, in games.
        nan_thresh: a feature column is kept only if at least int(n_rows * nan_thresh)
            of its values are non-NaN.
        data_thresh: minimum number of complete rows required to return a dataset.

    Returns: features X, target y, and the latest feature values (the trailing statistics as of
    the most recent game, before any row filtering). Returns (None, None, None) when the
    player's data cannot be loaded, is empty, or leaves fewer than data_thresh complete rows.
    '''
    try:
        player_df = api.get_player_data(player)
    except Exception:
        # Best effort: a failed lookup simply means "no data for this player".
        # Exception (not a bare except) so that Ctrl-C still interrupts a long run.
        return None, None, None
    if player_df is None or len(player_df) == 0:
        # Nothing to summarize; iloc[-1] below would raise on an empty frame.
        return None, None, None

    # NOTE(review): drop=False turns the index into a regular column, so it is summarized
    # below too unless it is named 'date', 'opponent' or 'team' - confirm this is intended.
    player_df = player_df.reset_index(drop=False)
    target_col = f'{target}_shift'

    stat_columns = [col for col in player_df.columns if col not in ('date', 'opponent', 'team')]

    # Collect every trailing statistic first and build the frame once; inserting columns one
    # at a time fragments the DataFrame.
    trailing = {}
    for window in windows:
        for feature in stat_columns:
            rolling = player_df[feature].rolling(window)
            prefix = f'trailing_{window}_game_{feature}'
            trailing[f'{prefix}_mean'] = rolling.mean()
            trailing[f'{prefix}_std'] = rolling.std()
            trailing[f'{prefix}_min'] = rolling.min()
            trailing[f'{prefix}_max'] = rolling.max()
    feature_df = pd.DataFrame(trailing, index=player_df.index)

    feature_df = feature_df.dropna(thresh=int(len(feature_df) * nan_thresh), axis=1)
    # Taken before the target column is added, so it has exactly the columns of X.
    latest_features = feature_df.iloc[-1]

    # shift(-1) aligns each row with the following game's value. A positive shift would pair
    # the trailing statistics with a value from games that precede them.
    feature_df[target_col] = player_df[target].shift(-1)
    feature_df = feature_df.dropna()

    X = feature_df.drop(target_col, axis=1)
    y = feature_df[target_col]

    if len(X) < data_thresh:
        return None, None, None

    return X, y, latest_features


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
def train_player_model(X, y, value, model, param_grid, n_splits=10, test_size=5):
    '''
    Function: creates a binary target y_class (True where y > value) and grid-searches `model`
    over `param_grid` with time-ordered cross-validation. The features are standardized inside
    a Pipeline, so in every cross-validation split the scaler is fitted on the training rows
    only and the validation rows do not influence the preprocessing.

    Parameters:
        X, y: features and numeric target.
        value: threshold that turns y into the binary target.
        model: estimator to tune; it is cloned by the search, not fitted in place.
        param_grid: a dict (or list of dicts) keyed by the estimator's own parameter names.
        n_splits, test_size: forwarded to TimeSeriesSplit.

    Returns: best estimator using grid search (refitted on all of X after standardization)
    and the scaler fitted on all of X.
    '''
    tscv = TimeSeriesSplit(n_splits=n_splits, gap=0, max_train_size=None, test_size=test_size)
    y_class = y > value

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])

    # Callers keep passing plain estimator parameter names; route them to the 'model' step.
    grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
    pipeline_grid = [
        {f'model__{name}': candidates for name, candidates in grid.items()}
        for grid in grids
    ]

    search = GridSearchCV(pipeline, param_grid=pipeline_grid, cv=tscv)
    search.fit(X, y_class)

    # The refitted pipeline holds both pieces callers expect: a scaler fitted on all of X and
    # the estimator fitted on the scaled data.
    best_pipeline = search.best_estimator_
    return best_pipeline.named_steps['model'], best_pipeline.named_steps['scaler']

In [10]:
'''
Setup: hyperparameter grids for LogisticRegression and RandomForestClassifier.
Function: predict_player predicts the probability of a player's performance exceeding a specified threshold (line).
The function uses load_player_data to generate features and targets,
and then trains both models using train_player_model with cross-validation and grid search.
Returns: the predicted class probabilities from the logistic regression and the random forest,
or (None, None) if no usable data could be loaded for the player.
'''

# Logistic regression and the hyperparameter candidates searched for it.
logistic = LogisticRegression()
logistic_param_grid = {
    'C': [0.001, 0.01, 0.1],
    'penalty': ['l2', 'l1'],
    'solver': ['liblinear'],
}

# Random forest and the hyperparameter candidates searched for it.
# 'auto' is deliberately not listed under max_features: for RandomForestClassifier it was an
# alias of 'sqrt' (so it only repeated the same candidates), it was deprecated in
# scikit-learn 1.1 and it is rejected from 1.3 on.
random_forest = RandomForestClassifier()
random_forest_param_grid = {
    'n_estimators': [200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5],
    'criterion': ['gini', 'entropy'],
}


def predict_player(player, type, line):
    '''
    Function: trains the logistic regression and the random forest for one player and scores
    the player's latest feature values with both.

    Parameters:
        player: passed to load_player_data.
        type: name of the target statistic, passed to load_player_data.
        line: threshold passed to train_player_model.

    Returns: the first row of each model's predict_proba output, logistic regression first and
    random forest second, or (None, None) when load_player_data returns no features.
    '''
    features, targets, newest = load_player_data(player, type)
    if features is None:
        return None, None

    # Same training order as the grids are declared: logistic regression, then random forest.
    candidates = (
        (logistic, logistic_param_grid),
        (random_forest, random_forest_param_grid),
    )
    fitted = [
        train_player_model(features, targets, line, estimator, grid)
        for estimator, grid in candidates
    ]
    # Only the scaler returned by the second (random forest) training run is used.
    (logistic_model, _), (random_forest_model, scaler) = fitted

    newest_row = scaler.transform(newest.values.reshape(1, -1))
    logistic_scores = logistic_model.predict_proba(newest_row)
    random_forest_scores = random_forest_model.predict_proba(newest_row)

    return logistic_scores[0], random_forest_scores[0]


In [11]:
# converts +/- (100-based) betting odds into the probability they imply
def get_implied_prob(odds):
    '''
    Function: negative odds give |odds| / (|odds| + 100); zero or positive odds give
    100 / (odds + 100).
    Returns: the implied probability as a float.
    '''
    if odds < 0:
        stake = abs(odds)
        return stake / (stake + 100)
    return 100 / (odds + 100)
    
# expected value (EV) of a one-unit bet, given its odds and the probability of winning it
def get_EV(odds, prob):
    '''
    Function: the profit per unit staked is 100 / |odds| for negative odds and odds / 100
    otherwise; a loss costs the one unit staked.
    Returns: prob * profit - (1 - prob).
    '''
    profit_per_unit = 100 / abs(odds) if odds < 0 else odds / 100
    return prob * profit_per_unit - (1 - prob)

In [12]:
import json
'''
Script: loads player betting lines from a JSON file for a specified date and market type and predicts the probability that
each player finishes over or under the given line.
Implied probabilities are derived from the over/under odds, and the expected value (EV) of betting on either side is
computed from those odds and the logistic regression probabilities.
Results: one dictionary per player is collected in a list, which is then converted to a DataFrame and written to bet_df.csv.
'''
# Run configuration.
date = '2023-11-10'
# NOTE(review): `type` shadows the builtin for the rest of the notebook; the name is kept
# because cells outside this view may read it.
type = 'points'      # market name: selects the JSON file and fills the 'market' column
line_type = 'pts'    # statistic passed to predict_player as the prediction target

with open(f'../realtime/dates/{date}/{type}.json', 'r') as f:
    lines = json.load(f)

# One dict per player that could be scored.
bet_rows = []

for player, quote in lines.items():
    line = quote['line']
    over_odds = quote['over_odds']
    under_odds = quote['under_odds']
    player = player.lower()
    try:
        log_proba, rf_proba = predict_player(player, line_type, line)
    except Exception as exc:
        # Still best effort per player, but report who was skipped and why. Catching
        # Exception instead of using a bare except lets Ctrl-C stop the loop.
        print(f'skipping {player}: {exc!r}')
        continue
    if log_proba is None:
        # predict_player found no usable data for this player.
        continue

    # Each probability pair is unpacked as (under, over).
    log_under_proba, log_over_proba = log_proba
    rf_under_proba, rf_over_proba = rf_proba

    over_implied_prob = get_implied_prob(over_odds)
    under_implied_prob = get_implied_prob(under_odds)

    # EVs are based on the logistic regression probabilities; the random forest
    # probabilities are only recorded.
    over_EV = get_EV(over_odds, log_over_proba)
    under_EV = get_EV(under_odds, log_under_proba)

    print(player)

    bet_rows.append({
        'player': player,
        'market': type,
        'line': line,
        'over_odds': over_odds,
        'under_odds': under_odds,
        'over_implied_prob': over_implied_prob,
        'under_implied_prob': under_implied_prob,
        'over_EV': over_EV,
        'under_EV': under_EV,
        'log_over_proba': log_over_proba,
        'log_under_proba': log_under_proba,
        'rf_over_proba': rf_over_proba,
        'rf_under_proba': rf_under_proba,
    })

bet_df = pd.DataFrame(bet_rows)
# to_csv returns None when it is given a path, so its result must not be assigned back to
# bet_df; the DataFrame stays available to later cells.
bet_df.to_csv('bet_df.csv', index=False)


de'anthony melton
isaiah stewart
joel embiid
kelly oubre jr.
killian hayes
tobias harris
tyrese maxey
corey kispert
daniel gafford
danilo gallinari
deni avdija
gordon hayward
jordan poole
kyle kuzma
lamelo ball
nick richards
p.j. washington
theo maledon
tyus jones
derrick white
dorian finney-smith
jaylen brown
jayson tatum
jrue holiday
kristaps porzingis
mikal bridges
royce o'neale
sam hauser
spencer dinwiddie
anthony edwards
jaden mcdaniels
karl-anthony towns
keldon johnson
kyle anderson
mike conley
naz reid
rudy gobert
collin sexton
desmond bane
jaren jackson jr.
john collins
jordan clarkson
lauri markkanen
marcus smart
ziaire williams
alperen sengun
brandon ingram
dillon brooks
herbert jones
jalen green
jonas valanciunas
grant williams
ivica zubac
james harden
kawhi leonard
kyrie irving
luka doncic
norman powell
paul george
russell westbrook
tim hardaway jr.
anthony davis
austin reaves
bradley beal
christian wood
d'angelo russell
eric gordon
grayson allen
jusuf nurkic
keita bates-di