In [113]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = None


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, roc_auc_score


In [114]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select only specified columns."""

    def __init__(self, columns=[]):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not self.columns:
            return X
        else:
            return X[self.columns]

In [115]:
df = pd.read_csv("../data/game_data_2.csv")

# drop covid games
df = df[df["is_covid_bubble"] == 0].copy()
print(df.shape[0])

# train/test split
# use 2021 and 2022 seasons as hold out
train_raw = df[
    ~df["season_start_year"].isin(
        [
            2019,
            2020,
            2021,
            2022,
        ]
    )
]

test_raw = df[
    df['season_start_year'].isin([2019,])
]

print(train_raw.shape)
print(test_raw.shape)

23782
(14760, 51)
(1942, 51)


In [153]:
train_raw['season_start_year'].unique()

array([2013, 2014, 2015, 2016, 2017, 2018])

In [116]:
# test_raw.season_start_year.unique()

In [117]:
df.columns

Index(['Team_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'W', 'L', 'PTS',
       'team', 'season_start_year', 'opponent_abbreviation',
       'team_abbreviation', 'is_home_game', 'is_covid_bubble', 'city_team_key',
       'abbreviation', 'Latitude', 'Longitude', 'Latitude_opponent',
       'Longitude_opponent', 'PTS_opponent', 'point_difference', 'game_lat',
       'game_lon', 'prev_game_lon', 'prev_game_lat',
       'distance_from_previous_game', 'is_last_game_home', 'start_of_streak',
       'streak_id', 'streak_counter', 'incoming_away_game_streak',
       'days_since_1_games_ago', 'days_since_2_games_ago',
       'days_since_3_games_ago', 'days_since_4_games_ago',
       'days_since_5_games_ago', 'days_since_6_games_ago',
       'days_since_7_games_ago', 'num_games_last_4_days',
       'num_games_last_7_days', 'cumulative_distance',
       'incoming_away_game_streak_opponent', 'days_since_1_games_ago_opponent',
       'num_games_last_4_days_opponent', 'num_games_last_7_days_oppon

In [118]:
keeps = [
    # "Team_ID",
    # "Game_ID",
    # "GAME_DATE",
    # "MATCHUP",
    # "WL",
    # "W",
    # "L",
    # "PTS",
    # "team",
    # "season_start_year",
    # "opponent_abbreviation",
    # "team_abbreviation",
    # "is_covid_bubble",
    # "city_team_key",
    # "abbreviation",
    # "Latitude",
    # "Longitude",
    # "Latitude_opponent",
    # "Longitude_opponent",
    # "PTS_opponent",
    # "point_difference",
    # "game_lat",
    # "game_lon",
    # "prev_game_lon",
    # "prev_game_lat",
    # "distance_from_previous_game",
    # "start_of_streak",
    # "streak_id",
    # "streak_counter",
    # "days_since_1_games_ago",
    # "days_since_2_games_ago",
    # "days_since_3_games_ago",
    # "days_since_4_games_ago",
    # "days_since_5_games_ago",
    # "days_since_6_games_ago",
    # "days_since_7_games_ago",
    # -------------------------
    # "is_last_game_home",
    "incoming_away_game_streak",
    "is_home_game",
    "num_games_last_4_days",
    "num_games_last_7_days",
    "cumulative_distance",
    "incoming_away_game_streak_opponent",
    "days_since_1_games_ago_opponent",
    "num_games_last_4_days_opponent",
    "num_games_last_7_days_opponent",
    "cumulative_distance_opponent",
    # "days_of_rest_difference",
    # "games_last_7_diff",
    # "distance_difference",
    # -------------------------
    # todo ---------------
    # time zone and hours gained/lost
    # time diff with hour
    # num away games in last 7 and in last 5
    #  ---------------
    # "target",
]

id_cols = [
    "Team_ID",
    "Game_ID",
    "GAME_DATE",
    "MATCHUP",
    "WL",
    "W",
    "L",
    "PTS",
    "team",
    "season_start_year",
    "opponent_abbreviation",
    "team_abbreviation",
    "target",
]

In [119]:
test_qlik = df[
    df["season_start_year"].isin(
        [
            2019,
            2020,
            2021,
            2022,
        ]
    )
][id_cols + keeps].copy()
print(test_qlik.shape)
test_qlik.to_csv("../data/qlik_test.csv", index=False)

(9022, 23)


In [120]:
temp_df = df[
    ~df["season_start_year"].isin(
        [
            2019,
            2020,
            2021,
            2022,
        ]
    )
][id_cols + keeps].copy()
temp_df.to_csv("../data/qlik-test-1.csv", index=False)
print(temp_df.shape)
temp_df

(14760, 23)


Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,PTS,team,season_start_year,opponent_abbreviation,team_abbreviation,target,incoming_away_game_streak,is_home_game,num_games_last_4_days,num_games_last_7_days,cumulative_distance,incoming_away_game_streak_opponent,days_since_1_games_ago_opponent,num_games_last_4_days_opponent,num_games_last_7_days_opponent,cumulative_distance_opponent
0,1610612737,21300012,2013-10-30,ATL @ DAL,L,0,1,109,Atlanta Hawks,2013,DAL,ATL,1,0,0,0,0,718.172413,0,150.0,0,0,1022.608415
1,1610612737,21300023,2013-11-01,ATL vs. TOR,W,1,1,102,Atlanta Hawks,2013,TOR,ATL,0,0,1,1,1,1436.344826,0,2.0,1,1,732.742992
2,1610612737,21300046,2013-11-03,ATL @ LAL,L,1,2,103,Atlanta Hawks,2013,LAL,ATL,1,0,0,2,2,1929.261873,0,2.0,2,3,0.000000
3,1610612737,21300059,2013-11-05,ATL @ SAC,W,2,2,105,Atlanta Hawks,2013,SAC,ATL,0,1,0,2,3,2289.876831,0,3.0,2,3,137.493424
4,1610612737,21300071,2013-11-07,ATL @ DEN,L,2,3,107,Atlanta Hawks,2013,DEN,ATL,1,2,0,2,3,3173.824432,0,2.0,1,2,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23645,1610612764,21801148,2019-03-31,WAS @ DEN,W,32,46,95,Washington Wizards,2018,DEN,WAS,0,3,0,2,3,3523.062553,0,2.0,2,4,1795.966979
23646,1610612764,21801167,2019-04-03,WAS vs. CHI,L,32,47,114,Washington Wizards,2018,CHI,WAS,1,0,1,1,3,5010.716180,1,2.0,2,3,916.924370
23647,1610612764,21801182,2019-04-05,WAS vs. SAS,L,32,48,112,Washington Wizards,2018,SAS,WAS,1,0,1,1,3,0.000000,1,2.0,2,3,2290.436031
23648,1610612764,21801204,2019-04-07,WAS @ NYK,L,32,49,110,Washington Wizards,2018,NYK,WAS,1,0,0,2,3,204.971889,0,2.0,2,3,3201.048353


In [121]:

numeric_cols = temp_df[keeps].select_dtypes(include=['number']).columns.tolist()
categorical_cols = temp_df[keeps].select_dtypes(include=['object', 'category']).columns.tolist()
print(f"numeric: {numeric_cols}")
print(f"categorical: {categorical_cols}")

numeric: ['incoming_away_game_streak', 'is_home_game', 'num_games_last_4_days', 'num_games_last_7_days', 'cumulative_distance', 'incoming_away_game_streak_opponent', 'days_since_1_games_ago_opponent', 'num_games_last_4_days_opponent', 'num_games_last_7_days_opponent', 'cumulative_distance_opponent']
categorical: []


In [122]:
basic_pipe = Pipeline(
    [
        ("column_selector", ColumnSelector(columns=keeps)),
        ("scalar", StandardScaler()),
        # ("model", GradientBoostingClassifier()),
    ]
)

In [123]:
train = basic_pipe.fit_transform(train_raw, train_raw['target'])
print(train.shape)

test = basic_pipe.transform(test_raw)
print(test.shape)

(14760, 10)
(1942, 10)


In [127]:
# xgboost
xgb = GradientBoostingClassifier()
xgb.fit(train, train_raw['target'])

xgb_probs = xgb.predict_proba(test)


xgb_preds = xgb.predict(test)
accuracy_score(test_raw['target'], xgb_preds)

0.548918640576725

In [125]:
lr = LogisticRegression()
lr.fit(train, train_raw['target'])

lr_probs = lr.predict_proba(test)

lr_preds = lr.predict(test)
print(accuracy_score(test_raw['target'], lr_preds))
print(roc_auc_score(test_raw['target'], lr_preds))

0.5509783728115345
0.5509783728115345


In [138]:
lr_probs[:, 1]

array([0.5670312 , 0.41300805, 0.41447621, ..., 0.42192117, 0.44632737,
       0.44046478])

In [139]:
pred_df = pd.DataFrame({
    "truth": test_raw['target'].tolist(),
    "lr_probs": lr_probs[:, 1],
    "lr_preds": lr_preds,
    "xgb_probs": xgb_probs[:, 1],
    "xgb_preds": xgb_preds,
})
pred_df

Unnamed: 0,truth,lr_probs,lr_preds,xgb_probs,xgb_preds
0,0,0.567031,1,0.654385,1
1,0,0.413008,0,0.407830,0
2,1,0.414476,0,0.429865,0
3,1,0.592311,1,0.603997,1
4,1,0.438253,0,0.448839,0
...,...,...,...,...,...
1937,1,0.549071,1,0.576663,1
1938,1,0.610918,1,0.546733,1
1939,0,0.421921,0,0.448612,0
1940,1,0.446327,0,0.448628,0


In [140]:
pred_df.describe()

Unnamed: 0,truth,lr_probs,lr_preds,xgb_probs,xgb_preds
count,1942.0,1942.0,1942.0,1942.0,1942.0
mean,0.5,0.500038,0.5,0.501027,0.49794
std,0.500129,0.085263,0.500129,0.0881,0.500125
min,0.0,0.353438,0.0,0.26554,0.0
25%,0.0,0.416713,0.0,0.42566,0.0
50%,0.5,0.500002,0.5,0.498204,0.0
75%,1.0,0.583137,1.0,0.581612,1.0
max,1.0,0.647292,1.0,0.699821,1.0


In [152]:
pred_df['is_alert'] = np.where(pred_df['xgb_probs'] >= 0.5, 1, 0)
pred_df['is_alert_right'] = np.where(pred_df['is_alert'] == pred_df['truth'], 1, 0)

print(pred_df['is_alert'].mean())
print(pred_df['is_alert_right'].mean())

pred_df

0.4979402677651905
0.548918640576725


Unnamed: 0,truth,lr_probs,lr_preds,xgb_probs,xgb_preds,is_alert,is_alert_right
0,0,0.567031,1,0.654385,1,1,0
1,0,0.413008,0,0.407830,0,0,1
2,1,0.414476,0,0.429865,0,0,0
3,1,0.592311,1,0.603997,1,1,1
4,1,0.438253,0,0.448839,0,0,0
...,...,...,...,...,...,...,...
1937,1,0.549071,1,0.576663,1,1,1
1938,1,0.610918,1,0.546733,1,1,1
1939,0,0.421921,0,0.448612,0,0,1
1940,1,0.446327,0,0.448628,0,0,0
