In [None]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = None


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, roc_auc_score


In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that narrows its input down to a fixed set of columns.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to keep, in the order given. ``None`` (the default) or an
        empty list means "keep everything": ``transform`` then returns its
        input unchanged.
    """

    def __init__(self, columns=None):
        # Default is None instead of a list literal: a mutable default is
        # created once and shared by every instance built without an argument,
        # so mutating it on one selector would silently affect the others.
        # The value is stored as passed, without modification.
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the selector has no fitted state."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``, or ``X`` itself when no columns are set."""
        # Falsy covers both None and an empty list.
        if not self.columns:
            return X
        return X[self.columns]

In [None]:
# Load the per-game dataset and keep only rows where is_covid_bubble == 0.
df = pd.read_csv("../data/game_data_2.csv")
df = df.loc[df["is_covid_bubble"].eq(0)].copy()
print(len(df))

# Season-based train/test split:
#   * train_raw: every season except 2019, 2020, 2021 and 2022
#   * test_raw:  the 2019 season only
# NOTE(review): the previous comment described 2021 and 2022 as the hold-out
# seasons, which does not match these filters -- confirm the intended split.
train_raw = df.loc[~df["season_start_year"].isin([2019, 2020, 2021, 2022])]
test_raw = df.loc[df["season_start_year"].isin([2019])]

print(train_raw.shape)
print(test_raw.shape)

In [None]:
# Sanity check: list the distinct seasons present in the training split.
train_raw['season_start_year'].unique()

In [None]:
# test_raw.season_start_year.unique()

In [None]:
# Show every column name in the loaded frame.
df.columns

In [None]:
# Feature columns selected for modelling. Only the uncommented names are part
# of the list; commented-out names are currently not selected and can be
# toggled on by uncommenting them.
keeps = [
    # "Team_ID",
    # "Game_ID",
    # "GAME_DATE",
    # "MATCHUP",
    # "WL",
    # "W",
    # "L",
    # "PTS",
    # "team",
    # "season_start_year",
    # "opponent_abbreviation",
    # "team_abbreviation",
    # "is_covid_bubble",
    # "city_team_key",
    # "abbreviation",
    # "Latitude",
    # "Longitude",
    # "Latitude_opponent",
    # "Longitude_opponent",
    # "PTS_opponent",
    # "point_difference",
    # "game_lat",
    # "game_lon",
    # "prev_game_lon",
    # "prev_game_lat",
    # "distance_from_previous_game",
    # "start_of_streak",
    # "streak_id",
    # "streak_counter",
    # "days_since_1_games_ago",
    # "days_since_2_games_ago",
    # "days_since_3_games_ago",
    # "days_since_4_games_ago",
    # "days_since_5_games_ago",
    # "days_since_6_games_ago",
    # "days_since_7_games_ago",
    # -------------------------
    # "is_last_game_home",
    "incoming_away_game_streak",
    "is_home_game",
    "num_games_last_4_days",
    "num_games_last_7_days",
    "cumulative_distance",
    "incoming_away_game_streak_opponent",
    "days_since_1_games_ago_opponent",
    "num_games_last_4_days_opponent",
    "num_games_last_7_days_opponent",
    "cumulative_distance_opponent",
    # "days_of_rest_difference",
    # "games_last_7_diff",
    # "distance_difference",
    # -------------------------
    # TODO (feature ideas) ---------------
    # time zone and hours gained/lost
    # time diff with hour
    # num away games in last 7 and in last 5
    #  ---------------
    # "target",
]

# Non-feature columns that are placed in front of the `keeps` features when
# rows are exported to CSV (the exports select `id_cols + keeps`).
# The list also carries "target", the column used as the label when fitting.
id_cols = [
    "Team_ID",
    "Game_ID",
    "GAME_DATE",
    "MATCHUP",
    "WL",
    "W",
    "L",
    "PTS",
    "team",
    "season_start_year",
    "opponent_abbreviation",
    "team_abbreviation",
    "target",
]

In [None]:
# Export the 2019-2022 seasons (id columns followed by the selected features)
# to CSV, reporting the exported shape first.
test_qlik = df.loc[
    df["season_start_year"].isin([2019, 2020, 2021, 2022]),
    id_cols + keeps,
].copy()
print(test_qlik.shape)
test_qlik.to_csv("../data/qlik_test.csv", index=False)

In [None]:
# Export every season outside 2019-2022 (id columns followed by the selected
# features) to CSV, then print the shape and display the frame.
temp_df = df.loc[
    ~df["season_start_year"].isin([2019, 2020, 2021, 2022]),
    id_cols + keeps,
].copy()
temp_df.to_csv("../data/qlik-test-1.csv", index=False)
print(temp_df.shape)
temp_df

In [None]:

# Partition the selected feature columns by dtype and report both groups.
numeric_cols = list(temp_df[keeps].select_dtypes(include="number").columns)
categorical_cols = list(
    temp_df[keeps].select_dtypes(include=["object", "category"]).columns
)
print(f"numeric: {numeric_cols}")
print(f"categorical: {categorical_cols}")

In [None]:
# Preprocessing-only pipeline: select the `keeps` columns, then standardise
# them. No estimator step is attached; models are fitted separately on the
# transformed arrays.
basic_pipe = Pipeline(
    steps=[
        ("column_selector", ColumnSelector(columns=keeps)),
        ("scalar", StandardScaler()),
        # ("model", GradientBoostingClassifier()),
    ]
)

In [None]:
# Fit the preprocessing pipeline on the training split and transform it, then
# apply the already-fitted pipeline to the test split.
train = basic_pipe.fit_transform(X=train_raw, y=train_raw["target"])
test = basic_pipe.transform(test_raw)

print(train.shape)
print(test.shape)

In [None]:
# xgboost
xgb = GradientBoostingClassifier()
xgb.fit(train, train_raw['target'])

xgb_probs = xgb.predict_proba(test)


xgb_preds = xgb.predict(test)
accuracy_score(test_raw['target'], xgb_preds)

In [None]:
# Logistic-regression baseline fitted on the same preprocessed features.
lr = LogisticRegression()
lr.fit(train, train_raw['target'])

# Per-class probabilities and hard class predictions for the test split.
lr_probs = lr.predict_proba(test)
lr_preds = lr.predict(test)

print(accuracy_score(test_raw['target'], lr_preds))
# ROC AUC is a ranking metric, so it must be given continuous scores: the
# probability of the positive class (column 1 of predict_proba, i.e.
# lr.classes_[1]). Passing thresholded class predictions reduces the curve to
# a single operating point and reports a misleading value.
# Assumes `target` is binary -- TODO confirm.
print(roc_auc_score(test_raw['target'], lr_probs[:, 1]))

In [None]:
# Column 1 of the logistic-regression predict_proba output for every test row.
lr_probs[:, 1]

In [None]:
# Side-by-side table of the true label with each model's column-1 probability
# and hard prediction, one row per row of the test split.
pred_df = pd.DataFrame(
    dict(
        truth=test_raw["target"].tolist(),
        lr_probs=lr_probs[:, 1],
        lr_preds=lr_preds,
        xgb_probs=xgb_probs[:, 1],
        xgb_preds=xgb_preds,
    )
)
pred_df

In [None]:
# Summary statistics for every numeric column of the prediction table.
pred_df.describe()

In [None]:
# Flag rows whose gradient-boosting probability reaches 0.5, then record
# whether that flag equals the true label.
# NOTE(review): comparing the 0/1 flag with `truth` presumes the target is
# coded 0/1 -- confirm.
pred_df["is_alert"] = (pred_df["xgb_probs"] >= 0.5).astype(int)
pred_df["is_alert_right"] = (pred_df["is_alert"] == pred_df["truth"]).astype(int)

# Share of flagged rows, followed by the share of rows where the flag is right.
print(pred_df["is_alert"].mean())
print(pred_df["is_alert_right"].mean())

pred_df