In [433]:
# Load raw data
from datetime import datetime, timedelta
from timeit import default_timer

import pandas as pd
from pandas import read_csv

# Team box scores: the feature loop further down iterates these rows and reads
# SEASON_ID, GAME_ID, TEAM_ID, GAME_DATE, MATCHUP and WL from each of them.
df_team_bx = read_csv('data/team_boxscore_stats_1997-19.csv')
# Player box scores: filtered by GAME_ID / TEAM_ID and sorted by MIN in guess_starters,
# then joined to the yearly player stats on PLAYER_ID.
df_player_bx = read_csv('data/player_boxscore_stats_1997-19_adv.csv')
    

Label: W/L
Features: 

home win/away win
rest days 
win/loss rate for last 10 games
won last game
p1_four_factors*4 (*5)


In [440]:
# Court markers: used as the label values (winner of a game) and, with a
# trailing "_", as the prefix of every per-team feature name.
HOME, AWAY = 'H', 'A'

# Player-level columns of interest: identity, the weighted stats, and the
# minutes used as weights. Not referenced by the other cells visible here.
REQ_PLAYER_STATS = [
    "PLAYER_ID", "PLAYER_NAME",
    "OREB_PCT", "DREB_PCT", "EFG_PCT", "AST_TO", "W_PCT",
    "MIN",
]


def features():
    ''' Return a fresh feature template for one game.

    Keys are "<court>_<stat>" with court "H" (home) or "A" (away), home keys
    first. Each value is a Python type object acting as a placeholder until
    the real value is written over it. A new dict is built on every call.
    '''
    per_team = (
        ('RD', int),        # rest days
        ('WLG', bool),      # won last game
        ('EFG', float),     # effective field goal %
        ('OREB', float),    # offensive rebound %
        ('DREB', float),    # defensive rebound %
        ('AST_TO', float),  # assist-to-turnover ratio
    )
    # Not part of the template (disabled in the original): WINNER,
    # <court>_WR (win ratio over the last 10 games), <court>_W (win percentage).
    return {
        court + '_' + stat: placeholder
        for court in ('H', 'A')
        for stat, placeholder in per_team
    }


def check_court(match):
    ''' Return AWAY when the second whitespace-separated token of ``match`` is "@", else HOME. '''
    # presumably MATCHUP looks like "AAA @ BBB" for road games — format not shown here, TODO confirm
    marker = match.split()[1]
    if marker == '@':
        return AWAY
    return HOME

def check_winner(game_inf):
    ''' Return HOME or AWAY, whichever side won the game described by this team row.

    ``game_inf`` must provide 'MATCHUP' (to tell which court this row's team
    played on) and 'WL' ('W' means this row's team won).
    '''
    court = check_court(game_inf['MATCHUP'])
    # A win belongs to the row's own court; anything else goes to the opposite side.
    if game_inf['WL'] == 'W':
        return court
    return AWAY if court == HOME else HOME

def guess_starters(players_bx, game_inf, n=10):
    ''' Pick up to ``n`` players of one team in one game, ordered by minutes played.

    The rows of ``players_bx`` matching the game and team are taken in their
    existing order, truncated to the first ``n``, and only then sorted by MIN
    (descending). The sort reorders the selection; it does not choose it.

    Args:
        players_bx: player box-score frame with GAME_ID, TEAM_ID and MIN columns.
        game_inf: mapping / row providing 'GAME_ID' and 'TEAM_ID'.
        n: maximum number of players to keep (default 10, the previous fixed value).

    Returns:
        DataFrame with at most ``n`` rows, sorted by MIN from highest to lowest.
    '''
    same_game = players_bx.GAME_ID == game_inf['GAME_ID']
    same_team = players_bx.TEAM_ID == game_inf['TEAM_ID']
    team_rows = players_bx[same_game & same_team]
    return team_rows.head(n).sort_values(by='MIN', ascending=False)


def assign_features(sample, game_inf, teams_meta, player_stats):
    ''' Write one team's features into ``sample`` in place.

    Every key is prefixed with the team's court (HOME or AWAY plus "_"), taken
    from the row's MATCHUP, so calling this once per team row fills both
    halves of a game sample.

    Args:
        sample: feature dict for the game; mutated in place.
        game_inf: team box-score row; reads 'MATCHUP', 'TEAM_ID' and
            'GAME_DATE' (first 10 characters parsed as %Y-%m-%d).
        teams_meta: dict keyed by team id holding 'last_game_dt' (datetime)
            and 'WLG' from the team's previous game; must contain this team.
        player_stats: frame with MIN, EFG_PCT, OREB_PCT, DREB_PCT, AST_TO and
            W_PCT columns.

    Returns:
        None.
    '''
    prefix = check_court(game_inf['MATCHUP']) + "_"
    previous = teams_meta[game_inf['TEAM_ID']]
    game_date = datetime.strptime(game_inf['GAME_DATE'][:10], '%Y-%m-%d')

    sample[prefix + 'RD'] = (game_date - previous['last_game_dt']).days  # rest days
    # sample[prefix + 'WR'] =  # win ratio for last 10 games (not implemented)
    sample[prefix + 'WLG'] = previous['WLG']  # won last game

    # Minutes-weighted averages. pandas sums skip NaN by default, so rows with
    # a missing MIN or a missing stat contribute nothing to that feature.
    # NOTE(review): if every MIN is missing or zero the weights are all NaN and
    # each feature below comes out as 0.0 — confirm that is acceptable.
    weights = player_stats['MIN'] / player_stats['MIN'].sum()
    weighted_stats = (
        ('EFG', 'EFG_PCT'),    # effective field goal %
        ('OREB', 'OREB_PCT'),  # offensive rebound %
        ('DREB', 'DREB_PCT'),  # defensive rebound %
        ('AST_TO', 'AST_TO'),  # assist-to-turnover ratio
        ('W_PCT', 'W_PCT'),    # win percentage
    )
    for feature, column in weighted_stats:
        sample[prefix + feature] = (weights * player_stats[column]).sum()

# Build one sample per game from the team box scores. Each game appears as two
# rows (one per team); the first row creates the sample and the label, the
# second row fills in the other court's features.
start_time = default_timer()
x_train = list()
y_train = list()
season_id = None
incomplete_sample = dict()  # GAME_ID -> index in x_train of a sample still waiting for its second team

for _, game_inf in df_team_bx.iterrows():
    # A change of SEASON_ID resets the per-team history and switches the player stats file.
    if season_id != game_inf['SEASON_ID']: #TODO: SEASON ID, not accuracy, Playoff is another season only for1999 season
        season_id = game_inf['SEASON_ID']
        teams_meta = dict()  # Keyed by Team ID {'ID': {'last_5_games', 'last_game_dt', 'WLG'}}
        # The stats file is chosen from the calendar year of the first row seen
        # for this SEASON_ID: year Y loads player_stats_<Y-1>-<Y>.csv.
        yr = game_inf['GAME_DATE'][:4]
        file_path = 'data/player_yearly_stats/player_stats_' + str(int(yr) - 1)[-2:] + '-' + yr[-2:] + '.csv'
        df_player_stats_hist = read_csv(file_path)
        print('Processing Season' + yr + '...')

    game_id, team_id = game_inf['GAME_ID'], game_inf['TEAM_ID']
    # What this row leaves behind for the team's next game (date played, won or not).
    team_state = {
        'last_game_dt': datetime.strptime(game_inf['GAME_DATE'][:10], '%Y-%m-%d'),
        'WLG': bool(game_inf['WL'] == 'W')
    }

    # First game of a team in a season: no previous game to derive rest days /
    # last result from, so only record the state and skip feature generation.
    if team_id not in teams_meta:
        teams_meta[team_id] = team_state
        continue

    #TODO: implement winning streaks (keep the last 5 results per team in teams_meta)

    if game_id in incomplete_sample:
        # Second team of a game already started: reuse and complete its sample.
        sample = x_train[incomplete_sample.pop(game_id)]
    else:
        sample = features()
        x_train.append(sample)
        y_train.append(check_winner(game_inf))
        incomplete_sample[game_id] = len(x_train) - 1

    starters = guess_starters(df_player_bx, game_inf)
    # Right join keeps every selected player; box-score columns that clash with
    # the yearly stats get the '_BX' suffix, so unsuffixed names are the yearly ones.
    starters_stats = df_player_stats_hist.merge(starters, on='PLAYER_ID', how='right', suffixes=('', '_BX'))

    #TODO: Assign residual stats, deal with non-existent stats

    assign_features(sample, game_inf, teams_meta, starters_stats)
    teams_meta[team_id] = team_state

print('Cleaning incomplete samples...')
# Drop samples (and their labels) that only ever received one team's features.
dropped = set(incomplete_sample.values())
x_train = [sample for idx, sample in enumerate(x_train) if idx not in dropped]
y_train = [winner for idx, winner in enumerate(y_train) if idx not in dropped]

end_time = default_timer()
print(len(x_train), ' Samples Generated in ', timedelta(seconds=end_time-start_time))

pd.DataFrame(x_train).to_csv('feature_data/x_train_top10players.csv')
pd.DataFrame(y_train).to_csv('feature_data/y_train_top10players.csv')

Processing Season1997...
Processing Season1999...
Processing Season1999...
Processing Season2000...
Processing Season2001...
Processing Season2002...
Processing Season2003...
Processing Season2004...
Processing Season2005...
Processing Season2006...
Processing Season2007...
Processing Season2008...
Processing Season2009...
Processing Season2010...
Processing Season2011...
Processing Season2012...
Processing Season2013...
Processing Season2014...
Processing Season2015...
Processing Season2016...
Processing Season2017...
Processing Season2018...
Cleaning incomplete samples...
25694  Samples Generated in  0:08:12.982610


In [437]:
# Writes the current x_train / y_train under the "top7players" file names.
# NOTE(review): the feature cell above builds x_train from the first 10
# box-score rows per team and saves it as *_top10players.csv, so running the
# notebook top to bottom stores that same 10-player data under the top7 names.
# Presumably a leftover from an earlier 7-player run — confirm before re-running.
pd.DataFrame(x_train).to_csv('feature_data/x_train_top7players.csv')
pd.DataFrame(y_train).to_csv('feature_data/y_train_top7players.csv')

{'H_RD': 1,
 'H_WLG': False,
 'H_EFG': 0.48755163727959705,
 'H_OREB': 0.06142317380352645,
 'H_DREB': 0.11754911838790934,
 'H_AST_TO': 1.6630142737195635,
 'A_RD': int,
 'A_WLG': bool,
 'A_EFG': float,
 'A_OREB': float,
 'A_DREB': float,
 'A_AST_TO': float,
 'H_W_PCT': 0.2025247691015953}

In [465]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. 
#It considers both the precision p and the recall r of the test to compute 
#the score: p is the number of correct positive results divided by the number of 
#all positive results, and r is the number of correct positive results divided by 
#the number of positive results that should have been returned. The F1 score can be 
#interpreted as a weighted average of the precision and recall, where an F1 score 
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
#produces a prediction model in the form of an ensemble of weak prediction models, typically decision tree
#import xgboost as xgb
#the outcome (dependent variable) has only a limited number of possible values. 
#Logistic Regression is used when response variable is categorical in nature.
from sklearn.linear_model import LogisticRegression
#A random forest is a meta estimator that fits a number of decision tree classifiers 
#on various sub-samples of the dataset and use averaging to improve the predictive 
#accuracy and control over-fitting.
from sklearn.ensemble import RandomForestClassifier
#a discriminative classifier formally defined by a separating hyperplane.
from sklearn.svm import SVC


# Stratified 80/20 hold-out split with a fixed seed. The training labels are
# bound to `Y_train` (capital Y) because the model cells refer to that name.
feature_matrix = pd.DataFrame(x_train).values
labels = np.array(y_train)
X_train, X_test, Y_train, y_test = train_test_split(
    feature_matrix,
    labels,
    test_size=0.2,
    random_state=2,
    stratify=labels,
)
def train_classifier(clf, X_train, y_train):
    ''' Fit ``clf`` on the given data and print how long the fit took. '''
    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    print("Trained model in {:.4f} seconds".format(elapsed))

    
def predict_labels(clf, features, target):
    ''' Predict with a fitted classifier and score the result against ``target``.

    Prints the prediction time and returns ``(f1, accuracy)``: the F1 score
    with 'H' as the positive label, and the fraction of predictions equal to
    the target. Note the ``features`` parameter is the input matrix here, not
    the ``features()`` template function.
    '''
    tic = time()
    y_pred = clf.predict(features)
    toc = time()
    print("Made predictions in {:.4f} seconds.".format(toc - tic))

    f1 = f1_score(target, y_pred, pos_label='H')
    n_correct = sum(target == y_pred)
    accuracy = n_correct / float(len(y_pred))
    return f1, accuracy


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Fit ``clf``, then print its F1 and accuracy on the training and test sets. '''
    # Announce which model is being trained and on how many samples
    model_name = clf.__class__.__name__
    n_train = len(X_train)
    print("Training a {} using a training set size of {}. . .".format(model_name, n_train))

    train_classifier(clf, X_train, y_train)

    # Scores on the data the model was fitted on (raw values first, then formatted)
    train_f1, train_acc = predict_labels(clf, X_train, y_train)
    print(train_f1, train_acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    # Scores on the held-out data
    test_f1, test_acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))

from sklearn.neighbors import KNeighborsClassifier


# Two baseline models: logistic regression and an RBF-kernel SVM, both seeded.
clf_A = LogisticRegression(random_state=42)
clf_B = SVC(kernel='rbf', random_state=912)

# Boosting builds an accurate predictor by combining many rough, moderately
# inaccurate rules of thumb. The XGBoost model stays disabled:
#clf_C = xgb.XGBClassifier(seed = 82)

# Train and score each baseline on the same split, with a blank line after each report.
for baseline in (clf_A, clf_B):
    train_predict(baseline, X_train, Y_train, X_test, y_test)
    print('')


Training a LogisticRegression using a training set size of 20555. . .
Trained model in 0.2085 seconds




Made predictions in 0.0149 seconds.
0.7389370853685208 0.651568961323279
F1 score and accuracy score for training set: 0.7389 , 0.6516.
Made predictions in 0.0050 seconds.
F1 score and accuracy score for test set: 0.7358 , 0.6472.

Training a SVC using a training set size of 20555. . .




Trained model in 16.3043 seconds
Made predictions in 9.7055 seconds.
0.7515574214517877 0.6430065677450741
F1 score and accuracy score for training set: 0.7516 , 0.6430.
Made predictions in 2.3816 seconds.
F1 score and accuracy score for test set: 0.7467 , 0.6332.



In [455]:
# k-nearest-neighbours classifier with k = 8, trained and scored on the same split.
clf_C = KNeighborsClassifier(n_neighbors =8)
train_predict(clf_C, X_train, Y_train, X_test, y_test)
print('')

Training a KNeighborsClassifier using a training set size of 20555. . .
Trained model in 0.2852 seconds
Made predictions in 1.6027 seconds.
0.740139211136891 0.6948674288494283
F1 score and accuracy score for training set: 0.7401 , 0.6949.
Made predictions in 0.3860 seconds.
F1 score and accuracy score for test set: 0.6423 , 0.5824.



In [452]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with default settings, trained and scored on the same split.
clf_D = LinearDiscriminantAnalysis()
train_predict(clf_D, X_train, Y_train, X_test, y_test)

Training a LinearDiscriminantAnalysis using a training set size of 20555. . .
Trained model in 0.0559 seconds
Made predictions in 0.0020 seconds.
0.7362593040736258 0.6500608124543906
F1 score and accuracy score for training set: 0.7363 , 0.6501.
Made predictions in 0.0010 seconds.
F1 score and accuracy score for test set: 0.7363 , 0.6497.


In [456]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees capped at depth 2, with a fixed seed,
# trained and scored on the same split.
clf_E = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
train_predict(clf_E, X_train, Y_train, X_test, y_test)

Training a RandomForestClassifier using a training set size of 20555. . .
Trained model in 0.9783 seconds
Made predictions in 0.1223 seconds.
0.7544375 0.61770858671856
F1 score and accuracy score for training set: 0.7544 , 0.6177.
Made predictions in 0.0364 seconds.
F1 score and accuracy score for test set: 0.7507 , 0.6104.


In [457]:
# Inspection only: x_train is the plain Python list of per-game feature dicts, not a DataFrame.
type(x_train)

list

In [467]:
# Dump the feature matrix to disk; np.save appends ".npy", so this writes "test.npy".
# NOTE(review): with int / bool / float columns mixed, `.values` is presumably an
# object array, which np.load only reads back with allow_pickle=True — confirm.
np.save('test', pd.DataFrame(x_train).values)