In [None]:
!pip install tslearn
!pip install pyts

In [197]:
import numpy as np
import pandas as pd

In [7]:
agg_df = pd.read_csv('../data_ready/agg/batting_norm_agg.csv')

In [198]:
agg_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

Unnamed: 0,ab,bb,double,g,h,hbp,hr,player_id,r,rbi,sb,sh,so,triple,years_played,hof
0,-0.661588,-0.438871,-0.640879,-0.705525,-0.583966,-0.539028,-0.388342,acostme01,-0.539369,-0.609924,-0.335788,-0.665060,-0.794309,-0.310270,1.0,False
1,-1.145500,-0.634319,-1.124076,-1.026475,-1.017650,-1.125153,-0.813317,acostme01,-0.903027,-1.153566,-0.636014,-1.424980,-0.892177,-0.404662,2.0,False
2,-1.161406,-0.031041,-1.378979,-0.693510,-1.160762,-0.056680,-1.235447,acostme01,-0.879855,-1.079926,-0.389503,-1.058649,-1.112142,-0.762685,3.0,False
3,-1.954610,-0.615781,-2.052017,-1.647343,-1.893377,-0.604941,-1.638442,acostme01,-1.575987,-1.775687,-0.956969,-1.764229,-2.108656,-1.364642,4.0,False
4,-1.744661,-0.229190,-2.314481,-1.438181,-1.531243,-1.148788,-2.011442,acostme01,-1.190280,-1.700775,-0.966303,-1.027140,-2.202136,-0.952990,5.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43686,-0.448198,-0.350041,-0.506754,-0.542378,-0.443520,-0.452528,-0.364299,turnetr01,-0.407728,-0.547344,0.027295,-0.478008,-0.390894,-0.427949,1.0,False
43687,0.781068,0.410928,0.186854,0.614127,0.573732,0.309030,0.325219,urshegi01,0.382690,0.258928,-0.357982,0.058954,0.748672,0.186402,1.0,False
43688,-0.659393,-0.567460,-0.605841,-1.099213,-0.623035,-0.452528,-0.502202,waldrky02,-0.605333,-0.587657,-0.357982,-0.478008,-0.663400,-0.427949,1.0,False
43689,-0.551088,-0.513105,-0.308580,-0.949296,-0.503358,-0.452528,-0.364299,willima07,-0.486770,-0.466716,-0.357982,-0.478008,-0.613853,-0.427949,1.0,False


In [9]:
career_length = agg_df.groupby('player_id')['years_played'].max()
players_5yrs = career_length[career_length > 5]

# Classifying the time series samples

### Approaches:
* Zero pad, and throw into one time series classifier  
* Train one time series classifier for each length (1-year careers, 2-year careers, etc.)



In [216]:
X_all = np.load('../data_ready/ts/X_all.npy')
y_all = np.load('../data_ready/y_all.npy')
years_played_all = np.load('../data_ready/years_played_all.npy')
player_ids_all = np.load('../data_ready/player_ids_all.npy', allow_pickle=True)

### train / test split by HOF / non-hof players

In [218]:
hof_players = agg_df[agg_df['hof']]['player_id'].unique()
non_hof_players = agg_df[~agg_df['hof']]['player_id'].unique()
np.random.seed(1)
np.random.shuffle(hof_players), np.random.shuffle(non_hof_players);

In [219]:
train_ratio = 0.7
zeros_ratio = 1
n_hof, n_non = len(hof_players), len(non_hof_players)

train_hof, test_hof = hof_players[:int(n_hof*train_ratio)], hof_players[int(n_hof*train_ratio):]
train_non, test_non = non_hof_players[:int(n_non*train_ratio)], non_hof_players[int(n_non*train_ratio):]

train_non_sample = train_non[:int(len(train_non)*zeros_ratio)]

train_players, test_players = set(np.concatenate((train_hof, train_non_sample))), set(np.concatenate((test_hof, test_non)))
train_idxs = np.array([i for i in range(len(player_ids_all)) if player_ids_all[i] in train_players])
test_idxs = np.array([i for i in range(len(player_ids_all)) if player_ids_all[i] in test_players])

In [220]:
X_train, X_test = X_all[train_idxs], X_all[test_idxs]
y_train, y_test = y_all[train_idxs], y_all[test_idxs]
train_years_played, test_years_played = years_played_all[train_idxs], years_played_all[test_idxs]
train_player_ids, test_player_ids = player_ids_all[train_idxs], player_ids_all[test_idxs]

## Undo zero-padding + group idxs by # of samples

In [222]:
X_train_unpad, X_test_unpad = [], []
idx_train_by_year, idx_test_by_year = [[] for _ in range(26)], [[] for _ in range(26)]
for i in range(X_train.shape[0]):
    yrs_played = int(train_years_played[i])
    idx_train_by_year[yrs_played].append(i)
    X_train_unpad.append(X_train[i][:yrs_played])
for i in range(X_test.shape[0]):
    yrs_played = int(test_years_played[i])
    idx_test_by_year[yrs_played].append(i)
    X_test_unpad.append(X_test[i][:yrs_played])

### % of HOFers + # of samples for each career length

In [223]:
print('training data')
train_hist = [(np.sum(y_train[idx_train_by_year[i]]) / len(idx_train_by_year[i]), len(idx_train_by_year[i])) for i in range(1, 26)]
display(train_hist)
print('test data')
test_hist = [(np.sum(y_test[idx_test_by_year[i]]) / len(idx_test_by_year[i]), len(idx_test_by_year[i])) for i in range(1, 26)]
test_hist

training data


[(0.01791166293507022, 4913),
 (0.023441662226957913, 3754),
 (0.02861788617886179, 3075),
 (0.033820138355111454, 2602),
 (0.03951504265828469, 2227),
 (0.0446927374301676, 1969),
 (0.05086705202312139, 1730),
 (0.05778069599474721, 1523),
 (0.06722689075630252, 1309),
 (0.07963800904977375, 1105),
 (0.09209100758396534, 923),
 (0.11413043478260869, 736),
 (0.14513274336283186, 565),
 (0.16591928251121077, 446),
 (0.20057306590257878, 349),
 (0.252, 250),
 (0.3191489361702128, 188),
 (0.425, 120),
 (0.5443037974683544, 79),
 (0.7021276595744681, 47),
 (0.78125, 32),
 (0.8571428571428571, 21),
 (0.8, 10),
 (0.75, 4),
 (1.0, 1)]

test data


  test_hist = [(np.sum(y_test[idx_test_by_year[i]]) / len(idx_test_by_year[i]), len(idx_test_by_year[i])) for i in range(1, 26)]


[(0.018095238095238095, 2100),
 (0.023125, 1600),
 (0.028136882129277566, 1315),
 (0.03268551236749117, 1132),
 (0.03707414829659319, 998),
 (0.041666666666666664, 888),
 (0.04798962386511025, 771),
 (0.0549777117384844, 673),
 (0.060810810810810814, 592),
 (0.0728744939271255, 494),
 (0.08333333333333333, 420),
 (0.10174418604651163, 344),
 (0.12546125461254612, 271),
 (0.1588785046728972, 214),
 (0.19642857142857142, 168),
 (0.22962962962962963, 135),
 (0.25742574257425743, 101),
 (0.30434782608695654, 69),
 (0.37209302325581395, 43),
 (0.4444444444444444, 27),
 (0.5384615384615384, 13),
 (0.4, 10),
 (0.3333333333333333, 3),
 (0.0, 1),
 (nan, 0)]

## Random HOF and non-HOF samples:


In [224]:
np.random.seed(1)
hof_rand, non_hof_rand = np.random.choice(np.where(y_train == 1)[0]), np.random.choice(np.where(y_train == 0)[0])

In [226]:
for i in (hof_rand, non_hof_rand):
    display(X_train_unpad[i], train_years_played[i], train_player_ids[i])

array([[ 1.1397146 ,  2.02168166,  1.67783737,  1.08165126,  0.79330746,
         0.4307195 ,  1.43247401,  1.64431773,  0.89903381,  3.21099742,
         0.01175137,  2.02244887,  0.97868075],
       [ 1.83910241,  1.49446174,  2.36819163,  1.85646077,  1.6838071 ,
         0.92022933,  2.13481438,  2.35548097,  1.15177433,  2.77268543,
        -0.66056693,  1.58710637,  3.81924303]])

2.0

'bondsba01'

array([[-0.57857528, -0.59401745, -0.6408787 , -0.56246791, -0.60335458,
        -0.53902771,  0.0925411 , -0.42471865, -0.56422656, -0.3357883 ,
        -0.66506004, -0.3957359 , -0.57427077]])

1.0

'thorpji01'

# Models

### ROCKET 
hinge: train in 70 sec, 90 epochs  
log: train in 150 sec, 101 epochs





Trying 70/30 splits because 80/20 gives too few 'full career' test samples of HOFers

In [227]:
import numpy as np
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier

from sktime.transformers.series_as_features.rocket import Rocket

In [228]:
def eval_rocket(rocket_pipeline, X_test, y_test):
    ones_preds, zeros_preds, ones_score, zeros_score = None, None, None, None
    n_ones = np.sum(y_test)
    n_zeros = np.sum(1-y_test)
    try:
#         print('ones ({})'.format(np.sum(y_test)))
        ones_preds = rocket_pipeline.predict_proba(X_test[y_test == 1])
        ones_score = np.sum(np.argmax(ones_preds, axis=1)) / len(ones_preds)
#         print(ones_score)
    except:
        pass
    try:
#         print('zeros ({})'.format(np.sum(1-y_test)))
        zeros_preds = rocket_pipeline.predict_proba(X_test[y_test == 0])
        zeros_score = np.sum(1-np.argmax(zeros_preds, axis=1)) / len(zeros_preds)
#         print(zeros_score)
    except:
        pass
    
    return ones_preds, zeros_preds, (ones_score, n_ones), (zeros_score, n_zeros)

In [229]:
def eval_rocket_by_year(rocket_pipeline, X_test, y_test, agg_df, test_player_ids, test_years_played, test_zeros_ratio=1, years_before_end=-1):
    
    if years_before_end != -1:
        player_career_lengths = agg_df.groupby('player_id')['years_played'].max().to_dict()
        test_idxs_full_careers = [i for i in range(len(X_test)) if player_career_lengths[test_player_ids[i]] - years_before_end == test_years_played[i]]

        X_test_full_careers, y_test_full_careers = X_test[test_idxs_full_careers], y_test[test_idxs_full_careers]
        
        test_0s = np.where(y_test_full_careers==0)[0]
        np.random.seed(1)
        idxs_sample_0s = np.random.choice(test_0s, size=int(len(test_0s) * test_zeros_ratio), replace=False)
        
        idxs = np.concatenate((idxs_sample_0s, np.where(y_test_full_careers==1)[0]))

        return eval_rocket(rocket_pipeline, X_test_full_careers[idxs], y_test_full_careers[idxs])
    else:
        return eval_rocket(rocket_pipeline, X_test, y_test)
        

In [230]:
rocket_pipeline_sgd = make_pipeline(Rocket(), SGDClassifier(loss='log', eta0=0.001, learning_rate='adaptive', verbose=1, ))

In [231]:
rocket_pipeline_sgd.fit(X_train, y_train)

-- Epoch 1
Norm: 10.71, NNZs: 20000, Bias: -0.003125, T: 27978, Avg. loss: 4.955015
Total training time: 1.25 seconds.
-- Epoch 2
Norm: 14.55, NNZs: 20000, Bias: -0.001824, T: 55956, Avg. loss: 2.492977
Total training time: 2.48 seconds.
-- Epoch 3
Norm: 17.11, NNZs: 20000, Bias: -0.003396, T: 83934, Avg. loss: 1.658427
Total training time: 3.72 seconds.
-- Epoch 4
Norm: 18.90, NNZs: 20000, Bias: -0.002649, T: 111912, Avg. loss: 1.307963
Total training time: 4.97 seconds.
-- Epoch 5
Norm: 20.35, NNZs: 20000, Bias: -0.003238, T: 139890, Avg. loss: 1.047719
Total training time: 6.22 seconds.
-- Epoch 6
Norm: 21.49, NNZs: 20000, Bias: -0.004017, T: 167868, Avg. loss: 0.879933
Total training time: 7.46 seconds.
-- Epoch 7
Norm: 22.29, NNZs: 20000, Bias: -0.003272, T: 195846, Avg. loss: 0.736593
Total training time: 8.70 seconds.
-- Epoch 8
Norm: 22.94, NNZs: 20000, Bias: -0.003158, T: 223824, Avg. loss: 0.629732
Total training time: 9.93 seconds.
-- Epoch 9
Norm: 23.64, NNZs: 20000, Bias: 

Pipeline(steps=[('rocket', Rocket()),
                ('sgdclassifier',
                 SGDClassifier(eta0=0.001, learning_rate='adaptive', loss='log',
                               verbose=1))])

In [232]:
_, _, ones_score, zeros_score = eval_rocket_by_year(rocket_pipeline_sgd, X_test, y_test, agg_df, test_player_ids, test_years_played, years_before_end=0)
ones_score, zeros_score

((0.2894736842105263, 38), (0.986905916585839, 2062))

In [233]:
ones_preds, zeros_preds, ones_scores, zeros_scores = [], [], [], []
for i in range(26):
    ones_pred, zeros_pred, ones_score, zeros_score = eval_rocket_by_year(rocket_pipeline_sgd, X_test, y_test, agg_df, test_player_ids, test_years_played, years_before_end=i)
    ones_scores.append(ones_score), zeros_scores.append(zeros_score)
    ones_preds.append(ones_pred), zeros_preds.append(zeros_pred)

In [None]:
[x.max(axis=1).mean(axis=0) for x in ones_preds if x is not None]

In [234]:
ones_scores, zeros_scores

([(0.2894736842105263, 38),
  (0.16216216216216217, 37),
  (0.16216216216216217, 37),
  (0.1891891891891892, 37),
  (0.13513513513513514, 37),
  (0.16216216216216217, 37),
  (0.16216216216216217, 37),
  (0.21621621621621623, 37),
  (0.19444444444444445, 36),
  (0.16666666666666666, 36),
  (0.14285714285714285, 35),
  (0.11428571428571428, 35),
  (0.08823529411764706, 34),
  (0.08823529411764706, 34),
  (0.06060606060606061, 33),
  (0.0967741935483871, 31),
  (0.0, 26),
  (0.047619047619047616, 21),
  (0.0, 16),
  (0.0, 12),
  (0.14285714285714285, 7),
  (0.0, 4),
  (0.0, 1),
  (None, 0),
  (None, 0),
  (None, 0)],
 [(0.986905916585839, 2062),
  (0.9865642994241842, 1563),
  (0.9843505477308294, 1278),
  (0.9808219178082191, 1095),
  (0.9729448491155047, 961),
  (0.9647473560517039, 851),
  (0.9754768392370572, 734),
  (0.9732704402515723, 636),
  (0.9730215827338129, 556),
  (0.9737991266375546, 458),
  (0.9636363636363636, 385),
  (0.970873786407767, 309),
  (0.9746835443037974, 237),

In [187]:
ones_scores, zeros_scores

([(0.2894736842105263, 38),
  (0.21621621621621623, 37),
  (0.21621621621621623, 37),
  (0.21621621621621623, 37),
  (0.13513513513513514, 37),
  (0.13513513513513514, 37),
  (0.16216216216216217, 37),
  (0.16216216216216217, 37),
  (0.1111111111111111, 36),
  (0.1111111111111111, 36),
  (0.08571428571428572, 35),
  (0.05714285714285714, 35),
  (0.08823529411764706, 34),
  (0.08823529411764706, 34),
  (0.0, 33),
  (0.03225806451612903, 31),
  (0.0, 26),
  (0.19047619047619047, 21),
  (0.0, 16),
  (0.08333333333333333, 12),
  (0.14285714285714285, 7),
  (0.0, 4),
  (0.0, 1),
  (None, 0),
  (None, 0),
  (None, 0)],
 [(0.9742967992240543, 2062),
  (0.9833653230966091, 1563),
  (0.9780907668231612, 1278),
  (0.9780821917808219, 1095),
  (0.9802289281997919, 961),
  (0.972972972972973, 851),
  (0.9754768392370572, 734),
  (0.9685534591194969, 636),
  (0.960431654676259, 556),
  (0.9716157205240175, 458),
  (0.9558441558441558, 385),
  (0.948220064724919, 309),
  (0.9535864978902954, 237),
 

In [43]:
rocket_pipeline_lr = make_pipeline(Rocket(), LogisticRegression())

In [44]:
rocket_pipeline_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('rocket', Rocket()),
                ('logisticregression', LogisticRegression())])

In [46]:
ones_preds_lr, zeros_preds_lr, ones_scores_lr, zeros_scores_lr = [], [], [], []
for i in range(26):
    ones_pred, zeros_pred, ones_score, zeros_score = eval_rocket_by_year(rocket_pipeline_lr, X_test, y_test, agg_df, test_player_ids, test_years_played, years_before_end=i)
    ones_scores_lr.append(ones_score), zeros_scores_lr.append(zeros_score)
    ones_preds_lr.append(ones_pred), zeros_preds_lr.append(zeros_pred)

In [58]:
[x.max(axis=1).mean(axis=0) for x in ones_preds_lr if x is not None]

[0.9728662059137236,
 0.9929342601925164,
 0.99346416387623,
 0.9967988207994575,
 0.9909905834569106,
 0.9765439904964117,
 0.9949735651804014,
 0.9598247171690968,
 0.9768766756414441,
 0.9469910682985063,
 0.9434483565245709,
 0.9637201689063353,
 0.8862348063154448,
 0.9023997168849658,
 0.9327165148198022,
 0.8996094527926582,
 0.9511596148214464,
 0.9165886934148301,
 0.9456244967745924,
 0.9451336230178454,
 0.9314653422498586,
 0.8932411574535032,
 0.9950144202879145]

In [47]:
ones_scores_lr, zeros_scores_lr

([(0.9444444444444444, 36),
  (0.9761904761904762, 42),
  (0.967741935483871, 31),
  (0.9761904761904762, 42),
  (1.0, 40),
  (1.0, 45),
  (1.0, 38),
  (0.9333333333333333, 30),
  (0.9777777777777777, 45),
  (0.9210526315789473, 38),
  (0.9142857142857143, 35),
  (0.8529411764705882, 34),
  (0.6666666666666666, 21),
  (0.7105263157894737, 38),
  (0.6206896551724138, 29),
  (0.6071428571428571, 28),
  (0.4230769230769231, 26),
  (0.4166666666666667, 24),
  (0.29411764705882354, 17),
  (0.14285714285714285, 14),
  (0.2222222222222222, 9),
  (0.5, 8),
  (0.0, 2),
  (None, 0),
  (None, 0),
  (None, 0)],
 [(0.9985822306238186, 2116),
  (0.9955974842767296, 1590),
  (0.9960063897763578, 1252),
  (0.9953401677539608, 1073),
  (0.997920997920998, 962),
  (0.9939172749391727, 822),
  (0.9945130315500685, 729),
  (0.9950166112956811, 602),
  (0.9881188118811881, 505),
  (0.9909706546275395, 443),
  (0.9886039886039886, 351),
  (0.9965034965034965, 286),
  (0.9950980392156863, 204),
  (0.99425287

### LSTM

In [None]:
pip install tensorflow

In [235]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import to_categorical

In [269]:
def train_model(trainX, trainy):
    verbose, epochs, batch_size = 1, 20, 64
    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainy.shape[1]
    model = Sequential()
    model.add(LSTM(100, input_shape=(n_timesteps,n_features)))
    model.add(LSTM(100, input_shape=(100,)))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, verbose=verbose, class_weight={0:0.056, 1:1.0})
    return model

In [263]:
def eval_lstm(lstm, X_test, y_test):
    ones_preds, zeros_preds, ones_score, zeros_score = None, None, None, None
    n_ones = np.sum(y_test)
    n_zeros = np.sum(1-y_test)
    try:
#         print('ones ({})'.format(np.sum(y_test)))
        ones_preds = lstm.predict(X_test[y_test == 1], batch_size=64, verbose=0)

        ones_score = np.sum(np.argmax(ones_preds, axis=1)) / len(ones_preds)
#         print(ones_score)
    except:
        pass
    try:
#         print('zeros ({})'.format(np.sum(1-y_test)))
        zeros_preds = lstm.predict(X_test[y_test == 0], batch_size=64, verbose=0)
        zeros_score = np.sum(1-np.argmax(zeros_preds, axis=1)) / len(zeros_preds)
#         print(zeros_score)
    except:
        pass
    
    return ones_preds, zeros_preds, (ones_score, n_ones), (zeros_score, n_zeros)

In [238]:
def eval_lstm_by_year(lstm, X_test, y_test, agg_df, test_player_ids, test_years_played, test_zeros_ratio=1, years_before_end=-1):
    
    if years_before_end != -1:
        player_career_lengths = agg_df.groupby('player_id')['years_played'].max().to_dict()
        test_idxs_full_careers = [i for i in range(len(X_test)) if player_career_lengths[test_player_ids[i]] - years_before_end == test_years_played[i]]

        X_test_full_careers, y_test_full_careers = X_test[test_idxs_full_careers], y_test[test_idxs_full_careers]
        
        test_0s = np.where(y_test_full_careers==0)[0]
        np.random.seed(1)
        idxs_sample_0s = np.random.choice(test_0s, size=int(len(test_0s) * test_zeros_ratio), replace=False)
        
        idxs = np.concatenate((idxs_sample_0s, np.where(y_test_full_careers==1)[0]))

        return eval_lstm(lstm, X_test_full_careers[idxs], y_test_full_careers[idxs])
    else:
        return eval_rocket(lstm, X_test, y_test)
        

In [239]:
sum(y_train) / len(y_train)

0.05647294302666381

In [270]:
lstm = train_model(X_train, to_categorical(y_train))

ValueError: Input 0 of layer lstm_16 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 100]

In [242]:
ones_preds_lstm, zeros_preds_lstm, ones_scores_lstm, zeros_scores_lstm = [], [], [], []
for i in range(26):
    ones_pred, zeros_pred, ones_score, zeros_score = eval_lstm_by_year(lstm, X_test, y_test, agg_df, test_player_ids, test_years_played, years_before_end=i)
    ones_scores_lstm.append(ones_score), zeros_scores_lstm.append(zeros_score)
    ones_preds_lstm.append(ones_pred), zeros_preds_lstm.append(zeros_pred)

In [154]:
X_babe_ruth = X_all[np.where(player_ids_all == 'ruthba01')]
lstm.predict(X_babe_ruth)

array([[8.2354128e-01, 1.7645872e-01],
       [1.5211278e-01, 8.4788722e-01],
       [9.5069297e-02, 9.0493071e-01],
       [5.3681238e-03, 9.9463189e-01],
       [2.4270816e-03, 9.9757296e-01],
       [1.6701411e-03, 9.9832982e-01],
       [2.0846217e-03, 9.9791533e-01],
       [1.8395528e-03, 9.9816042e-01],
       [1.3342579e-03, 9.9866569e-01],
       [1.2906342e-03, 9.9870932e-01],
       [1.1719392e-03, 9.9882811e-01],
       [1.0368337e-03, 9.9896312e-01],
       [1.4069162e-03, 9.9859303e-01],
       [2.0288890e-03, 9.9797112e-01],
       [1.6506033e-03, 9.9834943e-01],
       [1.1700179e-03, 9.9883002e-01],
       [7.1390590e-04, 9.9928612e-01],
       [6.4166368e-04, 9.9935836e-01],
       [2.9888004e-01, 7.0111996e-01],
       [1.2681215e-02, 9.8731881e-01],
       [2.4252464e-03, 9.9757475e-01],
       [8.7468891e-04, 9.9912530e-01]], dtype=float32)

In [243]:
ones_scores_lstm, zeros_scores_lstm

([(0.42105263157894735, 38),
  (0.4594594594594595, 37),
  (0.4864864864864865, 37),
  (0.4864864864864865, 37),
  (0.4864864864864865, 37),
  (0.4594594594594595, 37),
  (0.43243243243243246, 37),
  (0.40540540540540543, 37),
  (0.3611111111111111, 36),
  (0.3333333333333333, 36),
  (0.2857142857142857, 35),
  (0.2857142857142857, 35),
  (0.29411764705882354, 34),
  (0.35294117647058826, 34),
  (0.3333333333333333, 33),
  (0.25806451612903225, 31),
  (0.2692307692307692, 26),
  (0.2857142857142857, 21),
  (0.3125, 16),
  (0.25, 12),
  (0.2857142857142857, 7),
  (0.0, 4),
  (0.0, 1),
  (None, 0),
  (None, 0),
  (None, 0)],
 [(0.9917555771096024, 2062),
  (0.9846449136276392, 1563),
  (0.9780907668231612, 1278),
  (0.9707762557077626, 1095),
  (0.9656607700312175, 961),
  (0.9529964747356052, 851),
  (0.9400544959128065, 734),
  (0.9449685534591195, 636),
  (0.9406474820143885, 556),
  (0.9344978165938864, 458),
  (0.9246753246753247, 385),
  (0.9223300970873787, 309),
  (0.915611814345

In [89]:
[x.max(axis=1).mean(axis=0) for x in ones_preds_lstm if x is not None]

[0.99811906,
 0.9997349,
 0.99912983,
 0.999557,
 0.9969489,
 0.9991829,
 0.9981535,
 0.989467,
 0.9898754,
 0.9688486,
 0.9715613,
 0.942761,
 0.9331082,
 0.92390203,
 0.9329607,
 0.9088898,
 0.897143,
 0.94207376,
 0.90502095,
 0.9022516,
 0.9083192,
 0.7924831,
 0.9861785]

### KNN

In [10]:
from tslearn.utils import to_time_series_dataset
from tslearn.neighbors import KNeighborsTimeSeriesClassifier 

In [142]:
def eval_knn(knn, X_test, y_test):
    ones_preds, zeros_preds = None, None
    try:
        ones_preds = knn.predict([X_test[i] for i in np.where(y_test == 1)[0]])
        print(np.sum(ones_preds) / len(ones_preds))
    except:
        pass
    try:
        zeros_preds = knn.predict([X_test[i] for i in np.where(y_test == 0)[0]])
        print(np.sum(1-zeros_preds) / len(zeros_preds))
    except:
        pass
    
    return ones_preds, zeros_preds

### 1 big knn (0-padded)

#### eval on players at end of career
on 10% of test zeros:
10% of train 0s: 100% 1s, 94% 0s  
50% of train 0s: ??% 1s, 97% 0s  
75% of train 0s: ??% 1s, ??% 0s  

In [193]:
def eval_end_career_knn(X_train, y_train, X_test, y_test, agg_df, train_player_ids, test_player_ids, train_years_played, test_years_played, train_zeros_ratio=1, test_zeros_ratio=1, end_careers=True):
    if train_zeros_ratio != 1:
        np.random.seed(1)
        idxs_train_0s = np.where(y_train == 0)[0]
        idxs_train_1s = np.where(y_train == 1)[0]
        ratio_0s = np.random.choice(idxs_train_0s, size=int(len(idxs_train_0s) * train_zeros_ratio), replace=False)
        X_train_ratio_0s = [X_train[i] for i in ratio_0s] + [X_train[i] for i in idxs_train_1s]
        y_train_ratio_0s = [y_train[i] for i in ratio_0s] + [y_train[i] for i in idxs_train_1s]
    else:
        X_train_ratio_0s = X_train
        y_train_ratio_0s = y_train

    knn_all = KNeighborsTimeSeriesClassifier(n_neighbors=1)
    knn_all.fit(X_train_ratio_0s, y_train_ratio_0s)
    
    if end_careers:
        player_career_lengths = agg_df.groupby('player_id')['years_played'].max().to_dict()
        test_idxs_full_careers = [i for i in range(len(X_test)) if player_career_lengths[test_player_ids[i]] == test_years_played[i]]
        display(len(test_idxs_full_careers) / len(X_test))

        X_test_full_careers, y_test_full_careers = X_test[test_idxs_full_careers], y_test[test_idxs_full_careers]
        
        
        
        test_0s = np.where(y_test_full_careers==0)[0]
        np.random.seed(1)
        idxs_sample_0s = np.random.choice(test_0s, size=int(len(test_0s) * test_zeros_ratio))
        
        idxs = np.concatenate((idxs_sample_0s, np.where(y_test_full_careers==1)[0]))
#         print(idxs_sample_0s)
        print(idxs)
        eval_knn(knn_all, X_test_full_careers[idxs], y_test_full_careers[idxs])
    else:
        eval_knn(knn_all, X_test, y_test)

In [195]:
eval_end_career_knn(X_train, y_train, X_test, y_test, agg_df, train_player_ids, test_player_ids, train_years_played, test_years_played, train_zeros_ratio=0.25, test_zeros_ratio=0.1, end_careers=True)

0.17418235877106045

[1077  241 1112  920  726  860  975  148  131  760  518 1322 1219 1317
 1295  364  929  477  922  259  678  406  572  590 1256 1017  764 1126
  145 1047  326  842 1354  523  323  215 1305  739  637  439  643  465
  552 1111 1354  525  979  804  506 1083 1066 1377  631  897 1338   15
 1104  202   25 1323 1062 1015 1167 1320 1397  159  948  320  605 1207
  658 1329 1320  762 1127  250 1389  596 1231 1343  943 1192  705 1362
  910 1294  470  574  478  153  726  270   79  723 1115 1373 1083  542
  885  979  579  216   98  276 1050  286  862  903  156  424 1181 1234
  616  498  156 1379  897  878  132  625  273  498  479 1239  717  334
  181 1055  915  174   20  640  285  494  636  619 1050 1136   64   74
  133  144  180  184  245  398  450  511  706  778  837  881  916  990
 1148]
1.0
0.9710144927536232


## knn on specific year of careers

In [69]:
def eval_year_knn(yr, X_train, y_train, X_test, y_test, idx_train_by_year, idx_test_by_year, zeros_ratio=1):
    idxs_train_yr, idxs_test_yr = np.array(idx_train_by_year[yr]), np.array(idx_test_by_year[yr])
    
    X_train_yr = [X_train_unpad[i] for i in idx_train_by_year[yr]]
    y_train_yr = y_train[idx_train_by_year[yr]]
    X_test_yr = [X_test_unpad[i] for i in idx_test_by_year[yr]]
    y_test_yr = y_test[idx_test_by_year[yr]]
    
    if zeros_ratio != 1:
        np.random.seed(1)
        idxs_yr_train_0s = np.where(y_train_yr == 0)[0]
        idxs_yr_train_1s = np.where(y_train_yr == 1)[0]
        ratio_0s = np.random.choice(idxs_yr_train_0s, size=int(len(idxs_yr_train_0s) * zeros_ratio), replace=False)
        X_train_yr_ratio_0s = [X_train_yr[i] for i in ratio_0s] + [X_train_yr[i] for i in idxs_yr_train_1s]
        y_train_yr_ratio_0s = [y_train_yr[i] for i in ratio_0s] + [y_train_yr[i] for i in idxs_yr_train_1s]
    else:
        X_train_yr_ratio_0s = X_train_yr
        y_train_yr_ratio_0s = y_train_yr
    
    display(np.where(y_test_yr == 1)[0])
    
    knn_yr = KNeighborsTimeSeriesClassifier(n_neighbors=1)
    knn_yr.fit(X_train_yr_ratio_0s, y_train_yr_ratio_0s)
    
    eval_knn(knn_yr, X_test_yr, y_test_yr)

In [84]:
eval_year_knn(20, X_train, y_train, X_test, y_test, idx_train_by_year, idx_test_by_year, zeros_ratio=1)

array([ 1,  2,  4,  5,  7,  8,  9, 10, 11, 13])

0.6 0.75
