# Simple analysis: compare deviations for different pairs

In [62]:
%matplotlib inline
import itertools
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Deviation for games in the same season between the same teams 

In [5]:
df.columns.values

array(['Unnamed: 0', 'AST', 'AST_PCT', 'AST_RATIO', 'AST_TOV', 'CFGA',
       'CFGM', 'CFG_PCT', 'DEF_RATING', 'DFGA', 'DFGM', 'DFG_PCT', 'DIST',
       'DRBC', 'DREB_PCT', 'EFG_PCT', 'FG_PCT', 'FTAST', 'FT_AST',
       'FT_BLK', 'FT_Best_PTS_Player', 'FT_DREB', 'FT_FG2A', 'FT_FG2M',
       'FT_FG3A', 'FT_FG3M', 'FT_FTA', 'FT_FTM', 'FT_OREB', 'FT_PF',
       'FT_PTS', 'FT_STL', 'FT_Second_Best_PTS_Player', 'FT_TO', 'HT_AST',
       'HT_BLK', 'HT_Best_PTS_Player', 'HT_DREB', 'HT_FG2A', 'HT_FG2M',
       'HT_FG3A', 'HT_FG3M', 'HT_FTA', 'HT_FTM', 'HT_OREB', 'HT_PF',
       'HT_PTS', 'HT_STL', 'HT_Second_Best_PTS_Player', 'HT_TO',
       'NET_RATING', 'OFF_RATING', 'ORBC', 'OREB_PCT', 'PACE', 'PASS',
       'PIE', 'RBC', 'REB_PCT', 'SAST', 'TCHS', 'TM_TOV_PCT', 'TS_PCT',
       'UFGA', 'UFGM', 'UFG_PCT', 'USG_PCT', 'GAME_ID', 'HOME', 'tot',
       'TEAM', 'AST_opp', 'AST_PCT_opp', 'AST_RATIO_opp', 'AST_TOV_opp',
       'CFGA_opp', 'CFGM_opp', 'CFG_PCT_opp', 'DEF_RATING_opp',
       'DFGA_o

In [8]:
sc = StandardScaler()
df = pd.read_csv('../../teams.csv')

df['season'] = (df.GAME_ID/100000).map(int)
df = df.loc[df.season<217]

col_groupby = ['TEAM','TEAM_opp','season']
# df.groupby(col_groupby)[['FT_PTS','FT_PTS_opp']].describe() 
df.groupby(col_groupby)['FT_PTS'].std().describe()

count    3472.000000
mean        0.753891
std         0.464447
min         0.000000
25%         0.421763
50%         0.697248
75%         1.036611
max         3.072843
Name: FT_PTS, dtype: float64

# Average deviation accros all games per season

In [3]:
df.groupby('season').FT_PTS.std()

season
213    11.524956
214    11.372010
215    11.400446
216    11.859953
Name: FT_PTS, dtype: float64

# Average deviation accros all games per season for the same team and all opponents


In [4]:
col_groupby = ['TEAM','season']
# df.groupby(col_groupby)[['FT_PTS','FT_PTS_opp']].describe() 
df.groupby(col_groupby)['FT_PTS'].std().describe()

count    120.000000
mean      10.814463
std        1.050968
min        8.514976
25%       10.067460
50%       10.742682
75%       11.432637
max       14.031394
Name: FT_PTS, dtype: float64

# Build syamease network that will learn to recognise similar pairs

In [9]:
cols = df.columns.difference(['TEAM','TEAM_opp','season','HOME','tot','Unnamed: 0'])
df[cols] = sc.fit_transform(df[cols])

In [104]:
from keras.layers import Input, Conv2D, Lambda, merge, Dense, Flatten,MaxPooling2D, Dot, add, Dropout, BatchNormalization
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.models import load_model

In [102]:
def gen_model():
    input_shape = (133,)
    p = 0.1
    left_input = Input(input_shape)
    right_input = Input(input_shape)

    encoder = Sequential()
    encoder.add(Dense(128))
    encoder.add(BatchNormalization())
    encoder.add(Dropout(p))
    encoder.add(Dense(64))
    encoder.add(BatchNormalization())
    encoder.add(Dropout(p))
    encoder.add(Dense(16))

    encoded_l = encoder(left_input)
    encoded_r = encoder(right_input)

    cosine = Dot(1, normalize=True)([encoded_l, encoded_r])

    prediction = Dense(1,activation='sigmoid')(cosine) # add this line won't give me any error
    siamese_net = Model(inputs=[left_input,right_input],outputs=prediction)
    optimizer = Adam(lr = 0.0001)
    siamese_net.compile(loss="binary_crossentropy",optimizer=optimizer,metrics=["accuracy"])
    
    return siamese_net

# Build train set

In [112]:
col_groupby = ['TEAM','TEAM_opp','season']
grouped = df.groupby(col_groupby)
left = []
right = []
label = []
df_out = []
for info,g_ in tqdm(grouped):
    indexes = list(range(g_.shape[0]))
    team = info[0]
    team_opp = info[1]
    season = info[2]
    this_teams = [team, team_opp]
    if len(indexes)<2:
        continue
    
    g = g_.copy().reset_index(drop=True)
    combinations = all_possible_subsets(indexes)
    n_comb = len(combinations)    
    df_counter_examples_right = df.loc[((-df.TEAM.isin(this_teams)) | (-df.TEAM_opp.isin(this_teams))) & (df.season==season)].sample(n_comb)
    df_counter_examples_left = g.sample(n_comb, replace=True).reset_index(drop=True)
    
    counter = 0
    for pair in combinations:
        left.append(g.iloc[pair[0]][cols])
        right.append(g.iloc[pair[1]][cols])
        label.append(1)
        df_out.append({'team_left': team, 'team_opp_left':team_opp,'team_right': team,
                       'team_opp_right':team_opp,'season':season, 'label':1 })
        #### False examples ####
        left.append(df_counter_examples_left.iloc[counter][cols])
        right.append(df_counter_examples_right.iloc[counter][cols])
        label.append(0)
        df_out.append({'team_left': team, 'team_opp_left':team_opp,'team_right': df_counter_examples_right.iloc[counter].TEAM,
                       'team_opp_right':df_counter_examples_right.iloc[counter].TEAM_opp,'season':season, 'label':0 })
        #### False examples ####
        
        counter += 1


  0%|          | 0/3480 [00:00<?, ?it/s][A
  0%|          | 4/3480 [00:00<01:38, 35.11it/s][A
  0%|          | 9/3480 [00:00<01:33, 37.10it/s][A
  0%|          | 13/3480 [00:00<01:34, 36.84it/s][A
  1%|          | 18/3480 [00:00<01:30, 38.36it/s][A
  1%|          | 27/3480 [00:00<01:15, 45.76it/s][A
  1%|          | 32/3480 [00:00<01:15, 45.44it/s][A
  1%|          | 42/3480 [00:00<01:04, 53.72it/s][A
  1%|▏         | 50/3480 [00:00<00:58, 58.84it/s][A
  2%|▏         | 58/3480 [00:01<00:54, 62.54it/s][A
  2%|▏         | 65/3480 [00:01<01:01, 55.83it/s][A
  2%|▏         | 74/3480 [00:01<00:55, 61.80it/s][A
  2%|▏         | 81/3480 [00:01<00:55, 61.01it/s][A
  3%|▎         | 88/3480 [00:01<01:04, 52.62it/s][A
  3%|▎         | 99/3480 [00:01<00:54, 61.72it/s][A
  3%|▎         | 107/3480 [00:01<00:55, 60.70it/s][A
  3%|▎         | 114/3480 [00:01<00:56, 59.82it/s][A
  3%|▎         | 121/3480 [00:02<01:05, 51.37it/s][A
  4%|▎         | 127/3480 [00:02<01:12, 46.17it/s][A

 33%|███▎      | 1154/3480 [00:19<00:44, 51.96it/s][A
 33%|███▎      | 1162/3480 [00:19<00:39, 57.96it/s][A
 34%|███▎      | 1174/3480 [00:19<00:33, 67.93it/s][A
 34%|███▍      | 1185/3480 [00:20<00:30, 74.07it/s][A
 34%|███▍      | 1194/3480 [00:20<00:34, 66.00it/s][A
 35%|███▍      | 1202/3480 [00:20<00:34, 66.17it/s][A
 35%|███▍      | 1210/3480 [00:20<00:37, 61.11it/s][A
 35%|███▍      | 1217/3480 [00:20<00:41, 55.11it/s][A
 35%|███▌      | 1226/3480 [00:20<00:37, 60.66it/s][A
 35%|███▌      | 1233/3480 [00:20<00:41, 54.48it/s][A
 36%|███▌      | 1240/3480 [00:21<00:40, 55.03it/s][A
 36%|███▌      | 1250/3480 [00:21<00:36, 61.26it/s][A
 36%|███▌      | 1257/3480 [00:21<00:41, 53.77it/s][A
 36%|███▋      | 1263/3480 [00:21<00:46, 47.22it/s][A
 36%|███▋      | 1270/3480 [00:21<00:43, 50.57it/s][A
 37%|███▋      | 1277/3480 [00:21<00:41, 53.11it/s][A
 37%|███▋      | 1283/3480 [00:21<00:44, 49.29it/s][A
 37%|███▋      | 1289/3480 [00:22<00:48, 45.18it/s][A
 37%|███▋ 

 68%|██████▊   | 2363/3480 [00:39<00:18, 59.28it/s][A
 68%|██████▊   | 2370/3480 [00:39<00:18, 61.02it/s][A
 68%|██████▊   | 2377/3480 [00:39<00:21, 50.48it/s][A
 68%|██████▊   | 2383/3480 [00:39<00:21, 49.95it/s][A
 69%|██████▊   | 2390/3480 [00:40<00:34, 31.53it/s][A
 69%|██████▉   | 2395/3480 [00:40<00:31, 34.39it/s][A
 69%|██████▉   | 2405/3480 [00:40<00:25, 42.60it/s][A
 69%|██████▉   | 2412/3480 [00:40<00:23, 46.10it/s][A
 69%|██████▉   | 2418/3480 [00:40<00:25, 41.25it/s][A
 70%|██████▉   | 2424/3480 [00:40<00:25, 42.04it/s][A
 70%|██████▉   | 2431/3480 [00:40<00:22, 45.78it/s][A
 70%|███████   | 2438/3480 [00:40<00:21, 48.32it/s][A
 70%|███████   | 2444/3480 [00:41<00:22, 45.10it/s][A
 70%|███████   | 2449/3480 [00:41<00:24, 42.13it/s][A
 71%|███████   | 2454/3480 [00:41<00:25, 40.51it/s][A
 71%|███████   | 2459/3480 [00:41<00:25, 39.58it/s][A
 71%|███████   | 2469/3480 [00:41<00:21, 48.10it/s][A
 71%|███████   | 2475/3480 [00:41<00:19, 50.95it/s][A
 71%|█████

In [113]:
siamese_net = gen_model()

batch_size = 32
siamese_net.fit(x=[left,right],y=label,batch_size=batch_size)
optimizer = Adam(lr = 0.01)
siamese_net.compile(loss="binary_crossentropy",optimizer=optimizer,metrics=["accuracy"])
siamese_net.fit(x=[left,right],y=label,batch_size=batch_size, validation_split=0.1, shuffle=True, epochs=50)



Epoch 1/1
Train on 18745 samples, validate on 2083 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a782a09b0>

In [114]:
siamese_net.save('model_v100.h5')
model2 = load_model('model_v100.h5')

In [116]:
model2.layers.pop()

<keras.layers.core.Dense at 0x1a79cd1ba8>

In [117]:
similarity = model2.predict(x=[right,left])

In [121]:
data = pd.DataFrame.from_records(df_out)
data['similarity'] = similarity

In [125]:
data.head(10)

Unnamed: 0,label,season,team_left,team_opp_left,team_opp_right,team_right,similarity
0,1,213,ATL,BKN,BKN,ATL,0.759822
1,0,213,ATL,BKN,CHA,IND,0.011165
2,1,213,ATL,BKN,BKN,ATL,0.821474
3,0,213,ATL,BKN,CHA,PHX,0.493688
4,1,213,ATL,BKN,BKN,ATL,0.829702
5,0,213,ATL,BKN,LAL,IND,0.008566
6,1,214,ATL,BKN,BKN,ATL,0.778614
7,0,214,ATL,BKN,SAS,CHI,0.649286
8,1,214,ATL,BKN,BKN,ATL,0.808475
9,0,214,ATL,BKN,SAS,DEN,0.031215


In [129]:
data.groupby(['label','season'])['similarity'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
label,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,213,2608.0,0.296434,0.250103,0.002015,0.073675,0.225352,0.483442,0.921685
0,214,2588.0,0.298269,0.25252,0.00207,0.078281,0.221881,0.477545,0.912189
0,215,2610.0,0.300345,0.252284,0.001705,0.080428,0.224459,0.497075,0.915914
0,216,2608.0,0.309449,0.253474,0.002757,0.085129,0.238996,0.511256,0.91458
1,213,2608.0,0.661781,0.210839,0.002881,0.542567,0.72418,0.828764,0.923495
1,214,2588.0,0.659972,0.21023,0.010993,0.539719,0.723177,0.825363,0.925506
1,215,2610.0,0.655938,0.216235,0.015748,0.528806,0.722864,0.830092,0.923847
1,216,2608.0,0.650326,0.221194,0.011357,0.511043,0.715977,0.834046,0.926462


In [128]:
data.loc[(data.similarity>data.similarity.quantile(0.95)) & (data.label!=1)]

Unnamed: 0,label,season,team_left,team_opp_left,team_opp_right,team_right,similarity
109,0,216,ATL,CHA,NOP,LAL,0.894548
187,0,216,ATL,CLE,BOS,POR,0.885740
283,0,214,ATL,IND,ATL,BOS,0.897215
353,0,214,ATL,MIA,GSW,ATL,0.887917
441,0,213,ATL,NYK,GSW,MIL,0.900070
461,0,216,ATL,NYK,SAS,LAL,0.892202
1811,0,213,BOS,NYK,BKN,ORL,0.900684
2761,0,215,CHA,WAS,NOP,CHA,0.892969
3019,0,214,CHI,DET,NOP,SAS,0.908155
3139,0,213,CHI,MIA,NOP,MEM,0.903475
