In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
base_dir = "Data/MLB_2017/"
f = pd.read_csv(base_dir+"MLB_PitchFX_2017_RegularSeason.csv")

In [3]:
def distance(pitch_id, batter_id,balls,strikes, pitch_count, inning, c=10):
    def id_dis(id1, id2):
        return 0 if id1 == id2 else c
    def dis(x):
        sqdis = (x['balls']-balls)**2+\
        (x['strikes']-strikes)**2+\
        (x['pitch_count']-pitch_count)**2+\
        (x['inning']-inning)**2+\
        id_dis(pitch_id,x['pitcher'])+\
        id_dis(batter_id,x['batter'])
        return sqdis
    return dis
def predict(all_data,pitch_id, batter_id,balls,strikes, pitch_count,inning,
            num_of_neighbor=5, c=10):
    f_l=distance(pitch_id,batter_id,balls,strikes,pitch_count,inning,c=c)
    all_data["dis"]=all_data.apply(f_l,axis=1)
    sort_results=all_data.sort_values(by=["dis"])
    select_umpcall = sort_results[0:num_of_neighbor]["umpcall"]
    
    #count weighted states
    X = 0
    S = 0
    B = 0
    i = 0
    for state in select_umpcall:
        tmp = math.exp(-i/num_of_neighbor*3)
        if state == "X":
            X+=tmp
        elif state == "S":
            S+=tmp
        else:
            B+=tmp
        i+=1
    XBS = X+S+B
    print(sort_results[["dis","umpcall"]])
    return {"S": S/XBS,
            "B": B/XBS,
            "X": X/XBS}

In [None]:
f_l=distance("greinza01","spande01",1,1,3,1,c=10)
f["dis"]=f.apply(f_l,axis=1)

In [None]:
select_umpcall = f.sort_values(by=["dis"])[0:10]["umpcall"]
print(f.sort_values(by=["dis"])[0:10][["dis","umpcall"]])

In [None]:
print(predict(f,"holdejo02","sanchga02",1,2,16,7,num_of_neighbor=7, c=10))

# Use Sklearn to improve Performance

In [4]:
from sklearn.neighbors import NearestNeighbors
f.columns.values
label2one = {'B':[1,0,0],'S':[0,1,0],'X':[0,0,1]}
one2label = {0:'B', 1:'S', 2:'X'}

In [5]:
def normalize(data):
    mu = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    return mu, std, (data-mu)/std
import random
def dis(x,y):
    return random.randrange(1,20)

In [7]:
# one-hot encoder
from sklearn import preprocessing
pitcher_batter = f[["pitcher","batter"]].fillna("-").as_matrix()
pitcher_label_enc = preprocessing.LabelEncoder()
batter_label_enc = preprocessing.LabelEncoder()
result = np.array([pitcher_label_enc.fit_transform(pitcher_batter[:,0]),
                batter_label_enc.fit_transform(pitcher_batter[:,1])])
enc = preprocessing.OneHotEncoder(sparse = False)
after_encoded = enc.fit_transform(result.transpose())
train_X = np.append(after_encoded,f[["balls","strikes","pitch_count","inning"]],axis=1)
train_y = f['umpcall']
from sklearn.utils import shuffle
data_x, data_y = shuffle(train_X, train_y.as_matrix())

In [None]:
train_x = f[['x0','x','y','ax','ay','az','px','pz','sz_top','sz_bot',
             'vx0','vy0','vz0','pfx_x','z0','start_speed','end_speed',
             'break_y','break_angle','break_length','spin_dir','spin_rate']]
train_y = f['umpcall']

data_x = train_x.as_matrix()
data_y = train_y.as_matrix()
from sklearn.utils import shuffle
data_x, data_y = shuffle(data_x, data_y)

In [8]:
from sklearn.model_selection import train_test_split
train_x, dev_x, train_y, dev_y = train_test_split(data_x, data_y, test_size=0.1, random_state=0)

In [9]:
from sklearn.base import BaseEstimator, ClassifierMixin
class kNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, **kwargs):
        self.model = NearestNeighbors(**kwargs)
    def fit(self, X, y):
#         self.mu, self.std, self.X = normalize(X)
        self.X = X
        self.y = y
        self.model.fit(self.X)
        
    def predict(self, dev_x):
        dist, indx = self.model.kneighbors(dev_x)
        preds = []
        for jj in range(len(indx)):
            dis = {}
            ctr=0
            for i,e in enumerate(indx[jj]):
                ctr += 1/dist[jj][i]**2
                if self.y[e] not in dis:
                    dis[self.y[e]] = 1/dist[jj][i]**2
                else:
                    dis[self.y[e]] += 1/dist[jj][i]**2
            themax = -1
            thekey = []
            for k in dis.keys():
                if dis[k] > themax:
                    themax = dis[k]
                    thekey = k
            preds.append(thekey)
        return preds
    def score(self, X, y=None):
        pred = self.predict(X)
        if y is None:
            return 0
        return np.sum(pred == y) / len(y)

In [10]:
model = kNNClassifier(n_neighbors=20, algorithm='auto', n_jobs=4)

In [11]:
model.fit(train_x, train_y)

In [13]:
model.score(dev_x,dev_y)



0.42654498700282306