# Predict Cy Young Winners

In [0]:
import requests
import pandas as pd
import numpy as np
import lxml.html as lh
import tensorflow as tf
import keras

**Define a function to scrape URL from MLB.com**

In [0]:
def scrape_cy(url):
    # create a handle to handle the contents of the website
    page = requests.get(url)
    # obtain the content
    doc = lh.fromstring(page.content)
    # parse data between <tr> </tr>
    tr_elements = doc.xpath('//tr')
    # define header
    header = []
    for x in tr_elements[0].iterchildren():
        header.append(x.text_content())
    # obtain table content
    tab = []
    for i in range(1, len(tr_elements)):
        out = []
        for x in tr_elements[i].iterchildren():
            out.append(x.text_content())
        tab.append(out)
    return pd.DataFrame(tab, columns = header)

### Process Cy Young Winners

In [0]:
url_al = "http://m.mlb.com/awards/history-winners/?award_id=ALCY"
url_nl = "http://m.mlb.com/awards/history-winners/?award_id=NLCY"

In [0]:
al_cy = scrape_cy(url_al)
nl_cy = scrape_cy(url_nl)

In [0]:
all_cy = al_cy.append(nl_cy).drop(['Team', 'Position'], axis = 1).rename(columns = {'Year': 'Season', 'Player': 'Name'},)
all_cy['winner'] = 1

**Define a function to scrape Fangraphs data**
(Based on pybaseball code)

In [0]:
from bs4 import BeautifulSoup
def scrape_fangraphs(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "lxml")
    table = soup.find('table', {'class': 'rgMasterTable'})
    
    data = []
    # pull headings
    headings = []
    headingrows = table.find_all('th')
    for row in headingrows[1:]:
        headings.append(row.text.strip())
    data.append(headings)
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [x.text.strip() for x in cols]
        data.append([x for x in cols[1:]])
    
    data = pd.DataFrame(data)
    data = data.rename(columns = data.iloc[0])
    data = data.reindex(data.index.drop(0))
    # replace empty strings with NaN
    data.replace(r'^\s*$', np.nan, regex=True, inplace = True)

    # convert all percent strings to proper percetages
    percentages = [headings[i] for i in np.where([x.find('%') > -1 for x in headings])[0]]
    if len(percentages) > 0:
        for col in percentages:
            # skip if column is all NA (happens for some of the more obscure stats + in older seasons)
            if not data[col].empty:
                if pd.api.types.is_string_dtype(data[col]):
                    data[col] = data[col].astype(str).str.strip(' %')
                    data[col] = data[col].astype(str).str.strip('%')
                    data[col] = [float(x)/100 if x != 'None' else np.nan for x in data[col]]
                else:
                    pass

    #convert everything except name and team to numeric
    cols_to_numeric = [col for col in data.columns if col not in ['Name', 'Team', 'Season']]
    data[cols_to_numeric] = data[cols_to_numeric].astype(float)
    return data

### Process Fangraphs Data

In [0]:
url = 'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=160&type=0&season=2019&month=0&season1=2009&ind=1&team=0&rost=0&age=0&filter=&players=0&startdate=1956-01-01&enddate=2019-12-31&page=1_100000'
standard = scrape_fangraphs(url)

In [0]:
url = 'https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all&qual=160&type=1&season=2019&month=0&season1=2009&ind=1&team=0&rost=0&age=0&filter=&players=0&startdate=1956-01-01&enddate=2019-12-31&page=1_100000'
advanced = scrape_fangraphs(url)

In [0]:
cols = [x for x in advanced.columns if x not in standard.columns or x in ['Season', 'Name', 'Team']]

In [0]:
all_stats = pd.merge(standard, advanced[cols], on = ['Season', 'Name', 'Team'], how = 'left')

### Create Full Data

In [0]:
data = pd.merge(all_stats, all_cy, on = ['Season', 'Name'], how = 'left')
data['winner'].fillna(0, inplace = True)
# data.to_csv("/Users/ming-senwang/Dropbox/my-git/predict-cy/full-data.csv", index = False)

**Run the code from below if offline**

In [0]:
# data = pd.read_csv("/Users/ming-senwang/Dropbox/my-git/predict-cy/full-data.csv")

**Feature Engineering**
Create contextual features by season

Difference with Season Average

In [0]:
context = data[['Season', 'W', 'L', 'ERA', 'G', 'GS', 'CG', 'ShO', 'SV',
       'IP', 'TBF', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'HBP',
       'WP', 'BK', 'SO', 'K/9', 'BB/9', 'K/BB', 'HR/9', 'K%', 'BB%', 'K-BB%',
       'AVG', 'WHIP', 'BABIP', 'LOB%',]].groupby('Season').apply(lambda x: x - np.mean(x))
context.columns = [x + "_demean" for x in context.columns]

In [0]:
data = pd.concat([data, context], axis = 1)
data['Season'] = [int(x) for x in data['Season']]
data = data[data['Season'] >= 2009]

In [108]:
data[data['winner'] == 1]['IP'].min()

180.2

In [109]:
np.sqrt(data[data['winner'] == 1]['IP'].var())

18.53238985355668

Filter data by Innings Pitched.

In [0]:
data = data[data['IP']>= 160]

**Split Prediction and Development Data**

In [0]:
df = data[data['Season'] != 2019]
pred_df = data[data['Season'] == 2019].drop(['winner'], axis = 1)

In [0]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_x = scaler.fit_transform(df.drop(['Season', 'Name', 'Team', 'winner'], axis = 1).values)
train_y = df['winner']

### Create Model Constructor

In [0]:
from keras import models, layers
from keras.backend import clear_session, set_session

def model_constructor(n, p, layer):
    clear_session()
    model = models.Sequential()
    for i in range(layer):
        model.add(layers.Dense(n, activation = 'relu', input_shape = (train_x.shape[1], )))
        model.add(layers.Dropout(p))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

In [0]:
season = df['Season']
season_ix = pd.unique(df['Season'])

### Test model_constructor()

In [115]:
clear_session()
model = models.Sequential()
model.add(layers.Dense(2, activation = 'relu', input_shape = (train_x.shape[1], )))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2)                 144       
_________________________________________________________________
dropout_1 (Dropout)          (None, 2)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 3         
Total params: 147
Trainable params: 147
Non-trainable params: 0
_________________________________________________________________


In [116]:
model = model_constructor(2, 0.2, 1)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2)                 144       
_________________________________________________________________
dropout_1 (Dropout)          (None, 2)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 3         
Total params: 147
Trainable params: 147
Non-trainable params: 0
_________________________________________________________________


In [0]:
import altair as alt    
#alt.renderers.enable('notebook')

# Cross Validation to Choose the Parameters

In [0]:
from sklearn.metrics import f1_score, precision_score, recall_score
def model_cv(n, p, layer):
    out = []
    for six in season_ix:
        x_train = train_x[season != six]
        y_train = train_y[season != six]

        x_test = train_x[season == six]
        y_test = train_y[season == six]
    
        model = model_constructor(n, p, layer)

        model.fit(x_train, y_train,
                  epochs = 20,
                  batch_size = 8,
                  verbose = 0)
    
        scores = model.predict(x_test)
        # the top 2 scores are the winner
        ix = np.argsort([-1 * x for x in scores.T])
        y_pred = [1 if x <= 5 else 0 for x in ix[0]]
        
        # scores above 0.01 are the winners 
        # y_pred = [1 if x > 0.01 else 0 for x in scores]
        out.append(precision_score(y_test, y_pred))
    return(out)

In [119]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [120]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0



['/job:localhost/replica:0/task:0/device:GPU:0']

In [121]:
K.set_session(sess)
out = []
for i in [8, 16, 64, 128,]:
    for d in [0, 0.2, 0.4]:
        for l in [1, 2, 3, 4]:
            print([i, d, l])
            out.append([i, d, l, np.mean(model_cv(i, d, l))]) 

[8, 0, 1]
[8, 0, 2]
[8, 0, 3]
[8, 0, 4]
[8, 0.2, 1]
[8, 0.2, 2]
[8, 0.2, 3]
[8, 0.2, 4]
[8, 0.4, 1]
[8, 0.4, 2]
[8, 0.4, 3]
[8, 0.4, 4]
[16, 0, 1]
[16, 0, 2]
[16, 0, 3]
[16, 0, 4]
[16, 0.2, 1]
[16, 0.2, 2]
[16, 0.2, 3]
[16, 0.2, 4]
[16, 0.4, 1]
[16, 0.4, 2]
[16, 0.4, 3]
[16, 0.4, 4]
[64, 0, 1]
[64, 0, 2]
[64, 0, 3]
[64, 0, 4]
[64, 0.2, 1]
[64, 0.2, 2]
[64, 0.2, 3]
[64, 0.2, 4]
[64, 0.4, 1]
[64, 0.4, 2]
[64, 0.4, 3]
[64, 0.4, 4]
[128, 0, 1]
[128, 0, 2]
[128, 0, 3]
[128, 0, 4]
[128, 0.2, 1]
[128, 0.2, 2]
[128, 0.2, 3]
[128, 0.2, 4]
[128, 0.4, 1]
[128, 0.4, 2]
[128, 0.4, 3]
[128, 0.4, 4]


In [122]:
pd.DataFrame(out, columns = ['knots', 'dropout', 'layers', 'precision']).sort_values('precision', ascending = False)

Unnamed: 0,knots,dropout,layers,precision
25,64,0.0,2,0.266667
3,8,0.0,4,0.233333
33,64,0.4,2,0.233333
41,128,0.2,2,0.233333
27,64,0.0,4,0.233333
20,16,0.4,1,0.233333
32,64,0.4,1,0.216667
30,64,0.2,3,0.216667
24,64,0.0,1,0.216667
17,16,0.2,2,0.216667


### 2019 Prediction

In [123]:
model = models.Sequential()
model.add(layers.Dense(128, activation = 'relu', input_shape = (train_x.shape[1], )))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(128, activation = 'relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(128, activation = 'relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(128, activation = 'relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
    
model.fit(train_x, train_y,
          epochs = 20,
          batch_size = 8,)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f7636743eb8>

In [124]:
from keras.backend import clear_session
x_pred = scaler.transform(pred_df.drop(['Season', 'Name', 'Team', ], axis = 1).values)
rep = []
for i in range(40):
    print(i)
    clear_session()
    model = models.Sequential()
    model.add(layers.Dense(128, activation = 'relu', input_shape = (train_x.shape[1], )))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(128, activation = 'relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(128, activation = 'relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(128, activation = 'relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
    
    model.fit(train_x, train_y,
          epochs = 20,
          batch_size = 8,
          verbose = 0)
    rep.append(model.predict(x_pred).T.tolist()[0])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [0]:
scores = pd.DataFrame(rep).mean().tolist()

In [126]:
src = pd.concat(
    [pred_df['Name'].reset_index(drop = True), pd.Series(scores) * 100], axis=1
).rename(columns = {
    'Name': 'Players',
    0: 'Scores'
}).sort_values('Scores', ascending = False)

src[src['Scores'] >= 1]

Unnamed: 0,Players,Scores
3,Justin Verlander,35.151007
2,Gerrit Cole,24.350374
1,Jacob deGrom,21.79558
5,Jack Flaherty,2.310925
15,Stephen Strasburg,1.981445
18,Lucas Giolito,1.720283
8,Zack Greinke,1.388279
