In [None]:
import pandas as pd
import datarobot as dr
import mlb_pull_year as mlb
import requests
import os
from pprint import pprint 

API_TOKEN = os.getenv('DATAROBOT_API_TOKEN')
ENDPOINT = os.getenv('DATAROBOT_ENDPOINT')
USERNAME = 'matthew.cohen@datarobot.com'
dr.Client(endpoint=ENDPOINT, token=API_TOKEN)

# Get the baseball project
PROJECT_ID = '5bdb7caa7c6f8b71e0428016'
project = dr.Project.get(PROJECT_ID)

# ...or create it anew
def create_baseball():
    # Read source data
    pitches_train = pd.read_csv('pitch_scoring.csv', parse_dates=['date'])
    print('Source data shape:', pitches_train.shape)
    # pitches_train.head()
    
    # Create the project in the DataRobot Cloud
    print('Creating project')
    project = dr.Project.create(sourcedata=pitches_train, project_name='Baseball pitch prediction')
    
    # Set target starts autopilog
    print('Running autopilot')
    project.set_target(target='strike', mode='auto', worker_count=20)
    
    # Block until complete
    print('Waiting to complete')
    project.wait_for_autopilot()
    
    print('Done.')
    return project

project = create_baseball()

Source data shape: (1090935, 33)
Creating project
Setting target


In [4]:
def get_pitches_day(year, month, day):
    # 
    # Get the pitch data for a day
    #
    pitches = mlb.read_yearmonth(year, month, day)  # omits the 'strike' feature
    print('\nNum pitches:', len(pitches))
    
    # 
    # Edit the columns for the received day's pitches to match the training data columns 
    #
    # Get features from the daily pitch data
    pitches_today = pd.DataFrame(pitches)
    all_pitch_cols = pitches_today.columns.sort_values().tolist()
    print("day's pitch data:", pitches_today.shape)

    # Get the training data raw features from the project
    # cols_train = pitches_train.columns.sort_values()
    fl = [fl for fl in project.get_featurelists() if fl.name == 'Raw Features'][0]
    cols_train = fl.features

    cols_to_drop = [feat for feat in all_pitch_cols if feat not in cols_train]
    pitches_today = pitches_today.drop(cols_to_drop, axis=1)
    cols_pred = pitches_today.columns.tolist()

    print('pitches_pred columns len:', len(cols_pred))
    print('pitches_train columns len:', len(cols_train))
    
    return pitches_today

pitches_today = get_pitches_day(2018, 4, 1) 
pitches_today.shape

2018-04-01
http://gd2.mlb.com/components/game/mlb/year_2018/month_04/day_01/gid_2018_04_01_anamlb_oakmlb_1/
ERR: New pitcher Daniel Gossett not a reserve
WARN: Mismatched pitcher names: Daniel Gossett and Yusmeiro Petit for 605254, 433589
http://gd2.mlb.com/components/game/mlb/year_2018/month_04/day_01/gid_2018_04_01_bosmlb_tbamlb_1/
ERR: New pitcher Jake Faria not a reserve
WARN: Mismatched pitcher names: Jake Faria and Jose Alvarado for 607188, 621237
http://gd2.mlb.com/components/game/mlb/year_2018/month_04/day_01/gid_2018_04_01_chamlb_kcamlb_1/
WARN: Couldn't find inning directory at http://gd2.mlb.com/components/game/mlb/year_2018/month_04/day_01/gid_2018_04_01_chamlb_kcamlb_1/
http://gd2.mlb.com/components/game/mlb/year_2018/month_04/day_01/gid_2018_04_01_chnmlb_miamlb_1/
http://gd2.mlb.com/components/game/mlb/year_2018/month_04/day_01/gid_2018_04_01_clemlb_seamlb_1/
http://gd2.mlb.com/components/game/mlb/year_2018/month_04/day_01/gid_2018_04_01_houmlb_texmlb_1/
http://gd2.mlb.co

(3486, 32)

In [7]:
#
# Score the day's pitch data on the deployment
#
DEPLOYMENT_ID = '5bdf672f7c6f8b2939428077'  # My project's recommended model: XGBoost @ 80%

# Need to write df to file then read back in to get the request.post to work.  Not ideal.
pred_file = 'pitch_pred.csv'
pitches_today.to_csv(pred_file)
data = open(pred_file, 'rb').read()  # This works.  This is type bytes: print(type(data))
# print(data)
print('pred file shape:', pitches_today.shape)

headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
predictions_response = requests.post('https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/%s/predictions' % (DEPLOYMENT_ID),
                                     auth=(USERNAME, API_TOKEN), data=data, headers=headers)

predictions_response.raise_for_status()
df = pd.DataFrame(predictions_response.json().get('data'))

# Flatten nested label/value dict via apply 
df['label1'] = None
df['value1'] = None
df['label2'] = None
df['value2'] = None
def func(row):
    for i, pair in enumerate(row['predictionValues']):
        name = pair.get('label')
        val = pair.get('value')
        col_name_n = 'label' + str(i+1)
        col_val_n = 'value' + str(i+1)
        row[col_name_n] = name
        row[col_val_n] = val
    return row
df_pred_out = df.apply(lambda row: func(row), axis=1)
del df_pred_out['predictionValues']
print('pred out file shape:',df_pred_out.shape)

df_pred_out.head()

pred file shape: (3486, 32)
pred out file shape: (3486, 7)


Unnamed: 0,prediction,predictionThreshold,rowId,label1,value1,label2,value2
0,0.0,0.5,0,1.0,1.734e-07,0.0,1.0
1,0.0,0.5,1,1.0,0.04232164,0.0,0.957678
2,1.0,0.5,2,1.0,0.7049791,0.0,0.295021
3,0.0,0.5,3,1.0,2.3803e-06,0.0,0.999998
4,1.0,0.5,4,1.0,0.9949994,0.0,0.005001


## Dev

In [59]:
# Use the deployed model for predictions
DEPLOYMENT_ID = '5bdb94127c6f8b72d54280b3'
# MODEl_ID = '5bdb8a79c8c82a1395d4a9c3'  # Light GMB 64%

ROUTE = 'https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0'
USERNAME = 'matthew.cohen@datarobot.com'

# Set HTTP headers
# Note: The charset should match the contents of the file.
headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
# headers = {'Content-Type': 'text/plain; charset=UTF-8', 'Authorization': 'token %s' % API_TOKEN}

headers2 = {'Content-Type': 'application/json', 'Authorization': 'token %s' % API_TOKEN}
headers3 = {'Content-Type': 'application/json', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}

# health_response = requests.get('%s/modelDeployments/%s/' % (ENDPOINT, DEPLOYMENT_ID), headers=headers2)
# pprint(health_response.json())

# data = open(sys.argv[1], 'rb').read()
sub_sample = pitches_pred.sample(5)
data = sub_sample.to_json()
data
# data = pitches_pred

# Make predictions on your data
# The URL has the following format:
#     https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/<DEPLOYMENT_ID>/predictions
# See docs for details:
#     app.datarobot.com/docs/users-guide/deploy/api/new-prediction-api.html
predictions_response = requests.post('%s/deployments/%s/predictions' % (ROUTE, DEPLOYMENT_ID),
                                     auth=(USERNAME, API_TOKEN), 
                                     data=data, headers=headers)

# predictions_response.raise_for_status()
# print(predictions_response.json())
# data
predictions_response

<Response [422]>

<Response [401]>