In [None]:
import os
# Change the working directory first: the relative 'data/...' and 'img/...'
# paths used later in this notebook are resolved against it.
# NOTE(review): presumably this must also run before `import megatron` so the
# package is importable from the project root — confirm.
# NOTE(review): hard-coded absolute path; this only works on a machine that has
# this exact directory.
os.chdir('/home/megatron/work')

import megatron
import pandas as pd
import numpy as np
import sqlite3

In [None]:
# Loader option 1: start from a pandas DataFrame that is already in memory.

# True  -> build `lahman_generator` (1000 is presumably the batch size — confirm);
# False -> build `lahman_data` in one piece.
generator = True

# Columns passed to megatron as `exclude_cols`.
# NOTE(review): 'stInt' may be a typo for the Lahman 'stint' column — confirm
# against the CSV header.
exclude = [
    'playerID',
    'yearID',
    'stInt',
    'teamID',
    'lgID',
]

lahman_file = 'data/lahman_csv/core/Batting.csv'
lahman = pd.read_csv(lahman_file)

if not generator:
    lahman_data = megatron.data.PandasData(lahman, exclude_cols=exclude)
else:
    lahman_generator = megatron.data.PandasGenerator(
        lahman, 1000, exclude_cols=exclude
    )

In [None]:
# Loader option 2: let megatron read the CSV file directly from disk.

# True  -> build `lahman_generator` (1000 is presumably the batch size — confirm);
# False -> build `lahman_data` in one piece.
generator = True

# Columns passed to megatron as `exclude_cols`.
exclude = [
    'playerID',
    'yearID',
    'stInt',
    'teamID',
    'lgID',
]

lahman_file = 'data/lahman_csv/core/Batting.csv'

if not generator:
    lahman_data = megatron.data.CSVData(lahman_file, exclude_cols=exclude)
else:
    lahman_generator = megatron.data.CSVGenerator(
        lahman_file, 1000, exclude_cols=exclude
    )

In [None]:
# example using database
# Copies the Batting CSV into a local SQLite table and loads it back through a
# SQL query.
generator = True

# Define `lahman_file` here, as the pandas and CSV loader cells do: the cell
# that builds the pipeline inputs calls megatron.nodes.from_csv(lahman_file, ...),
# so without this name that cell raises NameError when this is the only loader
# cell that has been run.
lahman_file = 'data/lahman_csv/core/Batting.csv'
lahman_df = pd.read_csv(lahman_file)

# The connection is left open on purpose: it is handed to the megatron SQL
# loader below.
conn = sqlite3.connect('lahman')
# Drop any table left by a previous run so that re-running this cell does not
# fail in to_sql because 'batting' already exists.
conn.execute('DROP TABLE IF EXISTS batting')
lahman_df.to_sql('batting', conn, index=False)
query = 'SELECT * FROM batting'

if generator:
    # 1000 is presumably the batch size — confirm against megatron.data.SQLGenerator.
    lahman_generator = megatron.data.SQLGenerator(conn, query, 1000)
else:
    lahman_data = megatron.data.SQLData(conn, query)

In [None]:
# Build the pipeline's input nodes from the CSV header, skipping the columns
# listed in `exclude`.
exclude = ['playerID','yearID','stInt','teamID','lgID']
inputs = megatron.nodes.from_csv(lahman_file, exclude_cols=exclude, eager=True)

# Replace NaN with 0 first so that the integer cast below does not see NaN.
fillna = megatron.layers.Impute({np.nan: 0}, name='fillna')
inputs = fillna(inputs)
# Use the builtin `int`: `np.int` was only an alias for it, was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
inputs = megatron.layers.Cast(int)(inputs)

# helpers
def single_fn(h, d, t, hr):
    """Return ``h - d - t - hr``.

    Used below as the 'Singles' Lambda layer, where it is fed the
    'H', '2B', '3B' and 'HR' columns in that order.
    """
    remaining = h - d
    remaining = remaining - t
    return remaining - hr
# Singles are derived from the H, 2B, 3B and HR columns via single_fn.
singles = megatron.layers.Lambda(single_fn, name='Singles')(
    inputs[['H', '2B', '3B', 'HR']]
)
# Order matters here: the weights [1, 2, 3, 4] passed to Dot below line up
# with singles, 2B, 3B, HR.
hit_types = megatron.layers.Concatenate('hit_types')(
    [singles] + inputs[['2B', '3B', 'HR']]
)
TB = megatron.layers.Dot(
    W=np.array([1, 2, 3, 4]),
    name='TB',
)(hit_types)

# basics
# PA: Add layer over the AB, BB, HBP, SH and SF columns.
PA = megatron.layers.Add(name='PA')(
    inputs[['AB', 'BB', 'HBP', 'SH', 'SF']]
)
# BB% and K%: the BB and SO columns, each divided by PA.
BBp = megatron.layers.Divide(name='BB%')(
    [inputs['BB'], PA]
)
Kp = megatron.layers.Divide(name='K%')(
    [inputs['SO'], PA]
)
def obp(h, bb, hbp, ab, sf):
    """Return ``(h + bb + hbp) / (ab + bb + hbp + sf)``.

    The division is delegated to ``megatron.helpers.safe_divide``
    (presumably it guards against zero denominators — confirm).
    Used below as the 'OBP' Lambda layer.
    """
    numerator = h + bb + hbp
    denominator = ab + bb + hbp + sf
    return megatron.helpers.safe_divide(numerator, denominator)
# OBP: obp() applied to the H, BB, HBP, AB and SF columns, in that order.
OBP = megatron.layers.Lambda(obp, name='OBP')(
    inputs[['H', 'BB', 'HBP', 'AB', 'SF']]
)
# SLG: TB divided by AB.
SLG = megatron.layers.Divide(name='SLG')(
    [TB, inputs['AB']]
)
# AVG: H divided by AB.
AVG = megatron.layers.Divide(name='AVG')(
    inputs[['H', 'AB']]
)
# ISO: SLG minus AVG.
ISO = megatron.layers.Subtract(name='ISO')(
    [SLG, AVG]
)
def babip(h, hr, ab, k, sf):
    """Return ``(h - hr) / (ab - k - hr + sf)``.

    The division is delegated to ``megatron.helpers.safe_divide``
    (presumably it guards against zero denominators — confirm).
    Used below as the 'BABIP' Lambda layer.
    """
    numerator = h - hr
    denominator = ab - k - hr + sf
    return megatron.helpers.safe_divide(numerator, denominator)
# BABIP: babip() applied to the H, HR, AB, SO and SF columns, in that order.
BABIP = megatron.layers.Lambda(babip, name='BABIP')(
    inputs[['H', 'HR', 'AB', 'SO', 'SF']]
)

# Collect every derived statistic, wrap them in a FeatureSet, then round all
# of them to two decimals.
outputs = [
    PA,
    BBp,
    Kp,
    OBP,
    SLG,
    AVG,
    ISO,
    BABIP,
]

outputs = megatron.nodes.FeatureSet(outputs)
outputs = megatron.layers.Lambda(np.round, decimals=2)(outputs)

In [None]:
# Assemble the pipeline from the input and output nodes built above.
P = megatron.Pipeline(inputs, outputs)

# `generator` and `lahman_generator` / `lahman_data` come from whichever
# loader cell was run last.
if not generator:
    # One-shot path: fit on the full dataset, then transform it.
    P.fit(lahman_data)
    out = P.transform(lahman_data)
else:
    # Generator path: fit from the generator, then transform to a dataframe.
    P.fit_generator(lahman_generator)
    out = P.transform_generator(lahman_generator, format='dataframe')

# The last expression of the cell is displayed as its output.
out

In [None]:
# Save an image of pipeline P to 'img/sabermetrics.png' (path is relative to
# the working directory).
megatron.visuals.pipeline_imsave(P, 'img/sabermetrics.png')
# Show the same pipeline in the notebook output.
megatron.visuals.pipeline_imshow(P)