In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

Read and inspect code

In [2]:
train_data = pd.read_csv('ncaa/train.csv')
eval_data = pd.read_csv('ncaa/eval.csv')
test_data = pd.read_csv('ncaa/test.csv')
train_data.head()

Unnamed: 0,season,label,seed,school_ncaa,opp_seed,opp_ncaa
0,1994,0,5,Indiana,9,Boston College
1,2007,0,10,Texas Tech,7,Boston College
2,2001,0,14,Southern Utah,3,Boston College
3,1994,0,8,Washington St.,9,Boston College
4,1994,0,1,North Carolina,9,Boston College


In [3]:
train_data.describe()

Unnamed: 0,season,label,seed,opp_seed
count,3966.0,3966.0,3966.0,3966.0
mean,2000.15885,0.5,6.795512,6.795512
std,8.98352,0.500063,4.608265,4.608265
min,1985.0,0.0,1.0,1.0
25%,1992.0,0.0,3.0,3.0
50%,2000.0,0.5,6.0,6.0
75%,2008.0,1.0,11.0,11.0
max,2015.0,1.0,16.0,16.0


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3966 entries, 0 to 3965
Data columns (total 6 columns):
season         3966 non-null int64
label          3966 non-null int64
seed           3966 non-null int64
school_ncaa    3966 non-null object
opp_seed       3966 non-null int64
opp_ncaa       3966 non-null object
dtypes: int64(4), object(2)
memory usage: 186.0+ KB


Set up feature columns:

In [5]:
seed = tf.feature_column.numeric_column(key='seed')

In [6]:
opp_seed = tf.feature_column.numeric_column(key='opp_seed')

In [7]:
vocabulary = pd.read_csv('school_vocabulary.csv',header=None)
#vocabulary.head()

In [8]:
vocabulary_list = [x for x in vocabulary[1]]
# vocabulary_list

In [9]:
school_ncaa = tf.feature_column.categorical_column_with_vocabulary_list(
    key='school_ncaa',
    vocabulary_list=vocabulary_list,
    default_value=0)

In [10]:
opp_ncaa = tf.feature_column.categorical_column_with_vocabulary_list(
    key='opp_ncaa',
    vocabulary_list=vocabulary_list,
    default_value=0)

In [11]:
feature_columns = [seed, school_ncaa, opp_seed,  opp_ncaa]
#feature_columns

Set up features and labels:

In [12]:
feature_names = ['seed', 'school_ncaa', 'opp_seed','opp_ncaa']
label_name = 'label'

In [13]:
train_features = train_data[feature_names]
train_labels = train_data[label_name]

In [14]:
eval_features = eval_data[feature_names]
eval_labels = eval_data[label_name]

In [15]:
test_features = test_data[feature_names]
test_labels = test_data[label_name]

Input function:

In [16]:
train_features.head()

Unnamed: 0,seed,school_ncaa,opp_seed,opp_ncaa
0,5,Indiana,9,Boston College
1,10,Texas Tech,7,Boston College
2,14,Southern Utah,3,Boston College
3,8,Washington St.,9,Boston College
4,1,North Carolina,9,Boston College


In [17]:
train_features['school_ncaa'].values

array(['Indiana', 'Texas Tech', 'Southern Utah', ..., 'Wichita St.',
       'Wichita St.', 'Wichita St.'], dtype=object)

In [18]:
train_labels.values

array([0, 0, 0, ..., 1, 1, 1])

In [19]:
# def train_input():
#     input_tensor = {}
#     for name in feature_names:
#         input_tensor[name] = train_features[name]
#     #print(input_tensor)
#     _dataset = tf.data.Dataset.from_tensor_slices((input_tensor,train_labels))
#     dataset = _dataset.batch(32)
#     iterator = dataset.make_one_shot_iterator()
#     features, labels = iterator.get_next() #breaking up output into little batches/shards... like using a generator
#     return features, labels
def train_input():
    _dataset = tf.data.Dataset.from_tensor_slices(({'seed':train_features['seed'].values, 'school_ncaa':train_features['school_ncaa'].values, 'opp_seed':train_features['opp_seed'].values, 'opp_ncaa':train_features['opp_ncaa'].values}, train_labels.values))
    dataset = _dataset.batch(32)
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

In [20]:
# def eval_input():
#     input_tensor = {}
#     for name in feature_names:
#         input_tensor[name] = eval_features[name]
#     _dataset = tf.data.Dataset.from_tensor_slices((input_tensor,eval_labels))
#     dataset = _dataset.batch(32)
#     iterator = dataset.make_one_shot_iterator()
#     features, labels = iterator.get_next() #breaking up output into little batches/shards... like using a generator
#     return features, labels
def eval_input():
    _dataset = tf.data.Dataset.from_tensor_slices(({'seed':eval_features['seed'].values, 'school_ncaa':eval_features['school_ncaa'].values, 'opp_seed':eval_features['opp_seed'].values, 'opp_ncaa':eval_features['opp_ncaa'].values}, eval_labels.values))
    dataset = _dataset.batch(32)
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

In [21]:
# def test_input():
#     input_tensor = {}
#     for name in feature_names:
#         input_tensor[name] = test_features[name]
#     _dataset = tf.data.Dataset.from_tensor_slices((input_tensor,test_labels))
#     dataset = _dataset.batch(32)
#     iterator = dataset.make_one_shot_iterator()
#     features, labels = iterator.get_next() #breaking up output into little batches/shards... like using a generator
#     return features, labels
def test_input():
    _dataset = tf.data.Dataset.from_tensor_slices(({'seed':test_features['seed'].values, 'school_ncaa':test_features['school_ncaa'].values, 'opp_seed':test_features['opp_seed'].values, 'opp_ncaa':test_features['opp_ncaa'].values}, test_labels.values))
    dataset = _dataset.batch(32)
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

Build estimator:

In [22]:
estimator = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.00001),
    model_dir='output'
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2e888e4b38>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


training = estimator.train(input_fn=train_input,
                           steps=12000)
print(training)

evaluation = estimator.evaluate(input_fn=eval_input)
print(evaluation)

prediction = estimator.predict(input_fn=test_input)
print(prediction)

p = [item['probabilities'][0] for item in prediction]


print(pd.DataFrame({'proba':p}))

In [23]:
train_spec = tf.estimator.TrainSpec(input_fn=train_input,max_steps=12000)

In [24]:
def serving_input_fn():
    feature_placeholders = {
        'seed': tf.placeholder(tf.int64),
        'school_ncaa':tf.placeholder(tf.string),
        'opp_seed': tf.placeholder(tf.int64),
        'opp_ncaa':tf.placeholder(tf.string)
    }
    features = {key:tf.expand_dims(tensor,-1) for key, tensor in feature_placeholders.items()}
    return tf.estimator.export.ServingInputReceiver(features,feature_placeholders)

In [25]:
exporter = tf.estimator.LatestExporter(name='Exporter',serving_input_receiver_fn=serving_input_fn)

In [26]:
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input,
                                  steps=None,
                                  start_delay_secs=1,
                                  exporters=exporter,
                                 throttle_secs=10)

In [27]:
tf.estimator.train_and_evaluate(estimator=estimator,train_spec=train_spec,eval_spec=eval_spec)

INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into output/model.ckpt.
INFO:tensorflow:loss = 22.18071, step = 1
INFO:tensorflow:global_step/sec: 428.518
INFO:tensorflow:loss = 22.259356, step = 101 (0.234 sec)
INFO:tensorflow:Saving checkpoints for 124 into output/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-16-11:05:04
INFO:tensorflow

({'accuracy': 0.5,
  'accuracy_baseline': 0.5,
  'auc': 0.5,
  'auc_precision_recall': 0.75,
  'average_loss': 0.69282275,
  'label/mean': 0.5,
  'loss': 18.56765,
  'precision': 0.0,
  'prediction/mean': 0.49928802,
  'recall': 0.0,
  'global_step': 124},
 [b'output/export/Exporter/1555412704'])