In [9]:
from __future__ import print_function

import collections
import io
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython import display
import tensorflow as tf
from sklearn import metrics

# Only surface TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Remote TFRecord files for the training and test data.
train_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/train.tfrecord'
test_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/test.tfrecord'

# Fetch each file under its URL basename and print the local path returned.
train_path = tf.keras.utils.get_file(fname=train_url.rsplit('/', 1)[-1],
                                     origin=train_url)
print(train_path)

test_path = tf.keras.utils.get_file(fname=test_url.rsplit('/', 1)[-1],
                                    origin=test_url)
print(test_path)

/Users/xinwang/.keras/datasets/train.tfrecord
/Users/xinwang/.keras/datasets/test.tfrecord


In [24]:
def _parse_function(record):
    """Decode one serialized example into a features dict and its labels.

    Args:
        record: A single serialized example, handed to
            `tf.parse_single_example`.

    Returns:
        A `({'terms': terms}, labels)` tuple. `terms` is the `.values` of the
        parsed variable-length string feature "terms"; `labels` is the parsed
        float32 feature "labels", declared with shape [1].
    """
    feature_spec = {
        "terms": tf.VarLenFeature(dtype=tf.string),
        "labels": tf.FixedLenFeature(shape=[1], dtype=tf.float32),
    }
    example = tf.parse_single_example(record, feature_spec)

    # Keep only the values of the variable-length "terms" feature.
    return {'terms': example['terms'].values}, example['labels']


# Peek at the training data: parse the TFRecord file and print the first two
# examples produced by `_parse_function`.
ds = tf.data.TFRecordDataset(train_path).map(_parse_function)

# Evaluating `n` in a session yields the next parsed example each time.
n = ds.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    for i in (0, 1):
        print(sess.run(n))

({'terms': array([b'but', b'it', b'does', b'have', b'some', b'good', b'action',
       b'and', b'a', b'plot', b'that', b'is', b'somewhat', b'interesting',
       b'.', b'nevsky', b'acts', b'like', b'a', b'body', b'builder',
       b'and', b'he', b'isn', b"'", b't', b'all', b'that', b'attractive',
       b',', b'in', b'fact', b',', b'imo', b',', b'he', b'is', b'ugly',
       b'.', b'(', b'his', b'acting', b'skills', b'lack', b'everything',
       b'!', b')', b'sascha', b'is', b'played', b'very', b'well', b'by',
       b'joanna', b'pacula', b',', b'but', b'she', b'needed', b'more',
       b'lines', b'than', b'she', b'was', b'given', b',', b'her',
       b'character', b'needed', b'to', b'be', b'developed', b'.',
       b'there', b'are', b'way', b'too', b'many', b'men', b'in', b'this',
       b'story', b',', b'there', b'is', b'zero', b'romance', b',', b'too',
       b'much', b'action', b',', b'and', b'way', b'too', b'dumb', b'of',
       b'an', b'ending', b'.', b'it', b'is', b'very', b'vio

In [20]:
def _input_fn(input_filenames, n_epochs=None, shuffle=True,
              batch_size=25, shuffle_buffer_size=10000):
    """Build the (features, labels) input tensors from TFRecord files.

    Args:
        input_filenames: Filename(s) passed to `tf.data.TFRecordDataset`.
        n_epochs: Passed unchanged to `Dataset.repeat`. Per the tf.data
            documentation, `None` (the default) repeats indefinitely.
        shuffle: If true, shuffle the parsed examples before batching.
        batch_size: Number of examples per padded batch. Defaults to 25, the
            value that was previously hard-coded.
        shuffle_buffer_size: Buffer size passed to `Dataset.shuffle` when
            `shuffle` is true. Defaults to 10000, the value that was
            previously hard-coded.

    Returns:
        A `(features, labels)` tuple obtained from a one-shot iterator over
        the batched dataset. `features` is the dict produced by
        `_parse_function` (key 'terms').
    """
    ds = tf.data.TFRecordDataset(input_filenames)
    ds = ds.map(_parse_function)

    if shuffle:
        ds = ds.shuffle(shuffle_buffer_size)

    # "terms" is a variable-length feature, so examples are padded to a
    # common shape within each batch.
    ds = ds.padded_batch(batch_size, ds.output_shapes)

    ds = ds.repeat(n_epochs)

    features, labels = ds.make_one_shot_iterator().get_next()

    return features, labels


# Vocabulary used for the "terms" feature column. Order is preserved as given.
informative_terms = (
    "bad", "great", "best", "worst", "fun",
    "beautiful", "excellent", "poor", "boring", "awful",
    "terrible", "definitely", "perfect", "liked", "worse",
    "waste", "entertaining", "loved", "unfortunately", "amazing",
    "enjoyed", "favorite", "horrible", "brilliant", "highly",
    "simple", "annoying", "today", "hilarious", "enjoyable",
    "dull", "fantastic", "poorly", "fails", "disappointing",
    "disappointment", "not", "him", "her", "good",
    "time", "?", ".", "!", "movie",
    "film", "action", "comedy", "drama", "family",
)

# Categorical column over the 'terms' feature, restricted to the list above.
terms_feature_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key='terms',
    vocabulary_list=informative_terms,
)

# Adagrad optimizer (learning rate 0.1) wrapped so its gradients are clipped
# by norm, with a clip value of 5.0.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.AdagradOptimizer(learning_rate=0.1),
    5.0,
)

feature_columns = [terms_feature_column]

# Linear classifier over the single categorical "terms" column.
classifier = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
    optimizer=my_optimizer,
)

# Train for 1000 steps on the training file, using `_input_fn`'s defaults.
classifier.train(
    input_fn=lambda: _input_fn([train_path]),
    steps=1000,
)

# Evaluate the trained classifier on the training file and then on the test
# file (1000 steps each), printing every metric returned by `evaluate`.
# After the loop, `evaluation_metrics` holds the test-set metrics.
for heading, data_path, rule in (
        ('Training set metrics:', train_path, '-------'),
        ('Test set metrics:', test_path, '--------')):
    # `evaluate` invokes the lambda during this call, so reading the loop
    # variable from the enclosing scope is safe.
    evaluation_metrics = classifier.evaluate(
        input_fn=lambda: _input_fn([data_path]),
        steps=1000,
    )

    print(heading)
    for m in evaluation_metrics:
        print(m, evaluation_metrics[m])
    print(rule)
    

Training set metrics:
accuracy 0.78592
accuracy_baseline 0.5
auc 0.8710361
auc_precision_recall 0.8641522
average_loss 0.45636493
label/mean 0.5
loss 11.409123
precision 0.7946414
prediction/mean 0.4681327
recall 0.77112
global_step 1000
-------
Test set metrics:
accuracy 0.7798
accuracy_baseline 0.5
auc 0.8682654
auc_precision_recall 0.8616555
average_loss 0.45892027
label/mean 0.5
loss 11.473007
precision 0.7889779
prediction/mean 0.46761262
recall 0.76392
global_step 1000
--------
