See "Linear model with Estimators" and info about tensorflow Datasets

In [1]:
import tensorflow as tf
import tensorflow.feature_column as fc

import numpy as np
import pandas as pd


tf.enable_eager_execution()     

# BATCHING

In [2]:
BATCH_SIZE = 4
x = np.random.sample((100,2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)

iter = dataset.make_one_shot_iterator()
el = iter.get_next()

In [3]:
with tf.Session() as sess: # doesn't work when eager_execution is on
    print("1st:")
    print(sess.run(el)) 
    print("2nd:")
    print(sess.run(el)) 

1st:
[[0.29056874 0.61646245]
 [0.71409457 0.61134421]
 [0.40056018 0.88109308]
 [0.41715577 0.83150932]]
2nd:
[[0.72923825 0.776149  ]
 [0.85322457 0.26545222]
 [0.33889616 0.84120499]
 [0.08851575 0.68703991]]


# Repeat

We can specify the number of times we want the dataset to be iterated.

# Shuffle

In [3]:
# BATCHING
BATCH_SIZE = 4
x = np.array([[1],[2],[3],[4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(BATCH_SIZE)

iter = dataset.make_one_shot_iterator()
el = iter.get_next()

In [44]:
with tf.Session() as sess: # doesn't work when eager_execution is on
    print(sess.run(el))

[[1]
 [3]
 [2]
 [4]]


Dict and DataFrame

In [4]:
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)

In [5]:
type(dict(df)["col1"])

pandas.core.series.Series

In [6]:
type(df.to_dict())

dict

# Take()

Numpy array

In [5]:
x = np.random.sample((100,2))
y = np.random.sample((100,1))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices((x,y))

In [6]:
dataset.take(1)

<TakeDataset shapes: ((2,), (1,)), types: (tf.float64, tf.float64)>

In [8]:
for feature_batch, label_batch in dataset.take(1):
    print(feature_batch)
    print(label_batch)

tf.Tensor([0.11510448 0.90050657], shape=(2,), dtype=float64)
tf.Tensor([0.92966304], shape=(1,), dtype=float64)


Dict

In [23]:
x = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=x)
y = {'output': [1, 2]}
df_labels = pd.DataFrame(data=y)

In [50]:
dataset = tf.data.Dataset.from_tensor_slices((df.values, df_labels.values)) # what the fuck? How to do it without values?

In [42]:
dataset.take(1)

<TakeDataset shapes: ((2,), (1,)), types: (tf.int64, tf.int64)>

In [43]:
for feature_batch, label_batch in dataset.take(1):
    print(type(feature_batch.keys()))
    print(type(list(feature_batch.keys())))
    print(type(feature_batch))
    print(feature_batch['col1']) # output 1 as there is in new ds only one example
    print(label_batch)

AttributeError: 'EagerTensor' object has no attribute 'keys'

# Numeric columns

In [81]:
x = {'status': ['broke', 'middle', 'rich', 'rapper', 'rapper', 'rich'],
     'col2': [3, 4, 4, 5, 6, 7], 
     'age':[12, 20, 24, 27, 40, 65]}
y = {'output': [1, 2, 3, 4, 5, 6]}

In [82]:
ds = tf.data.Dataset.from_tensor_slices((x,y))

In [83]:
#for feature_batch, label_batch in ds.take(1):
#   print("OK")

In [84]:
col2 = fc.numeric_column('col2')
fc.input_layer(x, [col2]).numpy() # what does it do? Just input tensor?

array([[3.],
       [4.],
       [4.],
       [5.],
       [6.],
       [7.]], dtype=float32)

# Categorical Columns

In [135]:
status = fc.categorical_column_with_vocabulary_list(
    'status',
    ['broke', 'middle', 'rich', 'rapper']) # asossiate 0 to 3

In [136]:
fc.input_layer(x # is equal to feature batch
               , [col2, fc.indicator_column(status)])

<tf.Tensor: id=1018, shape=(6, 5), dtype=float32, numpy=
array([[3., 1., 0., 0., 0.],
       [4., 0., 1., 0., 0.],
       [4., 0., 0., 1., 0.],
       [5., 0., 0., 0., 1.],
       [6., 0., 0., 0., 1.],
       [7., 0., 0., 1., 0.]], dtype=float32)>

# Categorical Columns with Hash

when we don't know the number of classes

In [137]:
status_hash = tf.feature_column.categorical_column_with_hash_bucket(
    'status', hash_bucket_size=5) # why does it sometimes have the same values for different categories

In [140]:
status_result = fc.input_layer(x, [fc.indicator_column(status_hash)])
print(status_result.numpy().shape)
status_result.numpy() # hash code one-hot vector for every class 

(6, 5)


array([[0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)

In [141]:
tf.argmax(status_result, axis=1).numpy()

array([3, 1, 0, 4, 4, 0], dtype=int64)

# Bucketized columns

we use boundaries and ranges  instead of numbers

In [143]:
age = fc.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[20, 30])

In [145]:
fc.input_layer(x, [age, age_buckets]).numpy # 3 ranges

<bound method _EagerTensorBase.numpy of <tf.Tensor: id=1126, shape=(6, 4), dtype=float32, numpy=
array([[12.,  1.,  0.,  0.],
       [20.,  0.,  1.,  0.],
       [24.,  0.,  1.,  0.],
       [27.,  0.,  1.,  0.],
       [40.,  0.,  0.,  1.],
       [65.,  0.,  0.,  1.]], dtype=float32)>>

# Crossed_columns

In [99]:
x = {'education': ['7th', '8th', '10th', '10th', '6th', '1th'],
     'occupation': ['pro', 'sport', 'sci', 'model', 'pro', 'tech'],}
x = pd.DataFrame(data = x)
y = {'output': [1, 2, 3, 4, 5, 6]}
y = pd.DataFrame(data = y)
print(x.values)

[['7th' 'pro']
 ['8th' 'sport']
 ['10th' 'sci']
 ['10th' 'model']
 ['6th' 'pro']
 ['1th' 'tech']]


In [100]:
education = fc.categorical_column_with_vocabulary_list(
    'education',
    ['7th', '8th', '10th', '6th', '1th'])
occupation = fc.categorical_column_with_vocabulary_list(
    'occupation',
    ['pro', 'sport', 'sci', 'model', 'tech'])

#education = fc.indicator_column(education)

#occupation = fc.indicator_column(occupation)

In [114]:
education_x_occupation = tf.feature_column.crossed_column(
    ['education', 'occupation'], 30)

In [110]:
#Which method is correct?
#education_x_occupation  = tf.feature_column.crossed_column(
#    [education, occupation], 30)

In [111]:
feat_cols = [
    education,
    occupation,
    education_x_occupation]
print(education_x_occupation)

_CrossedColumn(keys=(_VocabularyListCategoricalColumn(key='education', vocabulary_list=('7th', '8th', '10th', '6th', '1th'), dtype=tf.string, default_value=-1, num_oov_buckets=0), _VocabularyListCategoricalColumn(key='occupation', vocabulary_list=('pro', 'sport', 'sci', 'model', 'tech'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), hash_bucket_size=30, hash_key=None)


In [112]:
fc.input_layer(x, feat_cols)

ValueError: Items of feature_columns must be a _DenseColumn. You can wrap a categorical column with an embedding_column or indicator_column. Given: _VocabularyListCategoricalColumn(key='education', vocabulary_list=('7th', '8th', '10th', '6th', '1th'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

Another way

In [122]:
# From stackoverflow
feat_cols = [
    #fc.indicator_column(education),
    #fc.indicator_column(occupation),
    fc.indicator_column(education_x_occupation)]

example_data = fc.input_layer(x, feat_cols) # output
print(example_data.numpy())

TypeError: 'numpy.ndarray' object is not callable

In [123]:
#Model
classifier = tf.estimator.DNNClassifier(
        feature_columns=feat_cols,
        hidden_units=[2, 10],
        n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_tf_random_seed': None, '_service': None, '_num_ps_replicas': 0, '_log_step_count_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000022BAD947D30>, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_num_worker_replicas': 1, '_master': '', '_device_fn': None, '_task_id': 0, '_global_id_in_cluster': 0, '_is_chief': True, '_keep_checkpoint_max': 5, '_save_summary_steps': 100, '_train_distribute': None, '_session_config': None, '_task_type': 'worker', '_evaluation_master': '', '_model_dir': 'C:\\Users\\Nuclear\\AppData\\Local\\Temp\\tmp8gfumv_6'}


In [59]:
df = pd.DataFrame(x)
a =df[0:3].copy()
print(type(a))

<class 'pandas.core.frame.DataFrame'>


# lambda test

In [64]:
a = lambda: 3

In [66]:
a()

3

In [87]:
def sqrt_(x):
    return np.sqrt(x)

fun = lambda: sqrt_(x) # function with no argument

x = 3
print(fun()) 

1.7320508075688772


In [88]:
mult3 = filter(lambda x: x % 3 == 0, [1, 2, 3, 4, 5, 6, 7, 8, 9])
list(mult3)

[3, 6, 9]

In [92]:
def transform(n):
     return lambda x: x + n

f = transform(3)
print(f(4))
print(transform(3)(4))

7
7


In [80]:
classes = np.array(['<=50K', '>50K'])
pred_class_id = [1, 0,0, 1, 1]
classes[np.array(pred_class_id)] # the same as clsses[i] where i = 0 or 1

array(['>50K', '<=50K', '<=50K', '>50K', '>50K'], dtype='<U5')