In [1]:
import tensorflow as tf
from tensorflow import keras

In [23]:
# A simple 1-D tensor holding the integers 0 through 9.
x = tf.range(0, 10)
x

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)>

In [24]:
# Build a dataset whose elements are the individual slices (here: scalars) of x.
dataset = tf.data.Dataset.from_tensor_slices(tensors=x)
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [25]:
# Iterating a dataset yields one element (a scalar tensor here) at a time.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [26]:
# Dataset transformations return new datasets, so they can be applied one after
# another. repeat(2) yields the source elements twice (original author's note:
# without making a copy of the data); batch(7) then groups them into batches of
# 7, leaving a shorter final batch.
repeated = dataset.repeat(2)
dataset = repeated.batch(7)
for batch in dataset:
    print(batch)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9], shape=(6,), dtype=int32)


In [27]:
# Double every value; map() applies the function to each element (each batch here).
dataset = dataset.map(lambda batch: batch * 2)
for doubled in dataset:
    print(doubled)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18], shape=(6,), dtype=int32)


In [28]:
# Undo the earlier batch(7): split every batch back into its individual
# elements, so the dataset yields scalar tensors again.
dataset = dataset.unbatch()

In [29]:
# After unbatch() each element is a single scalar tensor again.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)


In [30]:
# TODO: the dataset apply() method is still left to be covered here.

In [31]:
# Keep only the elements smaller than 15.
dataset = dataset.filter(lambda value: value < 15)
# take(5) limits the iteration to the first 5 elements.
for kept in dataset.take(5):
    print(kept)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [32]:
import pandas as pd

In [53]:
# Split data/housing.csv into 1000-row chunks and write each chunk to its own
# CSV file, named after the split it belongs to:
#   chunks  1-12 -> data/train_XX.csv
#   chunks 13-15 -> data/valid_XX.csv
#   chunks 16+   -> data/test_XX.csv
#
# Fix: the previous condition `13 < i < 16` excluded i == 13, so chunk 13 fell
# through to the `else` branch and was written as data/test_13.csv.
# NOTE: if this cell was run before the fix, delete the stale data/test_13.csv
# so the "data/*test*" glob does not pick it up.
cols = ["longitude","latitude","housing_median_age","total_rooms","total_bedrooms",
        "population","households","median_income","median_house_value"]
n_train_chunks = 12  # chunks 1..12 form the training set
n_valid_chunks = 3   # the next 3 chunks (13..15) form the validation set

chunks = pd.read_csv("data/housing.csv", usecols=cols, chunksize=1000)
# enumerate(start=1) keeps the 1-based chunk number without a manual counter.
for i, chunk in enumerate(chunks, start=1):
    if i <= n_train_chunks:
        split = "train"
    elif i <= n_train_chunks + n_valid_chunks:
        split = "valid"
    else:
        split = "test"
    # index=False: do not write the DataFrame index as an extra CSV column.
    chunk.to_csv(f"data/{split}_{i:02d}.csv", index=False)
    print(f"{i} chunk is created")

1 chunk is created
2 chunk is created
3 chunk is created
4 chunk is created
5 chunk is created
6 chunk is created
7 chunk is created
8 chunk is created
9 chunk is created
10 chunk is created
11 chunk is created
12 chunk is created
13 chunk is created
14 chunk is created
15 chunk is created
16 chunk is created
17 chunk is created
18 chunk is created
19 chunk is created
20 chunk is created
21 chunk is created


In [54]:
import glob

# Collect the CSV paths of each split; sorting makes the order deterministic.
train_filepath, valid_filepath, test_filepath = (
    sorted(glob.glob(pattern))
    for pattern in ("data/*train*", "data/*valid*", "data/*test*")
)

In [55]:
# Show the sorted list of training CSV paths. The list is named
# `train_filepath`; `train_files` is never defined in this notebook and would
# raise NameError on a fresh kernel.
train_filepath

['data/train_01.csv',
 'data/train_02.csv',
 'data/train_03.csv',
 'data/train_04.csv',
 'data/train_05.csv',
 'data/train_06.csv',
 'data/train_07.csv',
 'data/train_08.csv',
 'data/train_09.csv',
 'data/train_10.csv',
 'data/train_11.csv',
 'data/train_12.csv']

In [56]:
# Preview the first rows of one training file (index 8 in the sorted list).
sample_path = train_filepath[8]
pd.read_csv(sample_path).head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-118.13,33.85,36.0,2110.0,416.0,1128.0,403.0,4.6019,208400.0
1,-118.13,33.85,36.0,1885.0,391.0,1049.0,405.0,3.55,212800.0
2,-118.14,33.86,36.0,1774.0,348.0,934.0,333.0,4.8571,203300.0
3,-118.14,33.86,36.0,1703.0,325.0,845.0,308.0,5.0106,210800.0
4,-118.1,33.85,28.0,2825.0,470.0,1352.0,469.0,5.2639,242000.0


In [57]:
# Read one training file as plain text and echo its first five lines.
# end="" avoids doubling the newline that readline() already returns.
with open(train_filepath[2], "r") as shard:
    for line_no in range(5):
        print(shard.readline(), end="")

longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
-119.78,36.74,15.0,1461.0,415.0,924.0,356.0,2.5045,90300.0
-119.78,36.75,35.0,2114.0,506.0,2050.0,474.0,1.2375,50000.0
-119.78,36.75,31.0,1404.0,379.0,1515.0,387.0,1.2813,56400.0
-119.79,36.74,35.0,853.0,296.0,1228.0,289.0,1.0513,39600.0


In [58]:
# list_files() builds a dataset of file paths and shuffles them by default;
# the seed makes that shuffle reproducible.
file_path = tf.data.Dataset.list_files(file_pattern=train_filepath, seed=42)

In [61]:
# interleave() reads from `cycle_length` files at a time, taking lines from
# each of them in turn. TextLineDataset yields the lines of a single file, and
# skip(1) drops its first line (the CSV header).
n_reader = 5
dataset = file_path.interleave(
    lambda shard_path: tf.data.TextLineDataset(shard_path).skip(1),
    cycle_length=n_reader,
)

In [62]:
# Show the first 10 interleaved lines as raw byte strings.
for record in dataset.take(10):
    print(record.numpy())

b'-119.03,35.3,10.0,829.0,146.0,447.0,173.0,4.1484,102900.0'
b'-119.78,36.74,15.0,1461.0,415.0,924.0,356.0,2.5045,90300.0'
b'-121.76,37.69,29.0,3433.0,711.0,1919.0,709.0,3.3841,184400.0'
b'-118.04,33.96,42.0,1430.0,338.0,1269.0,321.0,3.3214,148800.0'
b'-118.13,33.85,36.0,2110.0,416.0,1128.0,403.0,4.6019,208400.0'
b'-119.02,35.3,10.0,7397.0,1369.0,4611.0,1310.0,3.6369,81600.0'
b'-119.78,36.75,35.0,2114.0,506.0,2050.0,474.0,1.2375,50000.0'
b'-121.77,37.68,36.0,1687.0,372.0,950.0,372.0,3.5532,158400.0'
b'-118.05,33.96,37.0,2622.0,652.0,2778.0,644.0,2.9714,160300.0'
b'-118.13,33.85,36.0,1885.0,391.0,1049.0,405.0,3.55,212800.0'
