In [1]:
# 先生成大量的文件. 
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing regression dataset (feature matrix + target).
housing = fetch_california_housing()

data = housing.data
target = housing.target

from sklearn.model_selection import train_test_split

# Two-stage split with fixed seeds for reproducibility: first hold out a test
# set, then carve a validation set out of the remaining training data.
x_train_all, x_test, y_train_all, y_test = train_test_split(data, target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state=10)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse the fitted scaler for
# the validation and test splits.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

### 生成大量csv文件

In [3]:
x = np.arange(9)
np.array_split(x, 3)

[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]

In [4]:
len(x_train_scaled)

11610

In [5]:
x_train_scaled.shape

(11610, 8)

In [6]:
x_train_scaled[0]

array([-0.22763171, -1.80227442, -0.2591649 , -0.08360651, -0.59848384,
       -0.12716668,  1.10047447, -0.88169018])

In [7]:
np.arange(len(data))

array([    0,     1,     2, ..., 20637, 20638, 20639])

In [9]:
x_train_scaled[[0, 1, 2, 3]]

array([[-0.22763171, -1.80227442, -0.2591649 , -0.08360651, -0.59848384,
        -0.12716668,  1.10047447, -0.88169018],
       [-1.37848743, -1.72278134, -0.75635644, -0.08560269, -0.20853523,
         0.14983419, -1.45661346,  1.2763948 ],
       [-1.31113536,  1.85440709, -0.15558667, -0.26748997, -1.11784397,
        -0.01745359,  1.06300798, -1.3874131 ],
       [-0.64748804,  0.8209971 , -0.66607127, -0.34479833, -0.95243719,
         0.06288352, -0.76816671,  0.70557882]])

In [12]:
','.join([str(col) for col in  x_train_scaled[0]])

'-0.22763170751337458,-1.802274419767064,-0.25916490468728715,-0.08360651274598882,-0.5984838423044915,-0.1271666847035384,1.1004744650918357,-0.881690175316059'

In [10]:
import os


# Directory that will hold the generated CSV shards.
# makedirs(exist_ok=True) is safe to re-run and avoids the check-then-create
# race of os.path.exists() followed by os.mkdir().
output_dir = 'generate_csv'
os.makedirs(output_dir, exist_ok=True)
    

def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Files are named ``<name_prefix>_<NN>.csv`` (two-digit, zero-based part
    index) inside ``output_dir``, which is created if it does not exist yet.

    Args:
        output_dir: Directory to write the CSV files into.
        data: Indexable 2-D collection (e.g. a NumPy array); ``data[i]`` must
            yield the iterable of values making up row ``i``.
        name_prefix: Prefix of every generated file name, e.g. ``'train'``.
        header: Optional header line (without trailing newline) written at the
            top of every file.
        n_parts: Number of files to distribute the rows across.

    Returns:
        List of the written file paths, in part order.
    """
    os.makedirs(output_dir, exist_ok=True)
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []

    # Split the row indices into n_parts nearly equal chunks, then use each
    # chunk of indices to pull the matching rows out of data.
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)

        # Write one shard: optional header, then one comma-joined line per row.
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            for row_index in row_indices:
                # str(), not repr(): NumPy >= 2.0 renders scalar reprs as
                # 'np.float64(1.5)', which would corrupt the CSV.
                f.write(','.join(str(col) for col in data[row_index]))
                f.write('\n')

    return filenames

In [13]:
# Append each split's target values as a final column after its scaled features,
# so every row carries both the sample and its label.
train_data = np.column_stack((x_train_scaled, y_train))
valid_data = np.column_stack((x_valid_scaled, y_valid))
test_data = np.column_stack((x_test_scaled, y_test))

In [14]:
train_data

array([[-0.22763171, -1.80227442, -0.2591649 , ...,  1.10047447,
        -0.88169018,  1.238     ],
       [-1.37848743, -1.72278134, -0.75635644, ..., -1.45661346,
         1.2763948 ,  0.675     ],
       [-1.31113536,  1.85440709, -0.15558667, ...,  1.06300798,
        -1.3874131 ,  1.042     ],
       ...,
       [ 3.873586  ,  0.26454556,  0.92499504, ..., -0.74006684,
         0.87081503,  5.00001   ],
       [-0.88177929, -0.5303852 , -0.82283667, ..., -0.66981717,
         0.55536409,  0.938     ],
       [-0.26114763,  0.02606633, -0.30457834, ..., -0.64640062,
         0.57539272,  1.424     ]])

In [16]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [17]:
header_cols = housing.feature_names + ['MedianHouseValue']
header_str = ','.join(header_cols)

In [18]:
header_str

'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'

In [None]:
# 1. filenames -> dataset
# 2. interleave -> read file -> dataset
# 3. parse csv

In [19]:
# Write each split to 20 CSV shards (each starting with the header row) and
# keep the returned lists of file paths for building the input pipelines.
train_filenames = save_to_csv(output_dir, train_data, 'train', header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, 'valid', header_str, n_parts=20)
test_filenames = save_to_csv(output_dir, test_data, 'test', header_str, n_parts=20)

In [20]:
# tf.data has a dedicated API for building a dataset out of file names: list_files.
# Each element is one file path as a scalar string tensor.
filename_dataset = tf.data.Dataset.list_files(train_filenames)

In [21]:
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\

In [22]:
# Read every file in filename_dataset and merge their lines into one dataset.
n_readers = 5

dataset = filename_dataset.interleave(
    # map_fn: turns one file name into a dataset of that file's text lines.
    # skip(1) drops the header row.
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length = n_readers
)

In [24]:
x_train_scaled[0]

array([-0.22763171, -1.80227442, -0.2591649 , -0.08360651, -0.59848384,
       -0.12716668,  1.10047447, -0.88169018])

In [23]:
for line in dataset.take(15):
    print(line.numpy())

b'-0.8581366919491114,-0.8483575045964773,-0.10169260162102436,-0.1607259534248392,-0.22653286196012368,0.0572667062111848,-1.348897299826622,2.0374827735260688,0.792'
b'-0.22763170751337458,-1.802274419767064,-0.25916490468728715,-0.08360651274598882,-0.5984838423044915,-0.1271666847035384,1.1004744650918357,-0.881690175316059,1.238'
b'-0.24615085679382154,-0.9278505808606928,-0.40923962585466006,0.016045584313930498,1.1412868722094869,0.0463863961225972,0.8335257258942214,-1.1470695343017077,2.458'
b'-1.1497999507895804,0.8209970969520494,-0.48712584019556754,-0.13433261994390244,0.5327956370839083,0.017342235846994018,0.9318752613880796,-0.6964253397977812,0.65'
b'4.423823550949558,-0.4508921232753995,1.0152171796735139,-0.16929413259492634,2.0651651137100138,-0.03330117175555653,0.8007425473962686,-1.277255634936179,5.00001'
b'1.1684026062558825,-1.8817674960312796,1.190417643333094,0.2851270212887215,-0.7193250594209796,0.09480023482861354,0.95997512867204,-0.9267545947664537,2.15

In [27]:
# Parse CSV.
# I.e. turn a raw line such as b'-0.8581366919491114,-0.8483575045964773,-0.10169260162102436,-0.1607259534248392,-0.22653286196012368,0.0572667062111848,-1.348897299826622,2.0374827735260688,0.792'
# into data that can be used for training.
# record_defaults supplies one default per column; its dtype (int32 here)
# determines the dtype of the parsed field.
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(1, dtype=tf.int32)] * 5
tf.io.decode_csv(sample_str, record_defaults)

[<tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=int32, numpy=2>,
 <tf.Tensor: shape=(), dtype=int32, numpy=3>,
 <tf.Tensor: shape=(), dtype=int32, numpy=4>,
 <tf.Tensor: shape=(), dtype=int32, numpy=5>]

In [None]:
# Note: the difference between None and np.nan

In [28]:
# Wrap the CSV parsing logic in a function.
def parse_csv_line(line, n_fields=9):
    """Parse one CSV text line into a ``(features, label)`` pair of tensors.

    Args:
        line: A single CSV record (string / bytes / scalar string tensor).
        n_fields: Total number of comma-separated columns in the record; the
            last column is treated as the label.

    Returns:
        Tuple ``(x, y)``: ``x`` stacks the first ``n_fields - 1`` columns into
        a 1-D tensor, ``y`` is a 1-element tensor holding the last column.
    """
    # One NaN default per column; the default also fixes each field's dtype.
    nan_default = tf.constant(np.nan)
    columns = tf.io.decode_csv(line, [nan_default] * n_fields)
    features = tf.stack(columns[:-1])
    label = tf.stack(columns[-1:])
    return features, label

In [30]:
def parse_csv_line_origin(line):
    """Pure NumPy reference parser for one CSV record given as bytes.

    Args:
        line: A bytes object holding comma-separated numeric fields.

    Returns:
        Tuple ``(x, y)`` of 1-D float arrays: ``x`` holds every field except
        the last, ``y`` holds only the last field.
    """
    tokens = line.decode().split(',')
    scalars = [np.array(float(token)) for token in tokens]
    features = np.stack(scalars[:-1])
    label = np.stack(scalars[-1:])
    return features, label

In [31]:
line = b'-0.8581366919491114,-0.8483575045964773,-0.10169260162102436,-0.1607259534248392,-0.22653286196012368,0.0572667062111848,-1.348897299826622,2.0374827735260688,0.792'
parse_csv_line(line)

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-0.8581367 , -0.8483575 , -0.1016926 , -0.16072595, -0.22653286,
         0.0572667 , -1.3488973 ,  2.0374827 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.792], dtype=float32)>)

In [32]:
parse_csv_line_origin(line)

(array([-0.85813669, -0.8483575 , -0.1016926 , -0.16072595, -0.22653286,
         0.05726671, -1.3488973 ,  2.03748277]),
 array([0.792]))

In [40]:
# Wrap the whole input pipeline in a single function.
def csv_reader_dataset(filenames, n_readers=5, batch_size=32, n_parse_threads=5, shuffle_buffer_size=10000):
    """Build a repeating, shuffled, batched tf.data pipeline over CSV files.

    Args:
        filenames: File paths (or glob patterns) of the CSV shards to read;
            every file is expected to start with one header row.
        n_readers: Number of files whose lines are interleaved at a time.
        batch_size: Number of parsed records per emitted batch.
        n_parse_threads: Parallelism used when mapping ``parse_csv_line``.
        shuffle_buffer_size: Size of the buffer records are shuffled within.

    Returns:
        A ``tf.data.Dataset`` yielding batched ``(features, label)`` pairs as
        produced by ``parse_csv_line``. The dataset never ends, so consumers
        must bound it (e.g. ``take(n)`` or ``steps_per_epoch``).
    """
    dataset = tf.data.Dataset.list_files(filenames)
    # Repeat the file list indefinitely so the pipeline never runs dry.
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        # Read each file line by line, dropping its header row.
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    # Shuffle the records. Dataset transformations return a new dataset rather
    # than mutating in place, so the result must be reassigned; otherwise the
    # shuffle is silently dropped.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [41]:
train_set = csv_reader_dataset(train_filenames, batch_size=32)

In [42]:
for item in train_set.take(1):
    print(item)

(<tf.Tensor: shape=(32, 8), dtype=float32, numpy=
array([[-9.68557775e-01, -9.27850604e-01, -9.93346393e-01,
        -7.95192048e-02,  2.15943837e+00,  1.30265579e-01,
        -6.79183781e-01,  6.00428522e-01],
       [ 2.32438183e+00, -5.34267426e-02,  5.54725766e-01,
        -2.40248457e-01, -8.29024911e-01, -8.19876119e-02,
         1.02085817e+00, -1.30229139e+00],
       [-7.30690777e-01, -1.40480900e+00,  4.16910082e-01,
         3.39130104e-01, -7.48385713e-02, -3.39961126e-02,
         2.85671616e+00, -2.32375169e+00],
       [-6.09115481e-01, -1.00734365e+00, -1.12950802e-01,
        -4.15341221e-02, -3.57658446e-01, -3.47044170e-02,
         1.00212491e+00, -8.41632903e-01],
       [-4.60108399e-01, -1.24582291e+00, -1.53106987e-01,
         6.31197765e-02,  4.41950470e-01,  3.88852321e-02,
        -1.13346493e+00,  1.18125880e+00],
       [-7.27435231e-01, -2.12412894e-01, -2.73457110e-01,
        -1.59449220e-01,  2.56832004e-01,  7.86850825e-02,
         1.08174121e+00, -8

In [43]:
for x_batch, y_batch in train_set.take(2):
    print(x_batch)
    print('------------------')
    print(y_batch)

tf.Tensor(
[[-6.90877259e-01  5.03024817e-01 -5.07885396e-01 -1.51979014e-01
   3.90528679e-01  1.38743110e-02  7.96059251e-01 -1.13705528e+00]
 [ 9.36673105e-01  4.23531711e-01 -2.29369681e-02 -2.91700065e-01
  -5.75344026e-01  1.38818473e-01 -8.61832917e-01  8.35764945e-01]
 [-2.46150851e-01 -9.27850604e-01 -4.09239620e-01  1.60455834e-02
   1.14128685e+00  4.63863946e-02  8.33525717e-01 -1.14706957e+00]
 [-4.33050156e-01 -4.50892121e-01 -5.07441580e-01 -6.03004396e-02
   2.27770853e+00 -4.91754226e-02 -8.94616127e-01  8.80829334e-01]
 [-4.60108399e-01 -1.24582291e+00 -1.53106987e-01  6.31197765e-02
   4.41950470e-01  3.88852321e-02 -1.13346493e+00  1.18125880e+00]
 [-1.13117409e+00 -1.88176751e+00 -8.14262927e-01 -3.32823545e-02
   1.47210038e+00  8.08585510e-02 -1.44724679e+00  1.26638043e+00]
 [ 1.81123808e-01 -2.04075360e+00  7.37920225e-01  1.76768914e-01
   7.02136898e+00  4.68837731e-02 -8.19683135e-01  1.20128739e+00]
 [ 4.57814429e-03 -1.08683670e+00 -2.29467839e-01  3.32003

In [44]:
batch_size = 64
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)

In [45]:
from tensorflow import keras

In [46]:
# Model: a small fully connected regressor -- one 32-unit ReLU hidden layer on
# the 8 input features, followed by a single output unit.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=[8]),
    keras.layers.Dense(1)
])

In [47]:
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [51]:
# The datasets built by csv_reader_dataset repeat indefinitely, so Keras has to
# be told how many batches make up one training epoch and one validation pass.
history = model.fit(train_set, 
                   validation_data=valid_set,
                   steps_per_epoch= len(x_train_scaled)//batch_size,
                   validation_steps = len(x_valid_scaled)//batch_size,
                   epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

KeyboardInterrupt: 