In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from keras import layers
from sklearn.model_selection import train_test_split

In [3]:
import pathlib

dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')
dataframe = pd.read_csv(csv_file)

In [4]:
print(dataframe.shape)
dataframe.head()

(11537, 15)


Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


列	描述	特征类型	数据类型
Type	动物类型（狗、猫）	分类	字符串
Age	宠物年龄	数值	整数
Breed1	宠物的主要品种	分类	字符串
Color1	宠物的颜色 1	分类	字符串
Color2	宠物的颜色 2	分类	字符串
MaturitySize	成年个体大小	分类	字符串
FurLength	毛发长度	分类	字符串
Vaccinated	宠物已接种疫苗	分类	字符串
Sterilized	宠物已绝育	分类	字符串
Health	健康状况	分类	字符串
Fee	领养费	数值	整数
Description	关于此宠物的简介	文本	字符串
PhotoAmt	为该宠物上传的照片总数	数值	整数
AdoptionSpeed	领养速度	分类	整数

## 构造数据

In [5]:
# In the original dataset "4" indicates the pet was not adopted. 4表示未被领养
# np.where 像推导式
dataframe['target'] = np.where(dataframe['AdoptionSpeed']==4, 0, 1)

# Drop un-used columns.
dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'])

## 划分数据集

In [6]:
# 5分之1的测试集，如果不设置默认是0.25
# 验证集是什么？
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

7383 train examples
1846 validation examples
2308 test examples


## 解包，zip()与dict()

In [7]:
## 数组解包
a = [(1, 'a'), (2, 'b'), (3, 'c')]
print("a解包", *a)

## 元组解包
b = ([1, 2], [3, 4], [5, 6])
print('b解包', *b)

# zip() 函数用于将可迭代的对象作为参数，将对象中对应的元素打包成一个个元组，然后返回由这些元组组成的列表。
print('zip迭代', list(zip([1,2,3], (4,5,6))))
print(list(zip(a)))
## zip迭代, 用迭代器，迭代a，b，然后每次迭代的东西组成一个元组
print(list(zip(a, b)))

## zip迭代，zip(*)，解包后迭代, 解包后可以理解成三个参数，等同于 zip((1, 'a'), (2, 'b'), (3, 'c'))
print(list(zip(*a))) 

## dict创建一个字典 (https://www.runoob.com/python/python-func-dict.html),参数有三种
## 其实如果是一个对象的话，会调用对象的 __dict__和，to_dict()进行转换成dict
print('dict:')
# 映射对象 也就是 a=b这种
print(dict(a='a', b='b', t='t'))
print(dict(zip(['one', 'two', 'three'], (1, 2, 3))))  # 这里的zip在进行两个遍历
# 可遍历对象，遍历对象中必须只有两个元素
print(dict([('one', 1), ('two', 2), ('three', 3)]))
# dict([('one', 1, 2), ('two', 2, 3), ('three', 3, 4)]) # dictionary update sequence element #0 has length 3; 2 is required
print(dict(list(zip(['x', 'y', 'z'], [1, 2, 3])))) #zip两个进行遍历，list转换成了列表，这里进行的是可遍历对象

a解包 (1, 'a') (2, 'b') (3, 'c')
b解包 [1, 2] [3, 4] [5, 6]
zip迭代 [(1, 4), (2, 5), (3, 6)]
[((1, 'a'),), ((2, 'b'),), ((3, 'c'),)]
[((1, 'a'), [1, 2]), ((2, 'b'), [3, 4]), ((3, 'c'), [5, 6])]
[(1, 2, 3), ('a', 'b', 'c')]
dict:
{'a': 'a', 'b': 'b', 't': 't'}
{'one': 1, 'two': 2, 'three': 3}
{'one': 1, 'two': 2, 'three': 3}
{'x': 1, 'y': 2, 'z': 3}


## 创建输入流水线

In [8]:
# 先测验一下数据输出
test_dataframe = dataframe.copy()
# 取出的是一个列
# debug后会发现，dataframe是一个表格，series是一个列，dataframe是一堆的series组成的
test_labels = test_dataframe.pop('target')

# dict的遍历
my_dict = {'a': 1, 'b': 2}
for key in my_dict:
    print(key)
    
# 会调用对象的to_dict()方法进行转换成dict，这是chargpt说的，此处用的并不是迭代特性
# print(dict(dataframe))

# from_tensor_slices可以去看，tf_data.ipynb, 在这里对每个可迭代的
# dataset = tf.data.Dataset.from_tensor_slices(({"a": [1, 2], "b": [3, 4]}, {"c": [5, 6]}))
# print(list(dataset.as_numpy_iterator()))

# 这个参数是元组，就是对每一项进行拆分，最后合成元组，拆分后每一项都这个结构：{a=1, b=2, c=3}, label[i]
ds = tf.data.Dataset.from_tensor_slices((dict(test_dataframe), test_labels))
print(list(ds)[:1])

print('------------------------------------------------------------------------')
# 3个为一组 {a = [1,2,3], b = [4,5,6], c = [7,8,9]}, [1,2,3]
ds = ds.batch(3)
print(list(ds)[:1])

print('------------------------------------------------------------------------')
# 做个测验，直接对test_dataframe进行拆分的结果（Failed to convert a NumPy array to a Tensor (Unsupported object type int)）
# ds1 = tf.data.Dataset.from_tensor_slices((test_dataframe, test_labels)) 

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('target')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

a
b
[({'Type': <tf.Tensor: shape=(), dtype=string, numpy=b'Cat'>, 'Age': <tf.Tensor: shape=(), dtype=int64, numpy=3>, 'Breed1': <tf.Tensor: shape=(), dtype=string, numpy=b'Tabby'>, 'Gender': <tf.Tensor: shape=(), dtype=string, numpy=b'Male'>, 'Color1': <tf.Tensor: shape=(), dtype=string, numpy=b'Black'>, 'Color2': <tf.Tensor: shape=(), dtype=string, numpy=b'White'>, 'MaturitySize': <tf.Tensor: shape=(), dtype=string, numpy=b'Small'>, 'FurLength': <tf.Tensor: shape=(), dtype=string, numpy=b'Short'>, 'Vaccinated': <tf.Tensor: shape=(), dtype=string, numpy=b'No'>, 'Sterilized': <tf.Tensor: shape=(), dtype=string, numpy=b'No'>, 'Health': <tf.Tensor: shape=(), dtype=string, numpy=b'Healthy'>, 'Fee': <tf.Tensor: shape=(), dtype=int64, numpy=100>, 'PhotoAmt': <tf.Tensor: shape=(), dtype=int64, numpy=1>}, <tf.Tensor: shape=(), dtype=int32, numpy=1>)]
------------------------------------------------------------------------
[({'Type': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'Cat', b'

## 理解输入流水线

In [9]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['Age'])
  print('A batch of targets:', label_batch )

Every feature: ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
A batch of ages: tf.Tensor([ 1  6  2 49  4], shape=(5,), dtype=int64)
A batch of targets: tf.Tensor([1 0 1 1 1], shape=(5,), dtype=int32)


## 演示几种特征列

In [10]:
# {a=([1,2,3], b=[4,5,6], c=[7,8,9]}, [10,11,12])
print(next(iter(train_ds)))
print("-----------------------------------------------------------")
# 这个[0]是取出元组的第一项，(a=[1,2,3], b=[4,5,6], c=[7,8,9]}
print(next(iter(train_ds))[0])
# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]
print("-----------------------------------------------------------")
print(train_ds.element_spec)

({'Type': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Dog', b'Cat', b'Cat', b'Cat', b'Cat'], dtype=object)>, 'Age': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([11,  1, 24,  3,  5], dtype=int64)>, 'Breed1': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'Mixed Breed', b'American Shorthair', b'Bombay',
       b'Domestic Short Hair', b'Tabby'], dtype=object)>, 'Gender': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Male', b'Female', b'Male', b'Female', b'Male'], dtype=object)>, 'Color1': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Black', b'Black', b'Black', b'Black', b'Yellow'], dtype=object)>, 'Color2': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'White', b'Brown', b'No Color', b'Brown', b'White'], dtype=object)>, 'MaturitySize': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'Medium', b'Medium', b'Medium', b'Medium', b'Medium'],
      dtype=object)>, 'FurLength': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Short', b'Short', b'

In [11]:
# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
  feature_layer = layers.DenseFeatures(feature_column)
  print(feature_layer(example_batch).numpy())

#### numeric_column 与 bucketized_column

In [12]:
data = {'a': [15, 9, 17, 19, 21, 18, 25, 30],
   'b': [5.0, 6.4, 10.5, 13.6, 15.7, 19.9, 20.3 , 0.0]}
a = tf.feature_column.numeric_column('a')
b = tf.feature_column.numeric_column('b')
a_buckets = tf.feature_column.bucketized_column(a,
   boundaries=[10, 15, 20, 25, 30])

feature_layer1 = tf.keras.layers.DenseFeatures([a_buckets, b])
feature_layer2 = tf.keras.layers.DenseFeatures([a, b])
print(feature_layer1(data))    ## shape : 8*2
print(feature_layer2(data))    ## shape : 8*2

tf.Tensor(
[[ 0.   0.   1.   0.   0.   0.   5. ]
 [ 1.   0.   0.   0.   0.   0.   6.4]
 [ 0.   0.   1.   0.   0.   0.  10.5]
 [ 0.   0.   1.   0.   0.   0.  13.6]
 [ 0.   0.   0.   1.   0.   0.  15.7]
 [ 0.   0.   1.   0.   0.   0.  19.9]
 [ 0.   0.   0.   0.   1.   0.  20.3]
 [ 0.   0.   0.   0.   0.   1.   0. ]], shape=(8, 7), dtype=float32)
tf.Tensor(
[[15.   5. ]
 [ 9.   6.4]
 [17.  10.5]
 [19.  13.6]
 [21.  15.7]
 [18.  19.9]
 [25.  20.3]
 [30.   0. ]], shape=(8, 2), dtype=float32)


In [13]:
# 特征列定义
photo_count = feature_column.numeric_column('PhotoAmt')
# 根据特征列定义转换层
feature_layer = layers.DenseFeatures(photo_count)
# 将数据转化成能够输入模型的参数，tensor
print(feature_layer(example_batch).numpy())

[[2.]
 [5.]
 [3.]
 [1.]
 [1.]]


#### 分类列

In [14]:
animal_type = feature_column.categorical_column_with_vocabulary_list(
      'Type', ['Cat', 'Dog'])
# 将类别转化成one-hot编码
animal_type_one_hot = feature_column.indicator_column(animal_type)
feature_layer = layers.DenseFeatures(animal_type_one_hot)
print(feature_layer(example_batch).numpy())

[[1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


#### 嵌入列

In [15]:
# Notice the input to the embedding column is the categorical column
# we previously created
breed1 = feature_column.categorical_column_with_vocabulary_list(
      'Breed1', dataframe.Breed1.unique())
# 每一个特征转化成一个8维的特征向量
breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
feature_layer = layers.DenseFeatures(breed1_embedding)
# 将数据转化成能够输入模型的参数，tensor
print(feature_layer(example_batch).numpy())

[[ 0.10953157  0.08775736  0.13227281 -0.25318393 -0.02156519 -0.18340784
  -0.5847811  -0.3848457 ]
 [ 0.03675685 -0.03041603 -0.04251684  0.26250032  0.3934432   0.34596962
   0.06367955  0.26502386]
 [-0.28897867  0.15352938 -0.15458833 -0.03339379 -0.29553643 -0.6983391
  -0.5609413   0.11312811]
 [-0.28897867  0.15352938 -0.15458833 -0.03339379 -0.29553643 -0.6983391
  -0.5609413   0.11312811]
 [-0.28897867  0.15352938 -0.15458833 -0.03339379 -0.29553643 -0.6983391
  -0.5609413   0.11312811]]


#### 经过hash处理的特征列

In [18]:
print(dataframe.Breed1.unique())
# 将Bread1，映射到10个桶中
breed1_hashed = feature_column.categorical_column_with_hash_bucket(
      'Breed1', hash_bucket_size=10)
# 将编码转化成独热编码
bread1_one_hot = feature_column.indicator_column(breed1_hashed)
feature_layer = layers.DenseFeatures(bread1_one_hot)
print(feature_layer(example_batch).numpy())

['Tabby' 'Domestic Medium Hair' 'Mixed Breed' 'Domestic Short Hair'
 'Domestic Long Hair' 'Terrier' 'Persian' 'Rottweiler'
 'Jack Russell Terrier' 'Shih Tzu' 'Labrador Retriever' 'Silky Terrier'
 'Bombay' 'Husky' 'Schnauzer' 'Golden Retriever' 'Siberian Husky' 'Collie'
 'German Shepherd Dog' 'Siamese' 'Calico' 'American Staffordshire Terrier'
 'Turkish Van' 'Doberman Pinscher' 'Oriental Short Hair' 'Beagle'
 'Ragdoll' 'Cocker Spaniel' 'Poodle' 'Black Labrador Retriever' 'Bengal'
 'Shar Pei' 'Spitz' 'Birman' 'Belgian Shepherd Malinois'
 'American Shorthair' 'Belgian Shepherd Laekenois' '0'
 'Jack Russell Terrier (Parson Russell Terrier)' 'Shepherd' 'Corgi'
 'Pit Bull Terrier' 'Oriental Tabby' 'Miniature Pinscher' 'Manx' 'Boxer'
 'Dachshund' 'Chihuahua' 'Snowshoe' 'Rat Terrier' 'Tiger' 'Silver'
 'Maine Coon' 'German Pinscher' 'Russian Blue' 'Tuxedo' 'Lhasa Apso'
 'Pomeranian' 'Whippet' 'English Bulldog' 'Bull Terrier'
 'Yellow Labrador Retriever' 'Dalmatian'
 'West Highland White Terrier

In [21]:
# api的案例,10000个桶，最后转化成维度为16的embedding向量
keywords = tf.feature_column.categorical_column_with_hash_bucket("keywords",
10000)
keywords_embedded = tf.feature_column.embedding_column(keywords, 16)
columns = [keywords_embedded]
# 这个特征的tensor：shape = (3, 5)
features = {'keywords': tf.constant([['Tensorflow', 'Keras', 'RNN', 'LSTM',
'CNN'], ['LSTM', 'CNN', 'Tensorflow', 'Keras', 'RNN'], ['CNN', 'Tensorflow',
'LSTM', 'Keras', 'RNN']])}
input_layer = tf.keras.layers.DenseFeatures(columns)
# 这个特征：shap = (3, 16)
# 当您将 keywords 特征列转换为 keywords_embedded 嵌入特征列时，TensorFlow 会将每个原始的 5 维特征向量映射到一个 16 维的嵌入向量。
# 这是嵌入层的作用，它会学习如何将原始的稀疏特征表示（如单词或类别）转换为密集的低维嵌入表示，以便模型更好地理解和泛化
dense_tensor = input_layer(features)
print(dense_tensor.numpy())

[[ 0.15477292  0.09464832 -0.17725372  0.15257654  0.11707681  0.06741592
  -0.12350787 -0.0180659  -0.3225762   0.10277971 -0.00291276  0.0645194
  -0.01811451  0.06048112  0.07542967  0.06035279]
 [ 0.15477294  0.09464832 -0.17725371  0.15257654  0.11707679  0.06741593
  -0.12350788 -0.0180659  -0.32257617  0.10277971 -0.00291276  0.0645194
  -0.01811451  0.06048112  0.07542967  0.06035279]
 [ 0.15477294  0.09464832 -0.17725371  0.15257654  0.11707681  0.06741592
  -0.12350788 -0.0180659  -0.3225762   0.10277971 -0.00291276  0.0645194
  -0.01811451  0.06048112  0.07542967  0.06035279]]
