In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [3]:
import pathlib

# Location of the PetFinder mini archive, and the relative path at which this
# notebook expects to find the CSV after the archive has been extracted.
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

# Download the archive (cached relative to the current directory) and unpack it,
# then load the extracted CSV into a DataFrame.
tf.keras.utils.get_file(
    fname='petfinder_mini.zip',
    origin=dataset_url,
    extract=True,
    cache_dir='.',
)
dataframe = pd.read_csv(csv_file)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip


In [4]:
# Print the (rows, columns) shape, then let the notebook render the first rows
# of the frame as the cell's output.
print(dataframe.shape)
dataframe.head()

(11537, 15)


Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


列	描述	特征类型	数据类型
Type	动物类型（狗、猫）	分类	字符串
Age	宠物年龄	数值	整数
Breed1	宠物的主要品种	分类	字符串
Gender	宠物性别	分类	字符串
Color1	宠物的颜色 1	分类	字符串
Color2	宠物的颜色 2	分类	字符串
MaturitySize	成年个体大小	分类	字符串
FurLength	毛发长度	分类	字符串
Vaccinated	宠物已接种疫苗	分类	字符串
Sterilized	宠物已绝育	分类	字符串
Health	健康状况	分类	字符串
Fee	领养费	数值	整数
Description	关于此宠物的简介	文本	字符串
PhotoAmt	为该宠物上传的照片总数	数值	整数
AdoptionSpeed	领养速度	分类	整数

## 构造数据

In [13]:
# In the original dataset an AdoptionSpeed of 4 indicates the pet was not adopted.
# np.where(cond, 0, 1) works element-wise: 0 where AdoptionSpeed == 4, 1 otherwise.
# The guard keeps this cell re-runnable: after the first run AdoptionSpeed has
# already been dropped, so the existing 'target' column is left as it is instead
# of raising KeyError.
if 'target' not in dataframe.columns:
    dataframe['target'] = np.where(dataframe['AdoptionSpeed'] == 4, 0, 1)

# Drop un-used columns. errors='ignore' turns a repeated drop into a no-op.
dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'], errors='ignore')

## 划分数据集

In [14]:
# Hold out one fifth of the rows as the test set (when no size is given,
# train_test_split defaults to test_size=0.25), then split a validation set off
# the remaining training rows.
# NOTE(review): the validation set is conventionally used to monitor/tune the
# model during training; how it is used is not shown in this part of the notebook.
# A fixed random_state makes both splits reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

# Sanity check: the three parts must partition the original frame.
assert len(train) + len(val) + len(test) == len(dataframe)

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

7383 train examples
1846 validation examples
2308 test examples


## 解包，zip()与dict()

In [11]:
## Unpacking a list: *a passes each element of the list to print() as a separate argument.
a = [(1, 'a'), (2, 'b'), (3, 'c')]
print(*a)

## Unpacking a tuple works the same way.
b = ([1, 2], [3, 4], [5, 6])
print(*b)

## zip(a, b) iterates a and b in lockstep and groups the i-th items of each into one tuple.
print(list(zip(a, b)))

## zip(*a) unpacks first, so the three pairs become three arguments;
## it is equivalent to zip((1, 'a'), (2, 'b'), (3, 'c')).
print(list(zip(*a)))

## dict() builds a dictionary (https://www.runoob.com/python/python-func-dict.html); it accepts three kinds of argument.
## NOTE(review): the earlier note here said dict(obj) calls obj.__dict__ / obj.to_dict(); it does not.
## dict() accepts keyword arguments, a mapping (read through keys() and item access),
## or an iterable of key/value pairs.
print('dict:')
# Keyword-argument form: every name=value becomes one entry.
print(dict(a='a', b='b', t='t'))
print(dict(zip(['one', 'two', 'three'], (1, 2, 3))))  # zip pairs the two sequences element by element
# Iterable form: every item of the iterable must contain exactly two elements (key, value).
print(dict([('one', 1), ('two', 2), ('three', 3)]))
# dict([('one', 1, 2), ('two', 2, 3), ('three', 3, 4)]) # fails: dictionary update sequence element #0 has length 3; 2 is required
print(dict(list(zip(['x', 'y', 'z'], [1, 2, 3])))) # zip pairs the two lists, list() materialises the pairs, dict() then consumes them as an iterable of pairs

(1, 'a') (2, 'b') (3, 'c')
[1, 2] [3, 4] [5, 6]
[((1, 'a'), [1, 2]), ((2, 'b'), [3, 4]), ((3, 'c'), [5, 6])]
[(1, 2, 3), ('a', 'b', 'c')]
dict:
{'a': 'a', 'b': 'b', 't': 't'}
{'one': 1, 'two': 2, 'three': 3}
{'one': 1, 'two': 2, 'three': 3}
{'x': 1, 'y': 2, 'z': 3}


## 创建输入流水线

In [28]:
# Try the pipeline on a copy of the frame first to see what it yields.
test_dataframe = dataframe.copy()
# pop() removes the 'target' column from the copy and returns it as a single column.
# A DataFrame is a table; a Series is one column, and a DataFrame is made up of Series.
test_labels = test_dataframe.pop('target')

# Iterating over a dict yields its keys.
my_dict = {'a': 1, 'b': 2}
for key in my_dict:
    print(key)

# NOTE(review): dict(dataframe) does not go through to_dict(). A DataFrame offers
# keys() and item access, so dict() treats it as a mapping and produces
# {column name: Series}.
# print(dict(dataframe))

# For more on from_tensor_slices see tf_data.ipynb; it slices every component
# it is given along the first axis.
# dataset = tf.data.Dataset.from_tensor_slices(({"a": [1, 2], "b": [3, 4]}, {"c": [5, 6]}))
# print(list(dataset.as_numpy_iterator()))

# The argument is a (features, labels) tuple: both parts are sliced row by row,
# so every element is a pair ({column: value, ...}, label).
ds = tf.data.Dataset.from_tensor_slices((dict(test_dataframe), test_labels))
# take(1) fetches only the first element; list(ds)[:1] would first convert every
# row of the dataset into a Python list just to display one of them.
print(list(ds.take(1)))

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Create a batched tf.data dataset from a pandas DataFrame.

  The frame is copied before its 'target' column is popped off, so the
  caller's DataFrame is left unchanged.

  Args:
    dataframe: DataFrame that contains a 'target' column holding the labels.
    shuffle: when true, shuffle with a buffer as large as the frame.
    batch_size: number of examples per batch.

  Returns:
    A batched dataset whose elements are (dict of column -> values, labels).
  """
  features = dataframe.copy()
  targets = features.pop('target')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
  if not shuffle:
    return dataset.batch(batch_size)
  # Shuffle before batching so individual rows, not whole batches, are mixed.
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)


a
b
[({'Type': <tf.Tensor: shape=(), dtype=string, numpy=b'Cat'>, 'Age': <tf.Tensor: shape=(), dtype=int64, numpy=3>, 'Breed1': <tf.Tensor: shape=(), dtype=string, numpy=b'Tabby'>, 'Gender': <tf.Tensor: shape=(), dtype=string, numpy=b'Male'>, 'Color1': <tf.Tensor: shape=(), dtype=string, numpy=b'Black'>, 'Color2': <tf.Tensor: shape=(), dtype=string, numpy=b'White'>, 'MaturitySize': <tf.Tensor: shape=(), dtype=string, numpy=b'Small'>, 'FurLength': <tf.Tensor: shape=(), dtype=string, numpy=b'Short'>, 'Vaccinated': <tf.Tensor: shape=(), dtype=string, numpy=b'No'>, 'Sterilized': <tf.Tensor: shape=(), dtype=string, numpy=b'No'>, 'Health': <tf.Tensor: shape=(), dtype=string, numpy=b'Healthy'>, 'Fee': <tf.Tensor: shape=(), dtype=int64, numpy=100>, 'PhotoAmt': <tf.Tensor: shape=(), dtype=int64, numpy=1>}, <tf.Tensor: shape=(), dtype=int32, numpy=1>)]
