In [4]:
import os
import pandas as pd

## 读取数据集

In [3]:
# Create a tiny example dataset on disk at ../data/house_tiny.csv.
# Missing entries are written as the literal text "NA".
data_dir = os.path.join("..", 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')
csv_rows = [
    'NumRooms,Alley,Price\n',  # header row: column names
    'NA,Pave,127500\n',  # every following row is one data sample
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
]
with open(data_file, 'w') as f:
    f.writelines(csv_rows)

In [6]:
# Load the CSV file into a DataFrame. As the printed frame shows, the "NA"
# entries are parsed as NaN (missing values).
data = pd.read_csv(data_file)
# Plain-text dump of the whole (4-row) frame.
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


## 处理缺失值

In [7]:
# Typical ways to handle missing values are imputation (filling in) and deletion.

In [24]:
# Split the frame by position: the first two columns (NumRooms, Alley) are the
# inputs and the third column (Price) is the output/target.
# Explicit copies are taken because later cells assign columns into `inputs`;
# assigning into a bare `iloc` slice of `data` makes pandas emit
# SettingWithCopyWarning and leaves it unclear whether `data` is affected.
inputs, outputs = data.iloc[:, 0:2].copy(), data.iloc[:, 2].copy()
inputs

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,


In [25]:
# Imputation: replace the missing NumRooms entries with the mean of the
# values that are present in that column.
mean_rooms = inputs["NumRooms"].mean()
inputs["NumRooms"] = inputs["NumRooms"].fillna(mean_rooms)
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [27]:
# One-hot encode the categorical column(s). With dummy_na=True, a missing
# value gets its own indicator column (Alley_nan in the printed output).
inputs = pd.get_dummies(data=inputs, dummy_na=True)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0        True      False
1       2.0       False       True
2       4.0       False       True
3       3.0       False       True


In [28]:
# Turn the True/False indicator columns into 0/1 integers so that every
# column of `inputs` is numeric.
for indicator_col in ("Alley_Pave", "Alley_nan"):
    inputs[indicator_col] = inputs[indicator_col].astype(int)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


## 转换为张量格式

In [29]:
import tensorflow as tf

2024-04-03 16:03:15.697768: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-03 16:03:15.773572: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.


In [30]:
# Export both pandas objects as float NumPy arrays, then wrap each array in a
# TensorFlow constant tensor.
features_np = inputs.to_numpy(dtype=float)
targets_np = outputs.to_numpy(dtype=float)
X, y = tf.constant(features_np), tf.constant(targets_np)
X, y

(<tf.Tensor: shape=(4, 3), dtype=float64, numpy=
 array([[3., 1., 0.],
        [2., 0., 1.],
        [4., 0., 1.],
        [3., 0., 1.]])>,
 <tf.Tensor: shape=(4,), dtype=float64, numpy=array([127500., 106000., 178100., 140000.])>)

## 练习

In [31]:
# Exercise 1: delete the column with the most missing values.

In [33]:
# Count the NaNs in each column, pick the label of the column with the highest
# count, and drop that column.
# Note: this rebinds `data`, so running the cell again drops a further column.
missing_per_column = data.isna().sum()
most_missing_column = missing_per_column.idxmax()
data = data.drop(columns=most_missing_column)
data

Unnamed: 0,NumRooms,Price
0,,127500
1,2.0,106000
2,4.0,178100
3,,140000


In [34]:
# Exercise 2: convert the preprocessed dataset to a tensor.

In [38]:
# Convert the remaining columns to a float32 tensor via the frame's NumPy
# representation. NaNs still present in the frame are carried into the tensor.
tf.constant(data.to_numpy(), dtype=tf.float32)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[      nan, 1.275e+05],
       [2.000e+00, 1.060e+05],
       [4.000e+00, 1.781e+05],
       [      nan, 1.400e+05]], dtype=float32)>

In [39]:
# Inspect which container type DataFrame.values returns (a NumPy ndarray, per the output).
type(data.values)

numpy.ndarray