In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
DATA_PATH = './data/housing.csv'

In [3]:
df_data = pd.read_csv(DATA_PATH)
df_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
from splitter import split_data

df_train, df_test = split_data(df_data)

In [5]:
df_train_x = df_train.drop('median_house_value', axis=1)
df_train_y = df_train['median_house_value'].copy()

In [6]:
df_train_x.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN


## 检查缺失值

In [7]:
df_missing = df_train_x[df_train_x.isnull().any(axis=1)]
df_missing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
4629,-118.3,34.07,18.0,3759.0,,3296.0,1462.0,2.2708,<1H OCEAN
6068,-117.86,34.01,16.0,4632.0,,3038.0,727.0,5.1762,<1H OCEAN
17923,-121.97,37.35,30.0,1955.0,,999.0,386.0,4.6328,<1H OCEAN
13656,-117.3,34.05,6.0,2155.0,,1039.0,391.0,1.6675,INLAND
19252,-122.79,38.48,7.0,6837.0,,3468.0,1405.0,3.1662,<1H OCEAN


## 处理数值特征的缺失值

In [8]:
# Option 1 (not used): drop the rows whose total_bedrooms value is missing.
# df_missing.dropna(subset=['total_bedrooms'])

In [9]:
# Option 2 (not used): drop the total_bedrooms column altogether.
# df_missing.drop('total_bedrooms', axis=1)

In [10]:
from sklearn.impute import SimpleImputer

# The median is only defined for numbers, so set the categorical column
# aside before fitting the imputer.
df_train_num = df_train_x.drop(columns=['ocean_proximity'])

imputer = SimpleImputer(strategy='median')
imputer.fit(df_train_num)

# One learned median per numeric column, in column order.
print(imputer.statistics_)

[-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]


In [11]:
# Sanity check: the per-column medians must match imputer.statistics_.
# numeric_only=True skips the string column `ocean_proximity`; without it
# pandas >= 2.0 raises TypeError instead of silently dropping that column.
df_train_x.median(numeric_only=True)

longitude             -118.5100
latitude                34.2600
housing_median_age      29.0000
total_rooms           2119.5000
total_bedrooms         433.0000
population            1164.0000
households             408.0000
median_income            3.5409
dtype: float64

In [12]:
X = imputer.transform(df_train_num)
df_train_num_filled = pd.DataFrame(X, columns=df_train_num.columns, index=df_train_num.index)

In [13]:
# 检查缺失值填充
df_train_num_filled.loc[df_missing.index.values].head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
4629,-118.3,34.07,18.0,3759.0,433.0,3296.0,1462.0,2.2708
6068,-117.86,34.01,16.0,4632.0,433.0,3038.0,727.0,5.1762
17923,-121.97,37.35,30.0,1955.0,433.0,999.0,386.0,4.6328
13656,-117.3,34.05,6.0,2155.0,433.0,1039.0,391.0,1.6675
19252,-122.79,38.48,7.0,6837.0,433.0,3468.0,1405.0,3.1662


## 处理分类特征

In [14]:
df_train_cat = df_train_x[['ocean_proximity']]
df_train_cat.head()

Unnamed: 0,ocean_proximity
17606,<1H OCEAN
18632,<1H OCEAN
14650,NEAR OCEAN
3230,INLAND
3555,<1H OCEAN


### 序数编码

In [15]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
ordinal = ordinal_encoder.fit_transform(df_train_cat)

print(ordinal_encoder.categories_)
print(ordinal[:5])

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)]
[[0.]
 [0.]
 [4.]
 [1.]
 [0.]]


### 独热编码

In [16]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder returns a SciPy sparse matrix by default, so no flag is needed.
# The old `sparse=` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed `sparse_output`); relying on the default works on every version.
one_hot_encoder = OneHotEncoder()
one_hot = one_hot_encoder.fit_transform(df_train_cat)

print(one_hot_encoder.categories_)
# Densify only the first rows for display; the full matrix stays sparse.
print(one_hot[0:4].toarray())

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)]
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]]


## 添加额外特征

In [17]:
from sklearn.preprocessing import FunctionTransformer

def add_extra_features(X, columns, add_bedrooms_per_room=True):
    """Append ratio features derived from the room/population/household counts.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix, given as a NumPy array or a DataFrame. It is converted
        with ``np.asarray`` so positional column slicing works for both.
    columns : sequence of str
        Name of every column of ``X``, in order. Any sequence is accepted
        (list, tuple, pandas Index). It must contain 'total_rooms',
        'total_bedrooms', 'population' and 'households'.
    add_bedrooms_per_room : bool, default True
        When true, also append ``total_bedrooms / total_rooms``.

    Returns
    -------
    numpy.ndarray
        ``X`` followed by ``rooms_per_household``, ``population_per_household``
        and, if requested, ``bedrooms_per_room`` as trailing columns.

    Raises
    ------
    ValueError
        If one of the required column names is missing from ``columns``.
    """
    # Normalise the inputs: a DataFrame does not support X[:, i], and a
    # pandas Index has no list-style .index() lookup.
    X = np.asarray(X)
    names = list(columns)
    rooms_ix, bedrooms_ix, population_ix, household_ix = (
        names.index(col)
        for col in ('total_rooms', 'total_bedrooms', 'population', 'households')
    )

    households = X[:, household_ix]
    extra = [X[:, rooms_ix] / households, X[:, population_ix] / households]
    if add_bedrooms_per_room:
        extra.append(X[:, bedrooms_ix] / X[:, rooms_ix])

    # column_stack turns each 1-D ratio into a trailing column of the result.
    return np.column_stack([X, *extra])

# Stand-alone demo of the transformer on the raw training features.
# bedrooms_per_room is switched off here, so exactly two columns are appended.
attr_adder = FunctionTransformer(
    func=add_extra_features,
    validate=False,
    kw_args=dict(columns=df_train_x.columns.tolist(), add_bedrooms_per_room=False),
)
train_x_extra = attr_adder.fit_transform(df_train_x.values)

In [18]:
df_train_x_extra = pd.DataFrame(
    train_x_extra,
    columns=list(df_train_x.columns) + ['rooms_per_household', 'population_per_household'],
    index=df_train_x.index)
df_train_x_extra.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
17606,-121.89,37.29,38,1568,351,710,339,2.7042,<1H OCEAN,4.62537,2.0944
18632,-121.93,37.05,14,679,108,306,113,6.4214,<1H OCEAN,6.00885,2.70796
14650,-117.2,32.77,31,1952,471,936,462,2.8621,NEAR OCEAN,4.22511,2.02597
3230,-119.61,36.31,25,1847,371,1460,353,1.8839,INLAND,5.23229,4.13598
3555,-118.59,34.23,17,6592,1525,4459,1463,3.0347,<1H OCEAN,4.50581,3.04785


### 构建流水线

#### 数值特征流水线

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing, applied in order:
#   1. fill missing values with the column median,
#   2. append the ratio features (the column names tell the function where to look),
#   3. standardise every resulting column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', FunctionTransformer(
        add_extra_features,
        validate=False,
        kw_args={'columns': df_train_num.columns.tolist()},
    )),
    ('std_scaler', StandardScaler()),
])

train_num_processed = num_pipeline.fit_transform(df_train_num)

In [20]:
train_num_processed[0]

array([-1.15604281,  0.77194962,  0.74333089, -0.49323393, -0.44543821,
       -0.63621141, -0.42069842, -0.61493744, -0.31205452, -0.08649871,
        0.15531753])

#### 整合分类特征流水线

In [21]:
from sklearn.compose import ColumnTransformer

# Route columns by type: the single categorical column goes to the one-hot
# encoder, every other column goes through the numeric pipeline.
cat_attribs = ['ocean_proximity']
num_attribs = [col for col in df_train_x.columns if col not in cat_attribs]

full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

In [22]:
train_processed = full_pipeline.fit_transform(df_train_x)

In [23]:
train_processed[0]

array([-1.15604281,  0.77194962,  0.74333089, -0.49323393, -0.44543821,
       -0.63621141, -0.42069842, -0.61493744, -0.31205452, -0.08649871,
        0.15531753,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ])

In [24]:
print(train_processed.shape)

(16512, 16)
