In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_california_housing #导入加尼福尼亚房价数据

In [2]:
# Load the California housing dataset and show the size of its feature matrix.
house = fetch_california_housing()
np.shape(house.data)  # (number of rows, number of features)

(20640, 8)

这里可以看到，房价数据有 20640 行（样本）、8 个特征，一共是 20640×8 = 165120 个数据

In [3]:
# Feature matrix and regression target of the loaded dataset.
x_full = house.data
y_full = house.target
# Rows are samples; columns are features.
n_samples, n_features = x_full.shape

In [4]:
# Seeded generator so the positions of the artificial missing values are reproducible.
rng = np.random.RandomState(seed=42)

In [5]:
# Fraction of all cells (rows x features) to blank out.
missing_rate = 0.15
n_cells = n_samples * n_features
# Number of (row, column) positions that will be drawn for NaN injection.
n_missing_samples = int(np.floor(n_cells * missing_rate))
n_missing_samples

24768

In [6]:
# Draw one (row, column) pair per missing value. The two draws are independent
# and made with replacement, so the same pair may occur more than once; the
# number of distinct NaN cells can therefore be lower than n_missing_samples.
missing_samples = rng.randint(low=0, high=n_samples, size=n_missing_samples)    # row indices
missing_features = rng.randint(low=0, high=n_features, size=n_missing_samples)  # column indices


In [7]:
# Work on independent copies. A plain assignment would only create a second
# name for the same array, so the in-place NaN injection performed on
# x_missing afterwards would also overwrite x_full (and house.data), leaving
# no intact version of the complete data.
x_missing = x_full.copy()
y_missing = y_full.copy()

In [8]:
# Inject the missing values: pair each drawn row index with its drawn column
# index and set those cells to NaN in place.
missing_cells = (missing_samples, missing_features)
x_missing[missing_cells] = np.nan

In [9]:
# Wrap the array in a DataFrame (default integer row and column labels) so the
# pandas null-handling helpers can be used on it, then display it.
x_missing = pd.DataFrame(data=x_missing)
x_missing

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,,,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


下面开始使用随机森林回归来填补缺失值

In [10]:
# Separate frame that the regression-based imputation fills in, so x_missing
# keeps its NaN cells.
x_missing_reg = x_missing.copy(deep=True)

In [11]:
# Count the NaN cells in each column, then order the column positions from the
# fewest to the most missing values. Imputation proceeds in this order.
missing_per_feature = x_missing_reg.isna().sum(axis=0)
sortindex = np.argsort(missing_per_feature).values
sortindex

array([1, 2, 0, 7, 3, 6, 5, 4], dtype=int64)

In [12]:
# Constant-fill imputer: replaces every NaN with 0. Used inside the loop to
# make the predictor columns NaN-free before fitting the forest.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=0,
)

下面是具体的填补算法，从 for 循环开始。
因为这里用的是加利福尼亚房价数据，数据量较大，所以会跑很久：代码运行了 2 分钟左右。
但是如果启用 n_jobs（例如 n_jobs=-1），速度就会快很多。

In [13]:
# Impute one column per pass with a random-forest regressor, visiting columns
# from the fewest to the most missing values (the order held in sortindex).
for i in sortindex:
    # Step 1: the column to fill on this pass and a positional mask of its NaN rows.
    fillc = x_missing_reg.iloc[:, i]
    missing_mask = fillc.isnull().to_numpy()
    if not missing_mask.any():
        # Nothing to impute; predicting on an empty matrix would raise.
        continue

    # Step 2: predictors are every other column plus the original target.
    # Columns filled on earlier passes contribute their imputed values.
    # The target frame reuses x_missing_reg's index so the concat aligns by row position.
    other_columns = x_missing_reg.iloc[:, x_missing_reg.columns != i]
    df = pd.concat([other_columns, pd.DataFrame(y_full, index=x_missing_reg.index)], axis=1)

    # Step 3: NaNs still present in the predictors are replaced with 0.
    df_0 = imputer.fit_transform(df)

    # Step 4: rows where column i is observed (not null) form the training set;
    # rows where it is NaN are the ones to predict. Boolean masks are used so
    # the selection is positional and does not depend on the index labels.
    y_train = fillc[~missing_mask]
    x_train = df_0[~missing_mask, :]
    x_test = df_0[missing_mask, :]

    # Step 5: default forest, all CPU cores; fixed seed (same value as the seed
    # used for rng) so the imputed values are reproducible between runs.
    rfr = RandomForestRegressor(n_jobs=-1, random_state=42)
    rfr.fit(x_train, y_train)
    y_predict = rfr.predict(x_test)

    # Step 6: write the predictions back into the NaN cells of column i.
    x_missing_reg.loc[missing_mask, i] = y_predict

In [14]:
# Display the frame after the imputation loop; the NaN cells should now hold
# the predicted values.
x_missing_reg

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.325200,41.00,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,6.174019,21.00,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.482085,52.00,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,4.536249,52.00,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.846200,52.00,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.560300,25.00,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.556800,18.00,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,2.409685,30.28,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.867200,18.00,5.329513,1.171920,741.0,2.123209,39.43,-121.32
