In [1]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import kruskalwallis
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.genmod.families import Binomial
from statsmodels.imputation import mice
from scipy import stats

## 对服从正态分布的数据线进行方差齐性检验，检验通过了再进行方差分析
def check_normality(data):
    #Anderson-Darling test 检验   (5 ≤ N ≤ 25)
    if data.shape[0] < 7:
        statistic1, critical_values1, significance_level1 = stats.anderson(data, dist='norm')
        if critical_values1[2] > 0.05:
            result='正态分布'
        else:
            result='非正态分布'

    #Shapiro-Wilk算法检验正态分布性（官方文档的说明是大于5000的样本时，p值可能不准） (7 ≤ N ≤ 2000)
    elif data.shape[0] <= 2000 and data.shape[0] >= 7:
        statistic = stats.shapiro(data)
        if statistic[1] > 0.05:
            result = '正态分布'
        else:
            result = '非正态分布'

    else:
    # （Kolmogorov-Smirnov test ，样本量大于2000时适用）（N ＞ 2000）
        statistic = stats.kstest(data, 'norm')
        name3 = 'Kolmogorov-Smirnov'
        if statistic[1] > 0.05:
            result = '正态分布'
        else:
            result = '非正态分布'

    return result


def f_test(df, exog, endog):
    df['R'] = np.where(df[endog].isnull(), 0, 1)
    args = [list(df[df['R']==0][exog].dropna()), list(df[df['R']==1][exog].dropna())]
    if len(set(df[exog].dropna())) > 5:
        # exog 是连续的情况  
        result = check_normality(df[exog].dropna())
        if result == '正态分布':
            W, p0 = stats.levene(*args)
            method_ = 't检验'
            if p0 < 0.05:
                s1, p1 = stats.f_oneway(*args)
            else:
                p1 = 0.05
        else:
            s1, p1 = kruskalwallis(*args)
            method_ = '非参检验'

        if p1 < 0.05:
#             print('%s检验显著，%s的缺失机制不是MCAR（完全随机缺失）'%(exog, endog))
            result = '缺失机制为：非完全随机缺失(非MCAR)'
        else:
#             print('通过%s检验不显著，%s的缺失机制是MCAR（完全随机缺失）'%(exog, endog))
            result = '缺失机制为：完全随机缺失(MCAR)'

    else: # 卡方检验真的可以吗？不如使用模型
        result, method_ = logit_model(df, exog, endog)
    return result, method_


def logit_model_inner(data, exog, endog):
    formula = "R ~ 1"
    model = glm(formula, data=data, family=Binomial()).fit()
    if model.pvalues.max() < 0.05:
        result = '缺失机制为：完全随机缺失(MCAR)'
    else:
        from statsmodels.imputation import mice
        imp = mice.MICEData(data)
        imp.set_imputer(endog, formula='1')
        imp.update_all(10)
        data[endog+'_f'] = imp.data[endog]
        data[endog+'_fs'] = data[endog+'_f'] ** 2
        formula = "R ~ %s + %s + 1"%(endog+'_fs', endog+'_f')
        model = glm(formula, data=data, family=Binomial()).fit()
        if model.pvalues.max() < 0.05:
            result = '缺失机制为：非随机缺失(NMAR)'
        else:
            formula = "R ~ %s + 1"%(endog+'_f')
            model = glm(formula, data=data, family=Binomial()).fit()
            if model.pvalues.max() < 0.05:
                result = '缺失机制为：非随机缺失(NMAR)'
            else:
                result = '无法根据%s判断%s的缺失机制'%(exog, endog)
    return result


def logit_model(df, exog, endog):    
    data = df[[exog, endog]]
    data['R'] = np.where(df[endog].isnull(), 0, 1)
    data[exog+'_sq'] = data[exog] ** 2
    try:  # 防止自变量与因变量完全分离的情况的出现
        formula = "R ~ %s + %s + 1"%(exog+'_sq', exog)
        model = glm(formula, data=data, family=Binomial()).fit()
        if model.pvalues.max() < 0.05:
            result = '缺失机制为：随机缺失(MAR)'
        else:
            result = logit_model_inner(data, exog, endog)
    except:
        result = logit_model_inner(data, exog, endog)
    method_ = 'Logit法'
    return result, method_


def judge_mechanism(df, exog, endog):
    data = df.copy()

    # 需要看看目标变量有没有缺失
    if data[[endog]].dropna().shape[0] == data.shape[0]:
        print('目标变量不含缺失值，不需要做缺失变量检验')
    
    elif data[[endog]].dropna().shape[0] == 0:
        print('目标变量全部缺失，不需要做缺失变量检验')
        
    else:
        # 先判断单调，若为单调缺失则不用做检验直接下结论
        data['R'] = np.where(data[endog].isnull(), 0, 1)
        if list(set(data['R'] == sorted(data['R'], reverse=True))) == [True] or list(set(data['R'] == sorted(data['R']))) == [True]:
            dd_ = pd.DataFrame([[endog, '-','单调缺失', '随机缺失(MAR)']], columns = ['目标变量', '辅助变量', '检验方法', '目标变量-缺失值机制判断'])
            dd = dd_

        # 否则做检验
        else:
            dd = []
            for i in range(len(exog)):
                # 必须使用完全数据作为辅助变量进行判断
                if data[['R', exog[i]]].dropna().shape[0] != data.shape[0]:
                    print('%s为不完备数据，不能作为辅助变量对目标缺失变量进行检验'%exog[i])
                elif len(set(data[exog[i]])) == 1:
                    print('%s为常量，不能作为辅助变量对目标缺失变量进行检验'%exog[i])
                else:
                    result, method_ = f_test(data, exog[i], endog)
                    if result == '缺失机制为：非完全随机缺失(非MCAR)':
                        result, method_ = logit_model(data, exog[i], endog)
                        if result == '无法根据%s判断%s的缺失机制'%(exog[i], endog):
                            result = '缺失机制为：非完全随机缺失(非MCAR)'
                    elif result == '无法根据%s判断%s的缺失机制'%(exog[i], endog):
                        result = '缺失机制为：非完全随机缺失(非MCAR)'
                    dd_ = pd.DataFrame([[endog, exog[i], method_, result.split('：')[-1]]], columns = ['目标变量', '辅助变量', '检验方法', '目标变量-缺失值机制判断'])
                    dd.append(dd_) 
            dd = pd.concat(dd, axis=0)
    return dd


def judge_na(df_replace, exog, endog):
    df_mechanism = []
    for i in range(len(endog)):
        if str(df_replace[endog[i]].dtype) == 'object':
            print('变量为字符型，不能进行缺失值机制判断')
        else:
            df_mechanism_ = judge_mechanism(df_replace, exog, endog[i])
            df_mechanism.append(df_mechanism_)
    df_mechanism = pd.concat(df_mechanism, axis=0)
    return df_mechanism

## 读取数据

1) csv 数据读取格式：

pd.read_csv("xxx.csv")

pd.read_csv("xxx.csv", encoding='gb18030')

pd.read_csv("xxx.csv", encoding='utf-8')

如有需要，添加参数 engine='python'

2) xlsx读取格式：

pd.read_excel("xxx.xlsx")

In [2]:
df = pd.read_excel("test_data2.xlsx")
df

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,,1.0,0,0.0,0.0,,66.0,67.0,1.0,1.0,2.0,,,1,,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,,67.0,1.0,,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,,1.0,0,0.0,0.0,103.0,,85.0,0.0,1.0,3.0,,,1,,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,,,,,1,,,C
5,6,,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,,,58.0,80.0,12.0,,,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,,,,43.0,1.0,1,,0.0,D
8,9,9.0,1,,0.0,0,0.0,1.0,,,,,,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,,,,,,,,,,,1,否,1.0,B


## 直接调用mian函数
### 参数说明
**df_replace**: 数据集 (pd.DataFrame)

**exog**: 不含缺失值的变量，用于辅助判断(list)

**endog**: 需要判断缺失机制的变量 (list)

In [3]:
df_replace = df.copy()
exog = ['序号', 'X2', 'X5']
endog = ['X1', 'X3', 'Y', 'Time']

In [4]:
judge_na(df_replace, exog, endog)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,目标变量,辅助变量,检验方法,目标变量-缺失值机制判断
0,X1,序号,t检验,完全随机缺失(MCAR)
0,X1,X2,Logit法,完全随机缺失(MCAR)
0,X1,X5,Logit法,完全随机缺失(MCAR)
0,X3,序号,t检验,完全随机缺失(MCAR)
0,X3,X2,Logit法,非完全随机缺失(非MCAR)
0,X3,X5,Logit法,非完全随机缺失(非MCAR)
0,Y,序号,t检验,完全随机缺失(MCAR)
0,Y,X2,Logit法,完全随机缺失(MCAR)
0,Y,X5,Logit法,完全随机缺失(MCAR)
0,Time,序号,t检验,完全随机缺失(MCAR)


## 缺失数据填充

In [5]:
# 把df数据copy出来，下面的一切处理均在df_fillna数据集里面
df_fillna = df.copy()

In [6]:
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,,1.0,0,0.0,0.0,,66.0,67.0,1.0,1.0,2.0,,,1,,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,,67.0,1.0,,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,,1.0,0,0.0,0.0,103.0,,85.0,0.0,1.0,3.0,,,1,,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,,,,,1,,,C
5,6,,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,,,58.0,80.0,12.0,,,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,,,,43.0,1.0,1,,0.0,D
8,9,9.0,1,,0.0,0,0.0,1.0,,,,,,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,,,,,,,,,,,1,否,1.0,B


#### 均值填充

In [7]:
# 单个的情况
df_fillna['X1'] = df_fillna['X1'].fillna(df_fillna['X1'].mean())
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,,1.0,0,0.0,0.0,,66.0,67.0,1.0,1.0,2.0,,,1,,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,,67.0,1.0,,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,,1.0,0,0.0,0.0,103.0,,85.0,0.0,1.0,3.0,,,1,,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,,,,,1,,,C
5,6,10.5,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,,,58.0,80.0,12.0,,,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,,,,43.0,1.0,1,,0.0,D
8,9,9.0,1,,0.0,0,0.0,1.0,,,,,,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,,,,,,,,,,,1,否,1.0,B


In [8]:
# 一次填充多个, 只需要将变量名放入subsets参数中
subsets = ['Time', 'Y1']
for i in range(len(subsets)):
    df_fillna[subsets[i]] = df_fillna[subsets[i]].fillna(df_fillna[subsets[i]].mean())
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,,1.0,0,0.0,0.0,61.0625,66.0,67.0,1.0,1.0,2.0,,,1,,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,77.166667,67.0,1.0,,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,,1.0,0,0.0,0.0,103.0,77.166667,85.0,0.0,1.0,3.0,,,1,,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,,,,,1,,,C
5,6,10.5,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,,,58.0,80.0,12.0,,,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,,,,43.0,1.0,1,,0.0,D
8,9,9.0,1,,0.0,0,0.0,1.0,61.0625,77.166667,,,,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,,,61.0625,77.166667,,,,,,,1,否,1.0,B


####  中位数填充

In [9]:
subsets = ['Y2', 'Y3']
for i in range(len(subsets)):
    df_fillna[subsets[i]] = df_fillna[subsets[i]].fillna(df_fillna[subsets[i]].median())
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,,1.0,0,0.0,0.0,61.0625,66.0,67.0,1.0,1.0,2.0,,,1,,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,77.166667,67.0,1.0,,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,,1.0,0,0.0,0.0,103.0,77.166667,85.0,0.0,1.0,3.0,,,1,,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,,,,,1,,,C
5,6,10.5,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,,,58.0,80.0,12.0,0.5,,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,0.5,,,43.0,1.0,1,,0.0,D
8,9,9.0,1,,0.0,0,0.0,1.0,61.0625,77.166667,67.0,0.5,,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,,,61.0625,77.166667,67.0,0.5,,,,,1,否,1.0,B


#### 众数填充

In [10]:
subsets = ['X3', 'X4', 'X6', 'Y']
for i in range(len(subsets)):
    df_fillna[subsets[i]] = df_fillna[subsets[i]].fillna(df_fillna[subsets[i]].mode().iloc[0]) # 可能有多个众数，默认取第一个
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,0.0,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,0.0,1.0,0,0.0,0.0,61.0625,66.0,67.0,1.0,1.0,2.0,,,1,,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,77.166667,67.0,1.0,,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,0.0,1.0,0,0.0,0.0,103.0,77.166667,85.0,0.0,1.0,3.0,,,1,,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,,,,,1,,,C
5,6,10.5,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,0.0,0.0,58.0,80.0,12.0,0.5,,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,0.5,,,43.0,1.0,1,,0.0,D
8,9,9.0,1,0.0,0.0,0,0.0,1.0,61.0625,77.166667,67.0,0.5,,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,0.0,0.0,61.0625,77.166667,67.0,0.5,,,,,1,否,1.0,B


#### 上一行的值填充

In [11]:
subsets = ['Y4', 'Z1']
for i in range(len(subsets)):
    df_fillna[subsets[i]] = df_fillna[subsets[i]].fillna(method='ffill') # 可能有多个众数，默认取第一个
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,0.0,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,0.0,1.0,0,0.0,0.0,61.0625,66.0,67.0,1.0,1.0,2.0,,,1,是,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,77.166667,67.0,1.0,1.0,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,0.0,1.0,0,0.0,0.0,103.0,77.166667,85.0,0.0,1.0,3.0,,,1,否,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,1.0,,,,1,否,,C
5,6,10.5,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,1.0,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,0.0,0.0,58.0,80.0,12.0,0.5,1.0,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,0.5,1.0,,43.0,1.0,1,否,0.0,D
8,9,9.0,1,0.0,0.0,0,0.0,1.0,61.0625,77.166667,67.0,0.5,1.0,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,0.0,0.0,61.0625,77.166667,67.0,0.5,1.0,,,,1,否,1.0,B


#### 下一行的值填充

In [12]:
subsets = ['Y5', 'Y6']
for i in range(len(subsets)):
    df_fillna[subsets[i]] = df_fillna[subsets[i]].fillna(method='bfill') # 可能有多个众数，默认取第一个
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,0.0,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,45.0,,1,是,,A
1,2,2.0,1,0.0,1.0,0,0.0,0.0,61.0625,66.0,67.0,1.0,1.0,2.0,45.0,,1,是,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,77.166667,67.0,1.0,1.0,3.0,45.0,4.0,1,否,1.0,D
3,4,3.0,1,0.0,1.0,0,0.0,0.0,103.0,77.166667,85.0,0.0,1.0,3.0,55.0,,1,否,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,1.0,9.0,55.0,,1,否,,C
5,6,10.5,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,1.0,9.0,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,0.0,0.0,58.0,80.0,12.0,0.5,1.0,9.0,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,0.5,1.0,9.0,43.0,1.0,1,否,0.0,D
8,9,9.0,1,0.0,0.0,0,0.0,1.0,61.0625,77.166667,67.0,0.5,1.0,9.0,89.0,,1,是,,A
9,10,10.0,0,0.0,1.0,0,0.0,0.0,61.0625,77.166667,67.0,0.5,1.0,11.0,89.0,,1,否,1.0,B


#### 自定义填充

In [13]:
df_fillna['Y7'] = df_fillna['Y7'].fillna(999)
df_fillna['Z2'] = df_fillna['Z2'].fillna(1)
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,0.0,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,45.0,999.0,1,是,1.0,A
1,2,2.0,1,0.0,1.0,0,0.0,0.0,61.0625,66.0,67.0,1.0,1.0,2.0,45.0,999.0,1,是,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,77.166667,67.0,1.0,1.0,3.0,45.0,4.0,1,否,1.0,D
3,4,3.0,1,0.0,1.0,0,0.0,0.0,103.0,77.166667,85.0,0.0,1.0,3.0,55.0,999.0,1,否,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,1.0,9.0,55.0,999.0,1,否,1.0,C
5,6,10.5,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,1.0,9.0,55.0,3.0,1,是,1.0,B
6,7,7.0,0,1.0,1.0,0,0.0,0.0,58.0,80.0,12.0,0.5,1.0,9.0,64.0,89.0,1,否,1.0,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,0.5,1.0,9.0,43.0,1.0,1,否,0.0,D
8,9,9.0,1,0.0,0.0,0,0.0,1.0,61.0625,77.166667,67.0,0.5,1.0,9.0,89.0,999.0,1,是,1.0,A
9,10,10.0,0,0.0,1.0,0,0.0,0.0,61.0625,77.166667,67.0,0.5,1.0,11.0,89.0,999.0,1,否,1.0,B


### 多重插补（建议使用R包操作）

In [14]:
df_fillna = df.copy()
df_fillna

Unnamed: 0,序号,X1,X2,X3,X4,X5,X6,Y,Time,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Z1,Z2,Z3
0,1,1.0,1,,1.0,1,0.0,0.0,-52.0,67.0,67.0,1.0,1.0,1.0,,,1,是,,A
1,2,2.0,1,,1.0,0,0.0,0.0,,66.0,67.0,1.0,1.0,2.0,,,1,,1.0,B
2,3,3.0,2,1.0,1.0,0,0.0,0.0,35.0,,67.0,1.0,,,45.0,4.0,1,否,1.0,D
3,4,3.0,1,,1.0,0,0.0,0.0,103.0,,85.0,0.0,1.0,3.0,,,1,,1.0,A
4,5,5.0,0,0.0,1.0,1,1.0,0.0,7.0,87.0,82.0,0.0,,,,,1,,,C
5,6,,0,1.0,1.0,0,1.0,0.0,60.0,76.0,81.0,0.0,,,55.0,3.0,1,是,,B
6,7,7.0,0,1.0,1.0,0,,,58.0,80.0,12.0,,,,64.0,89.0,1,否,,D
7,8,8.0,0,1.0,1.0,1,0.0,0.0,29.0,87.0,19.0,,,,43.0,1.0,1,,0.0,D
8,9,9.0,1,,0.0,0,0.0,1.0,,,,,,9.0,,,1,是,,A
9,10,10.0,0,0.0,1.0,0,,,,,,,,,,,1,否,1.0,B


In [15]:
def mice_(df, exog, endog, k):
    data = df[exog+[endog]]
    imp = mice.MICEData(data)
    # 指定插补模型 （除endog外的变量使用OLS插补，endog使用指定模型插补）
    # imp.set_imputer(endog, formula = "{}+1".format(' + '.join(exog)))
    imp.update_all(k) # k次插补
    return imp.data

In [16]:
# 自变量
exog =['X1', 'X2', 'X3', 'X4']
# 因变量
endog = 'Time'
k = 5
mice_(df_fillna, exog, endog, k)

Unnamed: 0,X1,X2,X3,X4,Time
0,1.0,1,1.0,1.0,-52.0
1,2.0,1,0.0,1.0,82.0
2,3.0,2,1.0,1.0,35.0
3,3.0,1,1.0,1.0,103.0
4,5.0,0,0.0,1.0,7.0
5,16.0,0,1.0,1.0,60.0
6,7.0,0,1.0,1.0,58.0
7,8.0,0,1.0,1.0,29.0
8,9.0,1,1.0,0.0,66.0
9,10.0,0,0.0,1.0,101.0
