In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
train_csv = "../input/house-prices-advanced-regression-techniques/train.csv"
test_csv = "../input/house-prices-advanced-regression-techniques/test.csv"
data_description = "../input/house-prices-advanced-regression-techniques/data_description.txt"  # 数据描述
sample_submission = "../input/house-prices-advanced-regression-techniques/sample_submission.csv"  # 样品提交
target = "SalePrice"

In [3]:
# with open(data_description, 'r') as f:
#     print(f.read())

In [4]:
pd.read_csv(sample_submission).head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [5]:
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)
results = {}

In [6]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [9]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

处理流程
- 数据拆分：拆分成数值数据和object数据
- object数据格式化：将object型的数据改为数值标签
- 缺失值填补：
    - object: 众数
    - 数值型：均值
- 模型使用
    - 先将数据分为有缺失的和没缺失的
    - 再用没缺失的去训练，预测有缺失的值

In [10]:
X = train_data.drop(target, axis=1)
y = train_data[target]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=123)

In [11]:
def data_split(df: pd.DataFrame):
    # obj_columns = df.dtypes[df.dtypes=="object"].index
    obj_columns = df.dtypes[(df.dtypes=="object") & (df.nunique() < 10)  & (df.count() > df.shape[0] * 0.2)].index
    num_columns = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
    # df_num = df.drop(obj_columns, axis=1)
    # df_obj = df[obj_columns]
    # df_num = df[num_columns]
    # print("number columns: \n%s" % num_columns)
    # print("object columns: \n%s" % obj_columns)
    # return df_num, df_obj
    return obj_columns, num_columns


obj_columns, num_columns = data_split(train_X)
num_train_X = train_X[num_columns]
obj_train_X = train_X[obj_columns]
num_test_X = test_X[num_columns]
obj_test_X = test_X[obj_columns]

In [12]:
cat_imputer = SimpleImputer(strategy="mean")
imputer_train_X_num = pd.DataFrame(cat_imputer.fit_transform(num_train_X), columns=num_columns)
imputer_test_X_num = pd.DataFrame(cat_imputer.fit_transform(num_test_X), columns=num_columns)

cat_imputer = SimpleImputer(strategy="most_frequent")
imputer_train_X_obj = pd.DataFrame(cat_imputer.fit_transform(obj_train_X), columns=obj_columns)
imputer_test_X_obj = pd.DataFrame(cat_imputer.fit_transform(obj_test_X), columns=obj_columns)

label_encoder = LabelEncoder()
for column in imputer_train_X_obj.columns:
    imputer_train_X_obj[column] = label_encoder.fit_transform(imputer_train_X_obj[column].astype(str))
    
    imputer_test_X_obj[column] = imputer_test_X_obj[column].map(lambda s: '<unknown>' if s not in label_encoder.classes_ else s)
    label_encoder.classes_ = np.append(label_encoder.classes_, '<unknown>')
    
    imputer_test_X_obj[column] = label_encoder.transform(imputer_test_X_obj[column].astype(str))

In [13]:
imputer_train_X = pd.concat([imputer_train_X_num, imputer_train_X_obj], axis=1)
imputer_test_X = pd.concat([imputer_test_X_num, imputer_test_X_obj], axis=1)

In [14]:
rfr = RandomForestRegressor()
rfr.fit(imputer_train_X, train_y)
pred_y = rfr.predict(imputer_test_X)
mae1 = mean_absolute_error(test_y, pred_y)
mae1

17060.15941780822

### 使用测试数据生成提交结果

In [15]:
def data_split(df: pd.DataFrame):
    # obj_columns = df.dtypes[df.dtypes=="object"].index
    obj_columns = df.dtypes[(df.dtypes=="object") & (df.nunique() < 10)  & (df.count() > df.shape[0] * 0.2)].index
    num_columns = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
    # df_num = df.drop(obj_columns, axis=1)
    df_obj = df[obj_columns]
    df_num = df[num_columns]
    # print("number columns: \n%s" % num_columns)
    # print("object columns: \n%s" % obj_columns)
    # return df_num, df_obj
    return obj_columns, num_columns


X = train_data.drop(target, axis=1)
y = train_data[target]
obj_columns, num_columns = data_split(X)
num_train_X = X[num_columns]
obj_train_X = X[obj_columns]
num_test_X = test_data[num_columns]
obj_test_X = test_data[obj_columns]

cat_imputer = SimpleImputer(strategy="mean")
imputer_train_X_num = pd.DataFrame(cat_imputer.fit_transform(num_train_X), columns=num_columns)
imputer_test_X_num = pd.DataFrame(cat_imputer.fit_transform(num_test_X), columns=num_columns)

cat_imputer = SimpleImputer(strategy="most_frequent")
imputer_train_X_obj = pd.DataFrame(cat_imputer.fit_transform(obj_train_X), columns=obj_columns)
imputer_test_X_obj = pd.DataFrame(cat_imputer.fit_transform(obj_test_X), columns=obj_columns)

label_encoder = LabelEncoder()
for column in imputer_train_X_obj.columns:
    imputer_train_X_obj[column] = label_encoder.fit_transform(imputer_train_X_obj[column].astype(str))
    
    imputer_test_X_obj[column] = imputer_test_X_obj[column].map(lambda s: '<unknown>' if s not in label_encoder.classes_ else s)
    label_encoder.classes_ = np.append(label_encoder.classes_, '<unknown>')
    
    imputer_test_X_obj[column] = label_encoder.transform(imputer_test_X_obj[column].astype(str))
    
imputer_train_X = pd.concat([imputer_train_X_num, imputer_train_X_obj], axis=1)
imputer_test_X = pd.concat([imputer_test_X_num, imputer_test_X_obj], axis=1)

rfr = RandomForestRegressor()
rfr.fit(imputer_train_X, y)
pred_y = rfr.predict(imputer_test_X)

In [16]:
output1 = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": pred_y,
})
# output.to_csv('submission.csv', index=False)
output1.head()

Unnamed: 0,Id,SalePrice
0,1461,125297.5
1,1462,152780.25
2,1463,176628.76
3,1464,181377.52
4,1465,199018.68


In [17]:
results["method_1"] = {
    "mae": mae1,
    "output": output1
}

### 尝试用模型对缺失值填补

处理流程
- 数据拆分：拆分成数值数据和object数据
- object数据格式化：将object型的数据改为数值标签
- 缺失值填补：
    - object
        - 对于数据量较小的object，添加一个新标签表示空
        - 对于其他的用随机森林模型的分类模型预测
    - 数值型（目前没有数据量较小的情况）
        - 用随机森林模型的回归模型预测
- 模型使用
    - 先将数据分为有缺失的和没缺失的
    - 再用没缺失的去训练，预测有缺失的值

In [18]:
X = train_data.drop(target, axis=1)
y = train_data[target]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=123)

In [19]:
def data_split(df: pd.DataFrame):
    # obj_columns = df.dtypes[df.dtypes=="object"].index
    obj_columns = df.dtypes[(df.dtypes=="object") & (df.nunique() < 10)  & (df.count() > df.shape[0] * 0.2)].index
    num_columns = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
    # print("number columns: \n%s" % num_columns)
    # print("object columns: \n%s" % obj_columns)
    return obj_columns, num_columns


def find_missing(columns, train_df: pd.DataFrame, test_df: pd.DataFrame):
    missing_train_X_columns = [i for i in columns if train_df[i].isnull().any()]
    missing_test_X_columns = [i for i in columns if test_df[i].isnull().any()]

    missing_X_columns = list(set(missing_train_X_columns + missing_test_X_columns))
    fill_X_columns = list(set(columns) - set(missing_X_columns))
    return missing_X_columns, fill_X_columns

In [20]:
obj_columns, num_columns = data_split(train_X)
missing_obj_X_columns, fill_obj_X_columns = find_missing(obj_columns, train_X, test_X)
missing_num_X_columns, fill_num_X_columns = find_missing(num_columns, train_X, test_X)
fill_X_columns = fill_obj_X_columns + fill_num_X_columns
all_columns = fill_X_columns + missing_obj_X_columns + missing_num_X_columns

In [21]:
def fill_missing(missing_cols, fill_cols, train_df: pd.DataFrame, test_df: pd.DataFrame, is_obj=False):
    """填充缺失值，使用逻辑回归填充"""
    df = train_df.copy()
    df2 = test_df.copy()
    for column in missing_cols:
        temp_train_df = df.loc[df[column].notnull()]
        temp_target_df = df.loc[df[column].isnull()]
        
        temp_test_df = df2.loc[df2[column].notnull()]
        temp_test_target_df = df2.loc[df2[column].isnull()]
        
        if is_obj:
            temp_train_df, temp_test_df = label_encoder(column, temp_train_df, temp_test_df)  # object型数据转成数值标签
            model = RandomForestClassifier()  # 标签数据用分类模型处理
        else:
            model = RandomForestRegressor()  # 数值数据用回归模型处理
        
        # model = LogisticRegression()
        model.fit(temp_train_df[fill_cols], temp_train_df[column])
        
        if temp_target_df.shape[0] != 0:
            temp_target_df[column] = model.predict(temp_target_df[fill_cols])
        if temp_test_target_df.shape[0] != 0:
            temp_test_target_df[column] = model.predict(temp_test_target_df[fill_cols])
        
        df = temp_train_df.append(temp_target_df)
        df2 = temp_test_df.append(temp_test_target_df)
        # print(df[[column]].info())
    return df, df2


def label_encoder(column, df: pd.DataFrame, test_df: pd.DataFrame):
    """object型数据转成数值标签"""
    # temp_df = df.copy()
    # temp_test_df = test_df.copy()
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column].astype(str))
    # print(label_encoder.classes_)

    test_df[column] = test_df[column].map(lambda s: '<unknown>' if s not in label_encoder.classes_ else s)
    label_encoder.classes_ = np.append(label_encoder.classes_, '<unknown>')

    test_df[column] = label_encoder.transform(test_df[column].astype(str))
    # return temp_df, temp_test_df
    return df, test_df

    
def label_encoders(columns, train_df, test_df):
    temp_df = train_df.copy()
    temp_test_df = test_df.copy()
    if isinstance(columns, (str, int)):
        columns = [columns]
    for column in columns:
        label_encoder(column, temp_df, temp_test_df)
    return temp_df, temp_test_df

In [22]:
label_obj_train_X, label_obj_test_X = label_encoders(fill_obj_X_columns, train_X, test_X)
fill_num_train_X, fill_num_test_X = fill_missing(
    missing_num_X_columns, fill_X_columns, 
    label_obj_train_X, label_obj_test_X, 
    is_obj=False)
fill_all_train_X, fill_all_test_X = fill_missing(
    missing_obj_X_columns, fill_X_columns, 
    fill_num_train_X, fill_num_test_X, 
    is_obj=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [23]:
rfr = RandomForestRegressor()
rfr.fit(fill_all_train_X[all_columns], train_y)
pred_y = rfr.predict(fill_all_test_X[all_columns])
mae2 = mean_absolute_error(test_y, pred_y)
mae2

59075.06760273972

### 使用测试数据生成提交结果

In [24]:
train_X = train_data.drop(target, axis=1)
train_y = train_data[target]
test_X = test_data.copy()

obj_columns, num_columns = data_split(train_X)
missing_obj_X_columns, fill_obj_X_columns = find_missing(obj_columns, train_X, test_X)
missing_num_X_columns, fill_num_X_columns = find_missing(num_columns, train_X, test_X)
fill_X_columns = fill_obj_X_columns + fill_num_X_columns
all_columns = fill_X_columns + missing_obj_X_columns + missing_num_X_columns

label_obj_train_X, label_obj_test_X = label_encoders(fill_obj_X_columns, train_X, test_X)
fill_num_train_X, fill_num_test_X = fill_missing(
    missing_num_X_columns, fill_X_columns, 
    label_obj_train_X, label_obj_test_X, 
    is_obj=False)
fill_all_train_X, fill_all_test_X = fill_missing(
    missing_obj_X_columns, fill_X_columns, 
    fill_num_train_X, fill_num_test_X, 
    is_obj=True)

rfr = RandomForestRegressor()
rfr.fit(fill_all_train_X[all_columns], train_y)
pred_y = rfr.predict(fill_all_test_X[all_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [25]:
output2 = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": pred_y,
})
# output.to_csv('submission.csv', index=False)
output2

Unnamed: 0,Id,SalePrice
0,1461,179488.43
1,1462,182579.95
2,1463,196635.04
3,1464,169680.64
4,1465,185515.67
...,...,...
1454,2915,200119.36
1455,2916,210837.02
1456,2917,194421.55
1457,2918,200922.09


In [26]:
results["method_2"] = {
    "mae": mae2,
    "output": output2
}

In [27]:
for k, v in results.items():
    print(k, v['mae'])

method_1 17060.15941780822
method_2 59075.06760273972


### 返回结果

In [29]:
results["method_1"]["output"].to_csv('submission.csv', index=False)

### 总结
模型预测缺失值的方法并效果很差，原因是未深入理解各个参数的意义，有些参数并不适合用模型预测，如参数数据量较少的，或者与其他数据明显无线性关系的，但该结果也在意料之内，这么做主要也是为了尝试用该方法解题，如真要这样做，还需提前研究各个参数之间的相关性，再考虑用何种方式填充缺失值。