### 预测数据处理

* 将数据转换成preprocess形状
* 填补空缺项（分类列填missing，数值列填nan）
* 剩余的分类列做one-hot编码
* 对数据进行标准化处理

In [58]:
import numpy as np
import pandas as pd
import joblib
import pickle
from xgboost import XGBClassifier

In [59]:
# Load the fitted preprocessing artifacts: the imputer and scaler saved with
# joblib, plus the pickled list of column names the scaler is applied to.
# NOTE(review): joblib.load / pickle.load can execute arbitrary code while
# deserializing; only load artifacts produced by a trusted training run.
imputer = joblib.load('./model/imputer.joblib')
scaler = joblib.load('./model/scaler.joblib')
with open('./model/NumericCols.pkl', 'rb') as f:
    NumericCols = pickle.load(f)

# Read the raw records to predict on.
input_data = pd.read_excel('./predict/predict_data.xlsx')
# Read only the header (nrows=0) of the preprocessed file; its column names
# decide which input columns are kept.
preprocessed_data = pd.read_csv('./predict/preprocessed.csv', nrows=0)

print(input_data.shape)

# Drop every column that is absent from the preprocessed header in a single
# call, instead of rebuilding the whole frame once per dropped column.
extra_columns = [
    field for field in input_data.columns
    if field not in preprocessed_data.columns
]
input_data = input_data.drop(columns=extra_columns)

print(preprocessed_data.shape)

# Add the "habitable" column as all-NaN, then index the rows by rowid.
# NOTE(review): presumably the imputer was fitted with this column present,
# which is why it is added before imputation -- confirm against training.
input_data['habitable'] = np.nan
input_data = input_data.set_index("rowid")

# Fill missing values in the numeric columns. The column set is computed
# once and reused on both sides of the assignment.
# NOTE(review): _get_numeric_data() is a private pandas API; it is kept so
# that the selected columns stay exactly what they were before.
numeric_columns = input_data._get_numeric_data().columns
input_data[numeric_columns] = imputer.transform(input_data[numeric_columns])

# Standardize the columns listed in NumericCols.
input_data[NumericCols] = scaler.transform(input_data[NumericCols])

# to_csv writes in 'w' mode by default, which truncates an existing file,
# so no separate "open and close to clear the file" step is needed.
input_data.to_csv('./predict/predict_data_clearify.csv')

(2, 356)
(0, 97)


### 预测

In [60]:
# Load the trained XGBoost classifier from disk.
model = XGBClassifier()
model.load_model('./model/xgb.model')

# Read the preprocessed records back and drop the columns listed in
# `remove`, leaving the columns that are passed to the model.
test = pd.read_csv('./predict/predict_data_clearify.csv')
remove = ['rowid', 'habitable']
test = test.drop(remove, axis=1)

# Predict all rows with one batched call instead of one call per row.
# NOTE(review): this relies on the model scoring each row independently,
# which is the documented behaviour of XGBoost prediction -- confirm if a
# custom model is ever swapped in.
test_x = test.to_numpy()
if len(test_x):  # with no rows, skip predict and print only the newline
    all_preds = model.predict(test_x)
    # Inserting an axis makes each iteration yield a length-1 array, so every
    # prediction prints in the same "[label]" form as a single-row predict.
    for y_preds in all_preds[:, np.newaxis]:
        print(y_preds, end=' ')
print()

[0] [0] 
