In [2]:
import os
import numpy as np
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from string import punctuation
from tqdm.notebook import tqdm
from collections import Counter
import pickle

### 读取数据

In [3]:
# Load every training table under data/train/ and build the feature matrix X
# and target vector Y used by the cells below.
#
# Excel and CSV files use different header names. In both lists the first
# nine entries are the features (in model input order) and the last entry,
# 'WI', is the regression target.
excel_columns = [
    '草地', '城市', '林地', '裸地', '湿地', '水体', 'NTL_mean', 'dem', 'road', 'WI'
]
csv_columns = [
    '草地面积', '城市面积', '林地面积', '裸地面积', '湿地面积', '水体面积', 'NTL_mean',
    'dem', 'road_near', 'WI'
]

x = []
y = []
for root, dirs, files in os.walk('data/train/', topdown=True):
    # os.walk order is filesystem dependent; sorting keeps the row order (and
    # therefore the seeded train/test split further down) reproducible.
    dirs.sort()
    for file_name in sorted(files):
        # Decide by real extension rather than by substring, so that e.g.
        # 'xls_notes.txt' is not handed to the Excel reader.
        extension = os.path.splitext(file_name)[1].lower()
        if extension in ('.xls', '.xlsx'):
            columns, reader = excel_columns, pd.read_excel
        elif extension == '.csv':
            columns, reader = csv_columns, pd.read_csv
        else:
            continue
        # Rows missing any feature or the target are dropped.
        data = reader(os.path.join(root, file_name)).dropna(subset=columns)
        x.append(np.array(data[columns[:-1]]))
        y.append(np.array(data[columns[-1:]]))
if not x:
    raise FileNotFoundError(
        "No .xls/.xlsx/.csv training files found under 'data/train/'")
x = np.concatenate(x, axis=0)
y = np.concatenate(y, axis=0)

# Keep only rows whose features and target can all be read as numbers; rows
# with non-numeric cells (e.g. stray text) are skipped.
X = []
Y = []
for feature_row, target_row in zip(x, y):
    try:
        feature_row = np.asarray(feature_row, dtype='float32')
        # The target is converted as well, so string/object cells cannot
        # leak into Y; float64 keeps the precision of numeric input.
        target_row = np.asarray(target_row, dtype='float64')
    except (TypeError, ValueError):
        # Only conversion failures are skipped; anything else propagates.
        continue
    X.append(feature_row)
    Y.append(target_row)
X = np.array(X, dtype='float32')
# Each target_row has length 1, so this flattens Y to shape (n_rows,).
Y = np.concatenate(Y, axis=0)

In [4]:
# Sanity check: X and Y must have the same number of rows.
(X.shape, Y.shape)

((1239, 9), (1239,))

### 计算评估指标

In [5]:
def compute_metrics(labels, preds):
    """Print the MSE and MAE of ``preds`` measured against ``labels``.

    Both metrics are computed first and then printed, one per line, as
    ``MSE: <value>`` followed by ``MAE: <value>``. Nothing is returned.
    """
    scores = {
        'MSE': mean_squared_error(labels, preds),
        'MAE': mean_absolute_error(labels, preds),
    }
    for tag, value in scores.items():
        print(f'{tag}: {value}')

### 寻找模型最优超参

In [6]:
def get_best_hyperparam(train_x, train_y, random_state=None):
    """Build and fit the stacking regressor on the training data.

    Despite the name, no hyper-parameter search is run here: the function
    fits one fixed configuration. The base learners are a decision tree, a
    random forest and an XGBoost regressor; their predictions are combined
    by a random-forest final estimator.

    Args:
        train_x: Training feature matrix.
        train_y: Training targets.
        random_state: Optional seed passed to every estimator so a fit can
            be reproduced. The default ``None`` leaves them unseeded, as
            before.

    Returns:
        The fitted ``StackingRegressor``.
    """
    # max_features=1.0 (use all features) replaces the former 'auto', which
    # meant the same thing for regressors but was deprecated in
    # scikit-learn 1.1 and removed in 1.3.
    estimators = [('dt',
                   DecisionTreeRegressor(criterion='poisson',
                                         max_features='sqrt',
                                         random_state=random_state)),
                  ('rf',
                   RandomForestRegressor(criterion='poisson',
                                         max_features=1.0,
                                         random_state=random_state)),
                  ('xgb',
                   XGBRegressor(max_depth=10,
                                n_estimators=20,
                                random_state=random_state))]
    model = StackingRegressor(estimators=estimators,
                              final_estimator=RandomForestRegressor(
                                  criterion='poisson',
                                  max_features=1.0,
                                  random_state=random_state))
    model.fit(train_x, train_y)
    return model

### 训练和测试

In [7]:
# 训练和测试
def train_and_eval(train_x, train_y, test_x, test_y, flag):
    """Load or train the model, then print its metrics on the test split.

    If ``<flag>_model.pkl`` exists in the working directory it is loaded and
    no training happens. Otherwise a model is fitted with
    ``get_best_hyperparam`` and pickled to that path.

    Args:
        train_x: Training feature matrix (used only when no cache exists).
        train_y: Training targets (used only when no cache exists).
        test_x: Feature matrix to evaluate on.
        test_y: Targets to evaluate against.
        flag: Prefix of the cache file name, ``flag + '_model.pkl'``.

    Returns:
        The loaded or freshly trained model.
    """
    model_path = flag + '_model.pkl'
    if os.path.exists(model_path):
        print('Model founded. Loading...')
        # NOTE(review): pickle.load executes arbitrary code, so only load
        # files you created yourself. A cached model may also have been
        # trained on a different split than the one passed in; delete the
        # file to force retraining.
        with open(model_path, 'rb') as f:
            clf = pickle.load(f)
    else:
        print('No model founded. Creating...')
        # Train only on a cache miss; fitting unconditionally would waste a
        # full stacking fit whose result is thrown away when a cache exists.
        clf = get_best_hyperparam(train_x, train_y)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
    # Evaluate on the test split
    pred = clf.predict(test_x)
    compute_metrics(test_y, pred)
    return clf

### 训练集验证集划分

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# stable for a given row order of X and Y.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2, random_state=1)

### 训练以及验证

In [9]:
# Obtain the model through train_and_eval (cached as 'Stacking_model.pkl')
# and print its MSE/MAE on the held-out split.
model = train_and_eval(train_x, train_y, test_x, test_y, 'Stacking')

Model founded. Loading...
MSE: 0.5920501442085196
MAE: 0.6007880883868988


### 测试

In [10]:
# Predict WI for every Excel table under 数据/2013-19/ and write each table,
# with a 'WI' column holding the predictions, into the 'Stacking' directory.
test_X = []
pred_dir = 'Stacking'
os.makedirs(pred_dir, exist_ok=True)
for root, dirs, files in os.walk('数据/2013-19/', topdown=True):
    for file_name in files:
        # Match on the real extension instead of the substring 'xls'.
        if os.path.splitext(file_name)[1].lower() not in ('.xls', '.xlsx'):
            continue
        data = pd.read_excel(os.path.join(root, file_name))
        # Feature order must match the order used for training.
        columns = [
            '草地面积', '城市面积', '林地面积', '裸地面积', '湿地面积', '水体面积', 'NTL_mean',
            'dem'
        ]
        # The road-distance column is named 'road' in some tables and
        # 'road_near' in others.
        columns.append('road' if 'road' in data else 'road_near')
        data = data.dropna(subset=columns)
        if data.empty:
            # model.predict rejects an input with zero rows.
            print(f'Skipping {file_name}: no rows with complete features')
            continue
        test_X = np.array(data[columns])
        pred = model.predict(test_X)
        data = data.assign(WI=pred)
        # The `encoding` argument was dropped: it was removed from
        # DataFrame.to_excel in pandas 2.0 and raises TypeError there.
        data.to_excel(os.path.join(pred_dir, file_name + '.pred.xlsx'),
                      index=False)