# 树模型

In [1]:
import sys
sys.path.append('/Users/apple/Documents/ML_Project/ML - 2.1/')
import numpy as np
import pandas as pd
from module.utils import *
from ngboost.learners import *
from tqdm.notebook import tqdm as tqdm
from sklearn.metrics.regression import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='retina'

  import pandas.util.testing as tm


In [2]:
def tree_model_plot1(best_param):
    """Refit a decision tree with ``best_param`` and plot predictions vs. test targets.

    Parameters
    ----------
    best_param : dict-like
        Must provide the keys ``'hour_num'``, ``'transform'``, ``'drop_time'``
        and ``'max_depth'`` (for example one row of the frame produced by
        ``tree_grid_search1``, converted to a dict).

    Returns
    -------
    None
        The line plot is drawn as a side effect; its title shows the test MSE
        (rounded to 8 decimals) and the tree depth.
    """
    # The grid search relabels a missing transform as the string 'None' for
    # display purposes; turn that label (or a pandas missing value) back into
    # the real None that the search itself passed to get_data.
    transform = best_param['transform']
    if transform == 'None' or pd.isna(transform):
        transform = None

    # Values read back from a DataFrame row may be floats; the estimator
    # expects an integer depth (or None for an unbounded tree).
    max_depth = best_param['max_depth']
    if max_depth is not None:
        max_depth = int(max_depth)

    X_train, X_test, Y_train, Y_test, _ = get_data(
        hour_num=int(best_param['hour_num']),
        transform=transform,
        # Fixed: this previously read best_param['transform'], so the plotted
        # model ignored the drop_time value selected by the search.
        drop_time=bool(best_param['drop_time']),
        scale=True, return_y_scaler=True)

    model = DecisionTreeRegressor(max_depth=max_depth)
    Y_pred = model.fit(X_train, Y_train).predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)

    # Align predictions with the test index so both series share one x-axis.
    comparison = pd.concat(
        [pd.Series(Y_pred, name='Pred', index=Y_test.index), Y_test], axis=1)
    comparison.plot(title=f"mse:{round(mse, 8)}   depth:{max_depth}",
                    figsize=(12, 5))

In [3]:
def tree_model_plot2(best_param):
    """Refit a decision tree on ``get_data2`` data and plot predictions vs. test targets.

    Parameters
    ----------
    best_param : dict-like
        Must provide the keys ``'hour_num'``, ``'transform'``, ``'drop_time'``,
        ``'drop_else'`` and ``'max_depth'`` (for example one row of the frame
        produced by ``tree_grid_search2``, converted to a dict).

    Returns
    -------
    None
        The line plot is drawn as a side effect; its title shows the test MSE
        (rounded to 8 decimals) and the tree depth.
    """
    # The grid search relabels a missing transform as the string 'None' for
    # display purposes; turn that label (or a pandas missing value) back into
    # a real None before handing it to the data loader.
    transform = best_param['transform']
    if transform == 'None' or pd.isna(transform):
        transform = None

    # Values read back from a DataFrame row may be floats; the estimator
    # expects an integer depth (or None for an unbounded tree).
    max_depth = best_param['max_depth']
    if max_depth is not None:
        max_depth = int(max_depth)

    X_train, X_test, Y_train, Y_test, _ = get_data2(
        hour_num=int(best_param['hour_num']),
        transform=transform,
        # Fixed: this previously read best_param['transform'], so the plotted
        # model ignored the drop_time value selected by the search.
        drop_time=bool(best_param['drop_time']),
        drop_else=bool(best_param['drop_else']),
        scale=True, return_y_scaler=True)

    model = DecisionTreeRegressor(max_depth=max_depth)
    Y_pred = model.fit(X_train, Y_train).predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)

    # Align predictions with the test index so both series share one x-axis.
    comparison = pd.concat(
        [pd.Series(Y_pred, name='Pred', index=Y_test.index), Y_test], axis=1)
    comparison.plot(title=f"mse:{round(mse, 8)}   depth:{max_depth}",
                    figsize=(12, 5))

In [4]:
def tree_heatmap1(mse_df):
    """Draw three stacked heatmaps of mean MSE, one per searched parameter.

    Each panel groups ``mse_df`` by ``'transform'`` and one further column
    (``'hour_num'``, ``'max_depth'``, ``'drop_time'``), averages ``'mse'`` and
    shows the resulting transform-by-parameter table with annotated cells.
    The colour scale of every panel is capped at 0.008.
    """
    # (second grouping column, annotation number format) for each panel, top to bottom
    panel_specs = (
        ('hour_num', '.5f'),
        ('max_depth', '.3f'),
        ('drop_time', '.5f'),
    )
    f, ax = plt.subplots(figsize=(15, 18), nrows=3)
    for panel_axis, (column, number_format) in zip(ax, panel_specs):
        mean_table = mse_df.groupby(['transform', column])['mse'].mean().unstack()
        sns.heatmap(mean_table, ax=panel_axis, vmax=0.008,
                    annot=True, fmt=number_format)

In [5]:
def tree_heatmap2(mse_df):
    """Draw four stacked heatmaps of mean MSE, one per searched parameter.

    Each panel groups ``mse_df`` by ``'transform'`` and one further column
    (``'hour_num'``, ``'max_depth'``, ``'drop_time'``, ``'drop_else'``),
    averages ``'mse'`` and shows the resulting transform-by-parameter table
    with annotated cells. The colour scale of every panel is capped at 0.008.
    """
    # (second grouping column, annotation number format) for each panel, top to bottom
    panel_specs = (
        ('hour_num', '.5f'),
        ('max_depth', '.3f'),
        ('drop_time', '.5f'),
        ('drop_else', '.5f'),
    )
    f, ax = plt.subplots(figsize=(15, 24), nrows=4)
    for panel_axis, (column, number_format) in zip(ax, panel_specs):
        mean_table = mse_df.groupby(['transform', column])['mse'].mean().unstack()
        sns.heatmap(mean_table, ax=panel_axis, vmax=0.008,
                    annot=True, fmt=number_format)

In [6]:
def tree_grid_search1(param_grid, plot=True, heatmap=True):
    """Exhaustively search data options and tree depth, scoring by test MSE.

    Parameters
    ----------
    param_grid : dict
        Maps ``'transform'``, ``'hour_num'``, ``'drop_time'`` and
        ``'max_depth'`` to iterables of candidate values.
    plot : bool, default True
        If true, call ``tree_model_plot1`` for the best combination.
    heatmap : bool, default True
        If true, call ``tree_heatmap1`` on the collected results.

    Returns
    -------
    (pandas.DataFrame, dict)
        One row per evaluated combination (the grid columns plus ``'mse'``),
        and the lowest-MSE row as a dict. In both, a ``None`` transform is
        reported as the string ``'None'``.

    Raises
    ------
    ValueError
        If the grid yields no combinations at all.
    """
    # Collect plain dicts and build the frame once: growing a DataFrame row by
    # row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    records = []
    for transform in tqdm(param_grid['transform']):
        for hour_num in param_grid['hour_num']:
            for drop_time in param_grid['drop_time']:
                # The data depends only on these three options, so fetch it
                # once and reuse it for every depth.
                X_train, X_test, Y_train, Y_test = get_data(
                    hour_num=hour_num, transform=transform,
                    drop_time=drop_time, scale=True, verbose=False)
                for max_depth in param_grid['max_depth']:
                    model = DecisionTreeRegressor(max_depth=max_depth)
                    Y_pred = model.fit(X_train, Y_train).predict(X_test)
                    records.append({'transform': transform,
                                    'hour_num': hour_num,
                                    'drop_time': drop_time,
                                    'max_depth': max_depth,
                                    'mse': mean_squared_error(Y_test, Y_pred)})
    if not records:
        raise ValueError('param_grid produced no parameter combinations')

    mse_df = pd.DataFrame(records)
    # The frame has a default 0..n-1 index, so this label is also the position
    # of the winning entry in `records`.
    best_idx = mse_df['mse'].idxmin()

    if plot:
        # Use the raw record so the plot receives the real None transform
        # rather than its display label.
        tree_model_plot1(dict(records[best_idx]))

    # Relabel the missing transform BEFORE drawing the heatmaps: groupby drops
    # missing keys by default, which would hide the "no transform" rows.
    mse_df['transform'] = mse_df['transform'].where(
        mse_df['transform'].notna(), 'None')
    if heatmap:
        tree_heatmap1(mse_df)

    best_param = dict(mse_df.loc[best_idx])
    print('best_param:\n', best_param,
          '\n\nbest_mse:', mse_df['mse'].min())
    return mse_df, best_param

In [7]:
def tree_grid_search2(param_grid, plot=True, heatmap=True):
    """Exhaustively search ``get_data2`` options and tree depth, scoring by test MSE.

    Parameters
    ----------
    param_grid : dict
        Maps ``'transform'``, ``'hour_num'``, ``'drop_time'``, ``'drop_else'``
        and ``'max_depth'`` to iterables of candidate values.
    plot : bool, default True
        If true, call ``tree_model_plot2`` for the best combination.
    heatmap : bool, default True
        If true, call ``tree_heatmap2`` on the collected results.

    Returns
    -------
    (pandas.DataFrame, dict)
        One row per evaluated combination (the grid columns plus ``'mse'``),
        and the lowest-MSE row as a dict. In both, a ``None`` transform is
        reported as the string ``'None'``.

    Raises
    ------
    ValueError
        If the grid yields no combinations at all.
    """
    # Collect plain dicts and build the frame once: growing a DataFrame row by
    # row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    records = []
    for transform in tqdm(param_grid['transform']):
        for hour_num in param_grid['hour_num']:
            for drop_time in param_grid['drop_time']:
                for drop_else in param_grid['drop_else']:
                    # Fixed: the search used to call get_data and never passed
                    # drop_else, so both drop_else values were scored on the
                    # same data. Load through get_data2, the loader that
                    # tree_model_plot2 uses for the winning configuration.
                    # NOTE(review): get_data2's signature is not visible here;
                    # only keywords already used by tree_model_plot2 are
                    # passed. Add verbose=False if get_data2 accepts it.
                    X_train, X_test, Y_train, Y_test = get_data2(
                        hour_num=hour_num, transform=transform,
                        drop_time=drop_time, drop_else=drop_else,
                        scale=True)
                    for max_depth in param_grid['max_depth']:
                        model = DecisionTreeRegressor(max_depth=max_depth)
                        Y_pred = model.fit(X_train, Y_train).predict(X_test)
                        records.append({'transform': transform,
                                        'hour_num': hour_num,
                                        'drop_time': drop_time,
                                        'drop_else': drop_else,
                                        'max_depth': max_depth,
                                        'mse': mean_squared_error(Y_test, Y_pred)})
    if not records:
        raise ValueError('param_grid produced no parameter combinations')

    mse_df = pd.DataFrame(records)
    # The frame has a default 0..n-1 index, so this label is also the position
    # of the winning entry in `records`.
    best_idx = mse_df['mse'].idxmin()

    if plot:
        # Use the raw record so the plot receives the real None transform
        # rather than its display label.
        tree_model_plot2(dict(records[best_idx]))

    # Relabel the missing transform BEFORE drawing the heatmaps: groupby drops
    # missing keys by default, which would hide the "no transform" rows.
    mse_df['transform'] = mse_df['transform'].where(
        mse_df['transform'].notna(), 'None')
    if heatmap:
        tree_heatmap2(mse_df)

    best_param = dict(mse_df.loc[best_idx])
    print('best_param:\n', best_param,
          '\n\nbest_mse:', mse_df['mse'].min())
    return mse_df, best_param

## 西班牙数据

In [None]:
# Search space for this section's dataset: feature transform, hour_num 0-11,
# whether to drop the time columns, and tree depths 1-19.
param_grid = dict(
    transform=[None, 'sin', 'cos', 'sin+cos', 'ws*sin(wd)',
               'ws*cos(wd)', 'ws*sin(wd)+ws*cos(wd)'],
    hour_num=np.arange(12),
    drop_time=[True, False],
    max_depth=np.arange(1, 20),
)
mse_df, best_param = tree_grid_search1(param_grid)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

In [None]:
# Re-draw prediction vs. test target for the best configuration returned by the
# grid search cell above (`best_param` is the dict that search returned).
tree_model_plot1(best_param)

In [None]:
# Mean-MSE heatmaps for the search results above. This cell used to repeat the
# body of tree_heatmap1 line for line; call the helper so the panel layout is
# defined in one place.
tree_heatmap1(mse_df)

___

## 美国数据

In [None]:
# Search space for this section's dataset: same axes as the previous search
# plus the drop_else switch.
param_grid = dict(
    transform=[None, 'sin', 'cos', 'sin+cos', 'ws*sin(wd)',
               'ws*cos(wd)', 'ws*sin(wd)+ws*cos(wd)'],
    hour_num=np.arange(12),
    drop_time=[True, False],
    drop_else=[True, False],
    max_depth=np.arange(1, 20),
)
mse_df2, best_param2 = tree_grid_search2(param_grid)

In [None]:
# Mean-MSE heatmaps for the second search. This cell used to repeat the body of
# tree_heatmap2 line for line; call the helper so the panel layout is defined
# in one place.
tree_heatmap2(mse_df2)