In [35]:
import pickle
from datetime import datetime

import numpy as np
import scipy as sp
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, roc_auc_score
from sklearn import cross_validation
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
import mlxtend

In [36]:
from mlxtend.regressor import StackingRegressor
from mlxtend.data import boston_housing_data
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor

In [37]:
def loss_score(predict, real):
    """Return 1 minus the mean absolute relative error of ``predict`` against ``real``.

    Each sample contributes ``|real - predict| / |real|``. A sample whose
    relative error is not finite (NaN from 0/0, +/-inf from a non-zero
    prediction for a zero target, or NaN inputs) contributes 0, so a single
    zero target cannot drive the whole score to -inf.

    Parameters
    ----------
    predict : array-like of numbers
        Predicted values, paired positionally with ``real``.
    real : array-like of numbers
        Ground-truth values.

    Returns
    -------
    float
        ``1 - sum(|relative error|) / n``; NaN when the inputs are empty.
    """
    actual = np.asarray(real, dtype=float)
    predicted = np.asarray(predict, dtype=float)
    # Division by zero is expected for zero targets; the resulting non-finite
    # entries are neutralised right below, so the warnings are just noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_err = (actual - predicted) / actual
    n = len(rel_err)
    if n == 0:
        return float('nan')
    rel_err = np.where(np.isfinite(rel_err), rel_err, 0.0)
    return 1 - np.abs(rel_err).sum() / n

In [38]:
# Destination file for the pickled stacking model (written by the final pickle.dump cell).
model_path = '../dataset/fetures/model.pkl'

In [39]:
# Load the training feature table from CSV; it holds the feature columns plus
# 'predict_date' and the target column 'predict_power_consumption'.
fetures = pd.read_csv('../dataset/fetures/train_feture.csv')

In [40]:
# Feature matrix: every column except the date and the target.
train_x = fetures.drop(
    labels=['predict_date', 'predict_power_consumption'],
    axis='columns')

In [41]:
# Regression target: the power-consumption column of the feature table.
train_y = fetures.loc[:, 'predict_power_consumption']

### 使用 GridSearchCV 调参

In [43]:
# Search grid for a single random-forest regressor.
# `criterion` (e.g. 'mse' / 'mae') is not part of the grid.
parameters = dict(
    n_estimators=[10, 100, 200, 300, 500],
    max_features=[10, 50, 100, 200, 'sqrt'],
)

In [44]:
# model = RandomForestRegressor(n_jobs=-1, oob_score=True)

In [45]:
# cv = cross_validation.ShuffleSplit(train_x.shape[0], n_iter=8, test_size=0.1, random_state=0)
# clf = GridSearchCV(model, parameters, cv=cv)

In [46]:
# clf.fit(train_x.fillna(0), train_y)

In [47]:
# model = RandomForestRegressor(n_jobs=-1, oob_score=True, random_state=666, **clf.best_params_)

In [48]:
# cv = cross_validation.ShuffleSplit(train_x.shape[0], n_iter=5, test_size=0.1, random_state=666)
# r2_score = cross_validation.cross_val_score(model, train_x.fillna(0), train_y, cv=cv)
# r2_score

In [288]:
# 10 random train/test splits with 20% of the rows held out each time; the seed is fixed.
# NOTE: this is the legacy `sklearn.cross_validation` splitter (takes the sample
# count up front and is iterated directly), matching the legacy cross_val_score call below.
cv = cross_validation.ShuffleSplit(
    len(train_x),
    n_iter=10,
    random_state=666,
    test_size=0.2)

In [290]:
# Base learners for the stacked ensemble: two random forests and two
# extra-trees models, each in a "small" (100 trees, max_features='auto') and a
# "large" (300 trees, max_features=300) variant with its own seed.
# `criterion` is not passed, so every model keeps the estimator default.
# max_features=300 presumably requires at least 300 feature columns - TODO confirm.
forest = RandomForestRegressor(
    n_estimators=100, max_features='auto',
    oob_score=True, random_state=1, n_jobs=-1)
fores = RandomForestRegressor(
    n_estimators=300, max_features=300,
    oob_score=True, random_state=2, n_jobs=-1)
# Extra-trees models enable bootstrap sampling because oob_score=True is requested.
extract = ExtraTreesRegressor(
    n_estimators=100, max_features='auto',
    bootstrap=True, oob_score=True, random_state=3, n_jobs=-1)
extrac = ExtraTreesRegressor(
    n_estimators=300, max_features=300,
    bootstrap=True, oob_score=True, random_state=4, n_jobs=-1)

In [292]:
# Base regressors handed to the stacker, in order: two random forests, then two extra-trees models.
clfs = [forest, fores, extract, extrac]

In [293]:
# Hyper-parameter grid for a GridSearchCV over the stacked ensemble.
# Keys must have the form '<regressor name>__<parameter>', where the parameter is
# the estimator's real keyword (underscores: n_estimators, max_features).
# mlxtend names stacked regressors after their lower-cased class, numbering
# duplicates '-1', '-2' in list order, so for clfs = [forest, fores, extract, extrac]:
#   randomforestregressor-1 -> forest,  randomforestregressor-2 -> fores,
#   extratreesregressor-1   -> extract, extratreesregressor-2   -> extrac.
# NOTE(review): confirm the exact names with sorted(stacker.get_params().keys())
# for the installed mlxtend version before running grid.fit.
params = {
    'randomforestregressor-1__n_estimators': [10, 50, 100],
    'randomforestregressor-1__max_features': [10, 50, 100],
    'randomforestregressor-2__n_estimators': [100, 200, 500],
    'randomforestregressor-2__max_features': [100, 200, 300],
    'extratreesregressor-1__n_estimators': [10, 50, 100],
    'extratreesregressor-1__max_features': [10, 50, 100],
    'extratreesregressor-2__n_estimators': [100, 200, 500],
    'extratreesregressor-2__max_features': [100, 200, 300],
}

In [294]:
# Meta-regressor of the stack: a linear regression (passed as meta_regressor to the stacker below).
meta = LinearRegression(n_jobs=-1)

In [295]:
# Stacked ensemble: the four tree models in `clfs` combined by the `meta` regressor.
# verbose=True prints a "Fitting regressor..." line per base model during fit.
stacker = StackingRegressor(regressors=clfs, meta_regressor=meta, verbose=True)

In [296]:
# Grid search over `params` for the stacked ensemble, using the shuffle splits in `cv`.
# Only constructed here: the grid.fit(...) calls visible in this notebook are commented out.
grid = GridSearchCV(estimator=stacker, param_grid=params, cv=cv)

In [297]:
# Inspect the current hyper-parameters of the `extract` model (display only).
extract.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 3,
 'verbose': 0,
 'warm_start': False}

In [298]:
# grid.fit(train_x.fillna(0), train_y)

In [300]:
# Cross-validate the stacked ensemble on the shuffle splits in `cv`; the result
# holds one score per split (the estimator's default score).
# NaNs are filled with 0 exactly as in the fit / score / predict cells, so the
# cross-validation sees the same inputs as the final model.
# NOTE: this assignment rebinds `r2_score`, hiding the function imported from
# sklearn.metrics; later cells read the score array under this name.
r2_score = cross_validation.cross_val_score(
    stacker, train_x.fillna(0), train_y, cv=cv, verbose=True, n_jobs=-1)

Fitting 4 regressors...
Fitting regressor1: randomforestregressor (1/4)
Fitting 4 regressors...
Fitting regressor1: randomforestregressor (1/4)
Fitting 4 regressors...
Fitting regressor1: randomforestregressor (1/4)
Fitting 4 regressors...
Fitting regressor1: randomforestregressor (1/4)
Fitting regressor2: randomforestregressor (2/4)
Fitting regressor2: randomforestregressor (2/4)
Fitting regressor2: randomforestregressor (2/4)
Fitting regressor2: randomforestregressor (2/4)
Fitting regressor3: extratreesregressor (3/4)
Fitting regressor3: extratreesregressor (3/4)
Fitting regressor3: extratreesregressor (3/4)
Fitting regressor3: extratreesregressor (3/4)
Fitting regressor4: extratreesregressor (4/4)
Fitting regressor4: extratreesregressor (4/4)
Fitting regressor4: extratreesregressor (4/4)
Fitting regressor4: extratreesregressor (4/4)
Fitting 4 regressors...
Fitting regressor1: randomforestregressor (1/4)
Fitting 4 regressors...
Fitting regressor1: randomforestregressor (1/4)
Fitting 

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.1min finished


In [301]:
# grid.fit(train_x.fillna(0), train_y)

In [302]:
# Mean, best and worst cross-validation score across the splits.
r2_score.mean(), r2_score.max(), r2_score.min()

(0.78898896165565757, 0.86008654041103993, 0.68182410751042843)

In [303]:
# Fit the stacked ensemble on the full training set, with NaNs in the features replaced by 0.
stacker.fit(train_x.fillna(0), train_y)

Fitting 4 regressors...
Fitting regressor1: randomforestregressor (1/4)
Fitting regressor2: randomforestregressor (2/4)
Fitting regressor3: extratreesregressor (3/4)
Fitting regressor4: extratreesregressor (4/4)


StackingRegressor(meta_regressor=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False),
         regressors=[RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_s...n_estimators=300, n_jobs=-1,
          oob_score=True, random_state=4, verbose=0, warm_start=False)],
         verbose=True)

In [304]:
# Score of the fitted stacker on the very data it was trained on (in-sample).
# Presumably R^2, the default score of a regressor - confirm for the installed mlxtend.
r2_all_scorr = stacker.score(train_x.fillna(0), train_y)

In [None]:
# importance_df = pd.DataFrame(model.feature_importances_, index=train_x.columns)

# importance_df.sort_values(0, ascending=False, inplace=True)

# importance_top20 = ' , '.join(['{}:{} '.format(x, importance_df.loc[x].values[0]) for x in importance_df.index][:20])
# importance_top20

In [305]:
# In-sample predictions of the fitted stacker, scored with loss_score
# (1 - mean absolute relative error) against the training target.
predict_result = stacker.predict(train_x.fillna(0))
loss_s = loss_score(predict=predict_result, real=train_y)
loss_s

0.9876500688451734

In [306]:
# Summary tuple, in order: in-sample loss score, in-sample stacker score,
# then mean / max / min of the cross-validation scores.
result = (
    loss_s,
    r2_all_scorr,
    r2_score.mean(),
    r2_score.max(),
    r2_score.min(),
)
result

(0.9876500688451734,
 0.98078256144556675,
 0.78898896165565757,
 0.86008654041103993,
 0.68182410751042843)

In [307]:
# Log the metrics tuple to stdout and append the same line to result.txt.
# A single timestamp is captured so both records match, and the log file is
# opened in a `with` block so the handle is flushed and closed right after the write.
logged_at = datetime.now()
print(logged_at, 'result:', result)
with open('result.txt', 'a+') as result_log:
    print(logged_at, 'result:', result, file=result_log)

2017-06-13 07:55:11.228802 result: (0.9876500688451734, 0.98078256144556675, 0.78898896165565757, 0.86008654041103993, 0.68182410751042843)


In [308]:
# Persist the fitted stacking ensemble to model_path; the `with` block guarantees
# the file is flushed and closed once the dump finishes.
with open(model_path, 'wb') as model_file:
    pickle.dump(stacker, model_file)