In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
import numpy as np
import time

In [2]:
df = pd.read_hdf('Global_Weather_Data/big_data.h5')
df.head()

Unnamed: 0,mean_temperature_14,mean_temperature_13,mean_temperature_12,mean_temperature_11,mean_temperature_10,mean_temperature_9,mean_temperature_8,mean_temperature_7,mean_temperature_6,mean_temperature_5,...,wind_speed_0_2,wind_speed_0_3,wind_speed_0_4,wind_speed_0_5,wind_speed_0_6,wind_speed_0_7,wind_speed_0_8,wind_speed_0_9,wind_speed_0_10,wind_speed_0_11
2,301.82,301.871458,301.850625,300.702292,299.662292,299.835625,298.917917,299.906042,301.140417,299.718333,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
6,299.885833,299.978333,298.984167,299.5625,300.92125,300.1375,299.82875,299.03,298.639167,297.822917,...,0.0,0.0,0.0,0.0,1.0,3.0,2.0,0.0,0.0,1.0
7,299.733542,298.048542,298.555208,297.823958,298.996667,298.486042,297.319375,298.31125,298.098125,298.106667,...,2.0,2.0,2.0,2.0,1.0,1.0,3.0,2.0,2.0,6.0
8,302.018958,301.82,301.871458,301.850625,300.702292,299.662292,299.835625,298.917917,299.906042,301.140417,...,5.0,3.0,3.0,7.0,7.0,7.0,6.0,5.0,8.0,6.0
9,300.0325,298.79,299.42,298.7925,298.235417,298.437083,297.966667,298.19,297.6075,298.1175,...,6.0,6.0,6.0,6.0,3.0,3.0,3.0,3.0,3.0,3.0


In [3]:
try:
    n_pred = int(df.columns[0][-2:])
except:
    n_pred = int(df.columns[0][-1])

In [4]:
df = (df-df.mean())/df.std()
df_train, df_test = train_test_split(df,test_size=0.1,random_state=1)
X_train = df_train.iloc[:,n_pred:].reset_index(drop=True)
# X_train_mean = X_train.mean()
# X_train_std = X_train.std()
# X_train = (X_train-X_train.mean())/X_train.std()

y_train = df_train.iloc[:,:n_pred].reset_index(drop=True)

X_test = df_test.iloc[:,n_pred:].reset_index(drop=True)
# X_test_mean = X_test.mean()
# X_test_std = X_test.std()
# X_test = (X_test-X_test.mean())/X_test.std()

y_test = df_test.iloc[:,:n_pred].reset_index(drop=True)

In [5]:
# grid = {'n_estimators':np.arange(10,301,10), 'max_depth':np.arange(1,30,1),'learning_rate':[1e-3,1e-2,1e-1]}
# grid = {'n_estimators':np.arange(10,201,10), 'max_depth':np.arange(1,20,1),'learning_rate':[1e-1,1e-2,1e-3]}
grid = {'n_estimators':[100], 'max_depth':np.arange(1,10,1),'learning_rate':[1e-1]}
gbt = GradientBoostingRegressor()
gbtCV = GridSearchCV(gbt,param_grid=grid,return_train_score=True,n_jobs=-1)

In [6]:
start = time.time()
for i in range(n_pred,0,-1):
    idx = n_pred-i
    gbtCV.fit(X_train,y_train.iloc[:,idx])
    score = gbtCV.best_estimator_.score(X_test,y_test.iloc[:,idx])
    print('Forecasting mean temperature',i,'days ahead with 24 hours of history:')
    print('\tBest depth:\t\t', gbtCV.best_params_['max_depth'], '\n\tBest # estimators:\t', gbtCV.best_params_['n_estimators'],
          '\n\tBest learning rate:\t', gbtCV.best_params_['learning_rate'], '\n\tTraining R^2\t =',gbtCV.best_score_)
    print('\n\tValidation R^2\t=',score)
    
    baseline_score = r2_score(y_test.iloc[:,idx],X_test.iloc[:,1:25].mean(axis=1))
    print('\tBaseline\t=',baseline_score)
    if score > baseline_score:
        print('\tGood!')
    else:
        print('\tBad!')
end = time.time()
print('\nTotal minutes =',(end-start)/60)

Forecasting mean temperature 14 days ahead with 24 hours of history:
	Best depth:		 8 
	Best # estimators:	 100 
	Best learning rate:	 0.1 
	Training R^2	 = 0.8943592960802599

	Validation R^2	= 0.9041700164866895
	Baseline	= 0.748296815345864
	Good!
Forecasting mean temperature 13 days ahead with 24 hours of history:
	Best depth:		 9 
	Best # estimators:	 100 
	Best learning rate:	 0.1 
	Training R^2	 = 0.8942649838313294

	Validation R^2	= 0.9026994439499756
	Baseline	= 0.7483561829457154
	Good!
Forecasting mean temperature 12 days ahead with 24 hours of history:
	Best depth:		 9 
	Best # estimators:	 100 
	Best learning rate:	 0.1 
	Training R^2	 = 0.8949336076915255

	Validation R^2	= 0.9007046115959263
	Baseline	= 0.7507046881625792
	Good!
Forecasting mean temperature 11 days ahead with 24 hours of history:
	Best depth:		 9 
	Best # estimators:	 100 
	Best learning rate:	 0.1 
	Training R^2	 = 0.8935998419839091

	Validation R^2	= 0.9003283148397857
	Baseline	= 0.7571057965164001


In [10]:
features = pd.DataFrame()
features['names'] = gbtCV.best_estimator_.feature_names_in_
features['importance'] = gbtCV.best_estimator_.feature_importances_
features.sort_values(by='importance',ascending=False)

Unnamed: 0,names,importance
19,temperature_0_6,0.563832
22,temperature_0_9,0.156497
9,temperature_0_20,0.072984
24,temperature_0_11,0.036051
8,temperature_0_19,0.030365
...,...,...
110,wind_speed_0_1,0.000068
108,wind_speed_0_23,0.000065
103,wind_speed_0_18,0.000065
113,wind_speed_0_4,0.000062


In [12]:
fun = features.sort_values(by='importance',ascending=False).reset_index(drop=True)
for i in range(features.shape[0]):
    print(fun.names[i],fun.importance[i])

temperature_0_6 0.563831992423123
temperature_0_9 0.15649700222021468
temperature_0_20 0.07298421552535117
temperature_0_11 0.03605054708151279
temperature_0_19 0.03036498645652807
temperature_0_7 0.024307462474779808
temperature_0_18 0.02131550646574846
temperature_0_21 0.017678836110840136
temperature_0_10 0.0157541232204862
temperature_0_5 0.006835693426220244
temperature_0_2 0.004901580932880021
temperature_0_17 0.004257076009989641
day_of_year 0.004037995059636061
temperature_0_1 0.0032509448535262115
temperature_0_22 0.003050122611286643
temperature_0_8 0.002001595464679648
temperature_0_0 0.0013768079202173666
temperature_0_4 0.001254329656701402
temperature_0_23 0.0012075565857267924
temperature_0_3 0.0010426472764065546
wind_direction_0_15 0.000820897503980744
wind_direction_0_11 0.0007288357078881286
humidity_0_22 0.0007108119894527506
temperature_0_12 0.0006624576836531255
humidity_0_13 0.0006227299759068776
temperature_0_14 0.000600642354054831
temperature_0_15 0.0005651512