In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
import numpy as np
import time
from sklearn.decomposition import PCA

In [2]:
df = pd.read_hdf('Global_Weather_Data/big_data.h5')
df.head()

Unnamed: 0,mean_temperature_7,mean_temperature_6,mean_temperature_5,mean_temperature_4,mean_temperature_3,mean_temperature_2,mean_temperature_1,day_of_year,altitude,latitude,...,wind_speed_0_2,wind_speed_0_3,wind_speed_0_4,wind_speed_0_5,wind_speed_0_6,wind_speed_0_7,wind_speed_0_8,wind_speed_0_9,wind_speed_0_10,wind_speed_0_11
3,299.906042,301.140417,299.718333,299.908125,300.634792,302.856458,302.795433,275,1148,29.55805,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
6,296.5875,296.475,297.02,296.770833,297.6525,297.2475,301.199792,276,2474,31.769039,...,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
7,298.19,297.6075,298.1175,298.035417,298.8675,298.6775,300.139167,276,46,33.005859,...,6.0,6.0,6.0,6.0,3.0,3.0,3.0,3.0,3.0,3.0
8,298.19,297.6075,298.1175,298.035417,298.8675,298.6775,300.197708,276,33,32.815559,...,6.0,6.0,6.0,6.0,3.0,3.0,3.0,3.0,3.0,3.0
9,298.917917,299.906042,301.140417,299.718333,299.908125,300.634792,302.856458,276,1148,29.55805,...,5.0,3.0,3.0,7.0,7.0,7.0,6.0,5.0,8.0,6.0


In [3]:
try:
    n_pred = int(df.columns[0][-2:])
except:
    n_pred = int(df.columns[0][-1])

In [4]:
df_mean = df.mean()
df_std = df.std()
df = (df-df.mean())/df.std()
df_train, df_test = train_test_split(df,test_size=0.1,random_state=1)
X_train = df_train.iloc[:,n_pred:].reset_index(drop=True)
# X_train_mean = X_train.mean()
# X_train_std = X_train.std()
# X_train = (X_train-X_train.mean())/X_train.std()

y_train = df_train.iloc[:,:n_pred].reset_index(drop=True)

X_test = df_test.iloc[:,n_pred:].reset_index(drop=True)
# X_test_mean = X_test.mean()
# X_test_std = X_test.std()
# X_test = (X_test-X_test.mean())/X_test.std()

y_test = df_test.iloc[:,:n_pred].reset_index(drop=True)

In [5]:
n_comp = 81

pca = PCA(n_components=n_comp)
pca.fit(X_train)
X_train_orig = X_train.copy()
X_test_orig = X_test.copy()
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [6]:
# grid = {'n_estimators':np.arange(10,301,10), 'max_depth':np.arange(1,30,1),'learning_rate':[1e-3,1e-2,1e-1]}
# grid = {'n_estimators':np.arange(10,201,10), 'max_depth':np.arange(1,20,1),'learning_rate':[1e-1,1e-2,1e-3]}
# grid = {'n_estimators':np.arange(500,1501,100), 'max_depth':np.arange(1,10,2),'learning_rate':np.arange(0.05,0.16,0.02)}
# grid = {'n_estimators':[500], 'max_depth':np.arange(6,9,1),'learning_rate':[0.1]}
# grid = {'n_estimators':np.arange(400,1001,100), 'max_depth':np.arange(5,11,1),'learning_rate':[0.1]}
grid = {'n_estimators':[500], 'max_depth':[7],'learning_rate':[0.1]}
gbt = GradientBoostingRegressor()
gbtCV = GridSearchCV(gbt,param_grid=grid,return_train_score=True,n_jobs=-1,verbose=1)

In [7]:
for i in range(n_pred,0,-1):
    start = time.time()
    idx = n_pred-i
    gbtCV.fit(X_train,y_train.iloc[:,idx])
    score = gbtCV.best_estimator_.score(X_test,y_test.iloc[:,idx])
    print('Forecasting mean temperature',i,'days ahead with 24 hours of history:')
    print('\tBest depth:\t\t', gbtCV.best_params_['max_depth'], '\n\tBest # estimators:\t', gbtCV.best_params_['n_estimators'],
          '\n\tBest learning rate:\t', gbtCV.best_params_['learning_rate'], '\n\tTraining R^2\t =',gbtCV.best_score_)
    print('\n\tValidation R^2\t=',score)
    
    baseline_score = r2_score(y_test.iloc[:,idx],X_test_orig.iloc[:,1:25].mean(axis=1))
    print('\tBaseline\t=',baseline_score)
    if score > baseline_score:
        print('\tGood!')
    else:
        print('\tBad!')
    end = time.time()
    print('\nTotal minutes =',(end-start)/60)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean temperature 7 days ahead with 24 hours of history:
	Best depth:		 7 
	Best # estimators:	 500 
	Best learning rate:	 0.1 
	Training R^2	 = 0.8359915762684796

	Validation R^2	= 0.8392689788871186
	Baseline	= 0.7737052927346195
	Good!

Total minutes = 6.923977967103323
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean temperature 6 days ahead with 24 hours of history:
	Best depth:		 7 
	Best # estimators:	 500 
	Best learning rate:	 0.1 
	Training R^2	 = 0.8420456870831512

	Validation R^2	= 0.8360426392911923
	Baseline	= 0.7859239331265391
	Good!

Total minutes = 6.9259308218956
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean temperature 5 days ahead with 24 hours of history:
	Best depth:		 7 
	Best # estimators:	 500 
	Best learning rate:	 0.1 
	Training R^2	 = 0.851759927262183

	Validation R^2	= 0.8513596980313405
	Baseline	= 0.7897306522208862
	Good!

To

KeyboardInterrupt: 

In [None]:
features = pd.DataFrame()
features['names'] = gbtCV.best_estimator_.feature_names_in_
features['importance'] = gbtCV.best_estimator_.feature_importances_
features.sort_values(by='importance',ascending=False)

In [None]:
gbtCV.best_estimator_.predict(X_test.iloc[0:1,:])

In [None]:
# df.mean()

In [None]:
# df.std()

In [None]:
# y_test.iloc[0:1,:]

In [None]:
fun = features.sort_values(by='importance',ascending=False).reset_index(drop=True)
for i in range(features.shape[0]):
    print(fun.names[i],fun.importance[i])