In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
import numpy as np
import time
import pickle

# Instructions:
## Change the inputs to your desired values and run the entire notebook. The trained model will be stored in the pickled_models folder.
# Change inputs here:
## grid_list is a list of hyperparameter dictionaries you want to use in the model. You should make one dictionary for each element of n_pred in Preprocessing_Day_Selection_Events. grid_list corresponds to n_pred in reverse order (e.g. if n_pred = [1,4,7], then grid_list should be [{hyperparams for day 7}, {hyperparams for day 4}, {hyperparams for day 1}]). You should use this even if you aren't doing an actual gridsearch, by just using single element lists in the dictionaries (as shown below).

In [None]:
grid_list = [{'n_estimators':[500], 'max_depth':[5],'learning_rate':[0.01]},
            {'n_estimators':[600], 'max_depth':[7],'learning_rate':[0.01]},
            {'n_estimators':[600], 'max_depth':[6],'learning_rate':[0.01]}]

## frac is the fraction of data to use during training (1 = 100%, 0.5 = 50%, etc.)

In [1]:
frac = 0.1

# Do not change any code below this cell!

In [2]:
df = pd.read_hdf('Global_Weather_Data/big_data.h5')
df.head()

Unnamed: 0,event_1,day_of_year,altitude,latitude,event,temperature_0_12,temperature_0_13,temperature_0_14,temperature_0_15,temperature_0_16,...,wind_direction_0_2,wind_direction_0_3,wind_direction_0_4,wind_direction_0_5,wind_direction_0_6,wind_direction_0_7,wind_direction_0_8,wind_direction_0_9,wind_direction_0_10,wind_direction_0_11
32,clear,275,1148,29.55805,cloudy,309.1,310.58,310.495769,310.411538,310.327308,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
37,clear,276,50,45.523449,cloudy,282.272771,282.281385,282.29,282.51,284.44,...,310.0,320.0,290.0,310.0,320.0,310.0,10.0,350.0,320.0,320.0
38,cloudy,276,52,37.774929,clear,289.158749,289.144375,289.13,290.73,293.02,...,270.0,290.0,280.0,250.0,270.0,0.0,210.0,0.0,0.0,0.0
39,clear,276,174,47.606209,clear,281.634768,281.627384,281.62,282.71,285.05,...,0.0,40.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,fog,276,305,34.052231,clear,291.750682,291.745341,291.74,293.97,295.59,...,280.0,0.0,0.0,0.0,0.0,0.0,118.0,0.0,0.0,125.0


In [3]:
exit = False
n_pred = []
i = 0
while not exit:
    try:
        n = int(df.columns[i][-2:])
        n_idx = -3
    except:
        n = int(df.columns[i][-1])
        n_idx  =-2
        
    if i == 0:
        target = df.columns[0][:n_idx].replace('_',' ')
    n_pred.append(n)
    i = i + 1
    if target != df.columns[i][:len(target)].replace('_',' '):
        exit = True

In [4]:
if target == 'event':
    df, junk = train_test_split(df,train_size=frac,random_state=0,stratify=df.iloc[:,0])
    df_train, df_test = train_test_split(df,test_size=0.1,random_state=1,stratify=df.iloc[:,0])
else:
    df = df.sample(frac=frac)
    df_train, df_test = train_test_split(df,test_size=0.1,random_state=1)

In [6]:
if target == 'event':
    train_pred = df_train['event']
    df_train = df_train.drop(['event'],axis=1)
    test_pred = df_test['event']
    df_test = df_test.drop(['event'],axis=1)
X_train = df_train.iloc[:,len(n_pred):].reset_index(drop=True)
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train = (X_train-X_train.mean())/X_train.std()

y_train = df_train.iloc[:,:len(n_pred)].reset_index(drop=True)

X_test = df_test.iloc[:,len(n_pred):].reset_index(drop=True)
X_test_mean = X_test.mean()
X_test_std = X_test.std()
X_test = (X_test-X_test.mean())/X_test.std()

y_test = df_test.iloc[:,:len(n_pred)].reset_index(drop=True)

In [7]:
if target == 'event':
    gbt = GradientBoostingClassifier()
else:
    gbt = GradientBoostingRegressor()

In [8]:
for i in range(len(n_pred)):
    
    grid = grid_list[i]
    gbtCV = GridSearchCV(gbt,param_grid=grid,return_train_score=True,n_jobs=-1,verbose=1)
    
    start = time.time()
    idx = i
    gbtCV.fit(X_train,y_train.iloc[:,idx])
    
    filename = './pickled_models/'+target+'_'+str(n_pred[i])+'_gbtCV.pkl'
    with open(filename,'wb') as file:
        pickle.dump(gbtCV,file)
    
    score = gbtCV.best_estimator_.score(X_test,y_test.iloc[:,idx])
    print('Forecasting',target,n_pred[i],'days ahead with 24 hours of history:')
    print('\tBest depth:\t\t', gbtCV.best_params_['max_depth'], '\n\tBest # estimators:\t', gbtCV.best_params_['n_estimators'],
          '\n\tBest learning rate:\t', gbtCV.best_params_['learning_rate'])
    if target == 'event':
        print('\tTraining Acc.\t =',gbtCV.best_score_)
        print('\n\tValidation Acc.\t=',score)
        baseline_score = accuracy_score(y_test.iloc[:,idx],test_pred)
    else:
        print('\tTraining R^2\t =',gbtCV.best_score_)
        print('\n\tValidation R^2\t=',score)
        baseline_score = r2_score(y_test.iloc[:,idx],
                                  (X_test_std[3:3+24]*X_test.iloc[:,3:3+24]+X_test_mean[3:3+24]).mean(axis=1))
    print('\tBaseline\t=',baseline_score)
    if score > baseline_score:
        print('\tGood!')
    else:
        print('\tBad!')
    end = time.time()
    print('\nTotal minutes =',(end-start)/60)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting event 1 days ahead with 24 hours of history:
	Best depth:		 5 
	Best # estimators:	 500 
	Best learning rate:	 0.01
	Training Acc.	 = 0.525892191553844

	Validation Acc.	= 0.538961038961039
	Baseline	= 0.5308441558441559
	Good!

Total minutes = 10.18308403491974


In [9]:
features = pd.DataFrame()
features['names'] = gbtCV.best_estimator_.feature_names_in_
features['importance'] = gbtCV.best_estimator_.feature_importances_
features.sort_values(by='importance',ascending=False)

Unnamed: 0,names,importance
2,latitude,0.060821
1,altitude,0.055252
98,humidity_0_11,0.039898
0,day_of_year,0.030879
121,wind_direction_0_10,0.023807
...,...,...
66,wind_speed_0_3,0.002121
68,wind_speed_0_5,0.001876
52,wind_speed_0_13,0.001821
42,pressure_0_3,0.001770


In [14]:
fun = features.sort_values(by='importance',ascending=False).reset_index(drop=True)
for i in range(features.shape[0]):
    print(fun.names[i],fun.importance[i])

latitude 0.06082057547743305
altitude 0.055252098279730044
humidity_0_11 0.03989823321889601
day_of_year 0.030879108255036484
wind_direction_0_10 0.02380721326430756
humidity_0_10 0.022364496106936717
humidity_0_4 0.016230158199313477
wind_direction_0_8 0.01576260341145078
wind_direction_0_11 0.01534436618659886
humidity_0_12 0.014357919430320606
pressure_0_12 0.013984924214901282
wind_direction_0_19 0.01170341303286715
wind_direction_0_12 0.011300546438533409
humidity_0_16 0.010861468277957762
wind_direction_0_18 0.01058896383713304
wind_direction_0_9 0.010525710974521318
wind_direction_0_4 0.010489169056153906
wind_direction_0_2 0.010300523609784437
temperature_0_11 0.010276470822580834
temperature_0_18 0.010124639961390697
temperature_0_12 0.009838231616318225
humidity_0_1 0.009812689406267233
temperature_0_21 0.00976124308376766
wind_direction_0_15 0.009721801643133807
wind_direction_0_3 0.009593620684787063
wind_direction_0_20 0.009140823838167458
wind_direction_0_7 0.009019928744