In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import time
import pickle

# Instructions:
## Change the inputs to your desired values and run the entire notebook. The trained model will be stored in the pickled_models folder.
# Change inputs here:
## grid_list is a list of hyperparameter dictionaries you want to use in the model. You should make one dictionary for each element of n_pred in Preprocessing_Day_Selection_Events. grid_list corresponds to n_pred in reverse order (e.g. if n_pred = [1,4,7], then grid_list should be [{hyperparams for day 7}, {hyperparams for day 4}, {hyperparams for day 1}]). You should use this even if you aren't doing an actual gridsearch, by just using single element lists in the dictionaries (as shown below).

In [None]:
grid_list = [{'n_neighbors':[35],'weights':['distance'],'p':[1]},
             {'n_neighbors':[35],'weights':['distance'],'p':[1]},
             {'n_neighbors':[25],'weights':['distance'],'p':[1]}]

## frac is the fraction of data to use during training (1 = 100%, 0.5 = 50%, etc.)

In [None]:
frac = 1

# Do not change any code below this cell!

In [2]:
df = pd.read_hdf('Global_Weather_Data/big_data.h5')
df.head()

Unnamed: 0,mean_humidity_7,mean_humidity_4,mean_humidity_1,day_of_year,altitude,latitude,humidity_0_12,humidity_0_13,humidity_0_14,humidity_0_15,...,wind_direction_0_2,wind_direction_0_3,wind_direction_0_4,wind_direction_0_5,wind_direction_0_6,wind_direction_0_7,wind_direction_0_8,wind_direction_0_9,wind_direction_0_10,wind_direction_0_11
32,45.916667,43.583333,50.958333,275,1148,29.55805,25.0,22.0,22.0,22.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
38,69.833333,67.375,67.875,276,52,37.774929,65.0,64.0,63.0,51.0,...,270.0,290.0,280.0,250.0,270.0,0.0,210.0,0.0,0.0,0.0
40,63.583333,71.583333,74.291667,276,305,34.052231,88.0,88.0,88.0,88.0,...,280.0,0.0,0.0,0.0,0.0,0.0,118.0,0.0,0.0,125.0
41,62.208333,73.125,78.541667,276,62,32.715328,73.0,73.0,73.0,73.0,...,290.0,290.0,310.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,73.541667,74.875,63.625,276,650,29.42412,88.0,88.0,88.0,60.0,...,270.0,0.0,0.0,0.0,0.0,0.0,0.0,330.0,320.0,320.0


In [3]:
exit = False
n_pred = []
i = 0
while not exit:
    try:
        n = int(df.columns[i][-2:])
        n_idx = -3
    except:
        n = int(df.columns[i][-1])
        n_idx  =-2
        
    if i == 0:
        target = df.columns[0][:n_idx].replace('_',' ')
    n_pred.append(n)
    i = i + 1
    if target != df.columns[i][:len(target)].replace('_',' '):
        exit = True

In [4]:
if target == 'event':
    df, junk = train_test_split(df,train_size=frac,random_state=0,stratify=df.iloc[:,0])
    df_train, df_test = train_test_split(df,test_size=0.1,random_state=1,stratify=df.iloc[:,0])
else:
    df = df.sample(frac=frac)
    df_train, df_test = train_test_split(df,test_size=0.1,random_state=1)

In [6]:
if target == 'event':
    train_pred = df_train['event']
    df_train = df_train.drop(['event'],axis=1)
    test_pred = df_test['event']
    df_test = df_test.drop(['event'],axis=1)
X_train = df_train.iloc[:,len(n_pred):].reset_index(drop=True)
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train = (X_train-X_train.mean())/X_train.std()

y_train = df_train.iloc[:,:len(n_pred)].reset_index(drop=True)

X_test = df_test.iloc[:,len(n_pred):].reset_index(drop=True)
X_test_mean = X_test.mean()
X_test_std = X_test.std()
X_test = (X_test-X_test.mean())/X_test.std()

y_test = df_test.iloc[:,:len(n_pred)].reset_index(drop=True)

In [7]:
knn = KNeighborsRegressor()

In [8]:
for i in range(len(n_pred)):
    
    grid = grid_list[i]
    knnCV = GridSearchCV(knn,param_grid=grid,return_train_score=True,n_jobs=-1,verbose=1)
    
    start = time.time()
    idx = i
    knnCV.fit(X_train,y_train.iloc[:,idx])
    
    filename = './pickled_models/'+target+'_'+str(n_pred[i])+'_knnCV.pkl'
    with open(filename,'wb') as file:
        pickle.dump(knnCV,file)
    
    score = knnCV.best_estimator_.score(X_test,y_test.iloc[:,idx])
    print('Forecasting',target,n_pred[i],'days ahead with 24 hours of history:')
    print('\t# Neighbors:\t', knnCV.best_params_['n_neighbors'])
    print('\tWeights:\t\t', knnCV.best_params_['weights'])
    print('\tp:\t\t\t', knnCV.best_params_['p'])
    if target == 'event':
        print('\tTraining Acc.\t =',knnCV.best_score_)
        print('\n\tValidation Acc.\t=',score)
        baseline_score = accuracy_score(y_test.iloc[:,idx],test_pred)
    else:
        print('\tTraining R^2\t =',knnCV.best_score_)
        print('\tValidation R^2\t=',score)
        baseline_score = r2_score(y_test.iloc[:,idx],
                                  (X_test_std[3:3+24]*X_test.iloc[:,3:3+24]+X_test_mean[3:3+24]).mean(axis=1))
    print('\tBaseline\t=',baseline_score)
    if score > baseline_score:
        print('\tGood!')
    else:
        print('\tBad!')
    end = time.time()
    print('\nTotal minutes =',(end-start)/60)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean humidity 7 days ahead with 24 hours of history:
	# Neighbors:	 35
	Weights:		 distance
	p:			 1
	Training R^2	 = 0.44524780398546193
	Validation R^2	= 0.46377541933506516
	Baseline	= 0.14361550734048678
	Good!

Total minutes = 4.190998995304108
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean humidity 4 days ahead with 24 hours of history:
	# Neighbors:	 35
	Weights:		 distance
	p:			 1
	Training R^2	 = 0.45786913540911867
	Validation R^2	= 0.470915493002531
	Baseline	= 0.1747261723898279
	Good!

Total minutes = 4.1897465268770855
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean humidity 1 days ahead with 24 hours of history:
	# Neighbors:	 25
	Weights:		 distance
	p:			 1
	Training R^2	 = 0.5599696261912783
	Validation R^2	= 0.5729164769703738
	Baseline	= 0.3693088662120948
	Good!

Total minutes = 4.143934361139933
