In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
import numpy as np
import time
import pickle

# Instructions:
## Change the inputs to your desired values and run the entire notebook. The trained model will be stored in the pickled_models folder.
# Change inputs here:
## frac is the fraction of data to use during training (1 = 100%, 0.5 = 50%, etc.)

In [None]:
frac = 1

# Do not change any code below this cell!

In [2]:
df = pd.read_hdf('Global_Weather_Data/big_data.h5')
df.head()

Unnamed: 0,mean_temperature_7,mean_temperature_4,mean_temperature_1,day_of_year,altitude,latitude,temperature_0_12,temperature_0_13,temperature_0_14,temperature_0_15,...,wind_direction_0_2,wind_direction_0_3,wind_direction_0_4,wind_direction_0_5,wind_direction_0_6,wind_direction_0_7,wind_direction_0_8,wind_direction_0_9,wind_direction_0_10,wind_direction_0_11
32,298.917917,299.718333,302.856458,275,1148,29.55805,309.1,310.58,310.495769,310.411538,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
37,285.735,288.664583,288.033958,276,50,45.523449,282.272771,282.281385,282.29,282.51,...,310.0,320.0,290.0,310.0,320.0,310.0,10.0,350.0,320.0,320.0
38,289.655833,288.943333,292.629167,276,52,37.774929,289.158749,289.144375,289.13,290.73,...,270.0,290.0,280.0,250.0,270.0,0.0,210.0,0.0,0.0,0.0
39,287.65,286.956667,284.51125,276,174,47.606209,281.634768,281.627384,281.62,282.71,...,0.0,40.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,292.36,293.704167,294.345833,276,305,34.052231,291.750682,291.745341,291.74,293.97,...,280.0,0.0,0.0,0.0,0.0,0.0,118.0,0.0,0.0,125.0


In [3]:
exit = False
n_pred = []
i = 0
while not exit:
    try:
        n = int(df.columns[i][-2:])
        n_idx = -3
    except:
        n = int(df.columns[i][-1])
        n_idx  =-2
        
    if i == 0:
        target = df.columns[0][:n_idx].replace('_',' ')
    n_pred.append(n)
    i = i + 1
    if target != df.columns[i][:len(target)].replace('_',' '):
        exit = True

In [4]:
if target == 'event':
    df, junk = train_test_split(df,train_size=frac,random_state=0,stratify=df.iloc[:,0])
    df_train, df_test = train_test_split(df,test_size=0.1,random_state=1,stratify=df.iloc[:,0])
else:
    df = df.sample(frac=frac)
    df_train, df_test = train_test_split(df,test_size=0.1,random_state=1)

In [6]:
if target == 'event':
    train_pred = df_train['event']
    df_train = df_train.drop(['event'],axis=1)
    test_pred = df_test['event']
    df_test = df_test.drop(['event'],axis=1)
X_train = df_train.iloc[:,len(n_pred):].reset_index(drop=True)
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train = (X_train-X_train.mean())/X_train.std()

y_train = df_train.iloc[:,:len(n_pred)].reset_index(drop=True)

X_test = df_test.iloc[:,len(n_pred):].reset_index(drop=True)
X_test_mean = X_test.mean()
X_test_std = X_test.std()
X_test = (X_test-X_test.mean())/X_test.std()

y_test = df_test.iloc[:,:len(n_pred)].reset_index(drop=True)

In [7]:
grid = {}
lr = LinearRegression()

In [8]:
for i in range(len(n_pred)):
    
    lrCV = GridSearchCV(lr,param_grid=grid,return_train_score=True,n_jobs=-1,verbose=1)
    
    start = time.time()
    idx = i
    lrCV.fit(X_train,y_train.iloc[:,idx])
    
    filename = './pickled_models/'+target+'_'+str(n_pred[i])+'_lrCV.pkl'
    with open(filename,'wb') as file:
        pickle.dump(lrCV,file)
    
    score = lrCV.best_estimator_.score(X_test,y_test.iloc[:,idx])
    print('Forecasting',target,n_pred[i],'days ahead with 24 hours of history:')
    if target == 'event':
        print('\tTraining Acc.\t =',lrCV.best_score_)
        print('\n\tValidation Acc.\t=',score)
        baseline_score = accuracy_score(y_test.iloc[:,idx],test_pred)
    else:
        print('\tTraining R^2\t =',lrCV.best_score_)
        print('\n\tValidation R^2\t=',score)
        baseline_score = r2_score(y_test.iloc[:,idx],
                                  (X_test_std[3:3+24]*X_test.iloc[:,3:3+24]+X_test_mean[3:3+24]).mean(axis=1))
    print('\tBaseline\t=',baseline_score)
    if score > baseline_score:
        print('\tGood!')
    else:
        print('\tBad!')
    end = time.time()
    print('\nTotal minutes =',(end-start)/60)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean temperature 7 days ahead with 24 hours of history:
	Training R^2	 = 0.7484732452779739

	Validation R^2	= 0.7445608621306194
	Baseline	= 0.70072031491367
	Good!

Total minutes = 0.03696271578470866
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean temperature 4 days ahead with 24 hours of history:
	Training R^2	 = 0.7659931172092562

	Validation R^2	= 0.7721954584206873
	Baseline	= 0.7330725501841324
	Good!

Total minutes = 0.017307809988657632
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Forecasting mean temperature 1 days ahead with 24 hours of history:
	Training R^2	 = 0.8423682380852127

	Validation R^2	= 0.8490694870096535
	Baseline	= 0.8205323579311943
	Good!

Total minutes = 0.017730840047200522
