In [1]:
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [58]:
df = pd.read_excel('Wunderground_West_Lafayette_2010_Clean.xlsx')
df.head()
n_hist = 2
n_pred = 7
target = 'temp_F_avg'

In [59]:
def n_days(df,n_hist,n_pred,target):
    n = n_hist+n_pred
    temp = df.iloc[:,2:]
    df_ret = temp.copy().iloc[:-n+1,:]
    df_ret.columns = df_ret.columns+'_1'
    
    for i in range(1,n_hist):
        day = temp.iloc[i:-(n-i)+1,:].reset_index(drop=True)
        day.columns = day.columns+'_'+str(i+1)
        df_ret = pd.concat([df_ret,day],axis=1)
        
    for i in range(n_hist,n-1):
        target_col = pd.DataFrame(temp.loc[i:temp.shape[0]-(n-i),target]).reset_index(drop=True)
        target_col.columns = [target_col.columns[0]+'_target_'+str(i+1-n_hist)]
        df_ret = pd.concat([df_ret,target_col],axis=1)
    target_col = pd.DataFrame(temp.loc[n-1:,target]).reset_index(drop=True)
    target_col.columns = [target_col.columns[0]+'_target_'+str(n_pred)]
    df_ret = pd.concat([df_ret,target_col],axis=1)
    
    return df_ret

In [60]:
df1 = n_days(df,n_hist,n_pred,target)
df1.head()

Unnamed: 0,temp_F_max_1,temp_F_avg_1,temp_F_min_1,dew_point_F_max_1,dew_point_F_avg_1,humidity_percent_max_1,humidity_percent_avg_1,humidity_percent_min_1,wind_speed_mph_max_1,wind_speed_mph_avg_1,...,pressure_Hg_max_2,pressure_Hg_avg_2,pressure_Hg_min_2,temp_F_avg_target_1,temp_F_avg_target_2,temp_F_avg_target_3,temp_F_avg_target_4,temp_F_avg_target_5,temp_F_avg_target_6,temp_F_avg_target_7
0,15,10.2,6,7,3.6,80,74.9,68,14,10.1,...,29.8,29.8,29.7,9.4,11.7,12.0,13.8,19.1,23.7,12.7
1,12,6.9,0,3,-0.2,80,72.9,64,17,8.7,...,29.8,29.8,29.7,11.7,12.0,13.8,19.1,23.7,12.7,9.2
2,17,9.4,-1,7,1.6,80,71.0,59,13,7.3,...,29.7,29.6,29.6,12.0,13.8,19.1,23.7,12.7,9.2,23.3
3,16,11.7,7,9,5.9,85,77.4,73,15,10.5,...,29.6,29.6,29.5,13.8,19.1,23.7,12.7,9.2,23.3,27.4
4,18,12.0,6,11,6.3,85,77.5,73,15,11.0,...,29.6,29.6,29.6,19.1,23.7,12.7,9.2,23.3,27.4,25.4


In [61]:
df_train, df_test = train_test_split(df1,test_size=0.1,random_state=1)
X_train = df_train.iloc[:,:-n_pred].reset_index(drop=True)
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train = (X_train-X_train.mean())/X_train.std()

y_train = df_train.iloc[:,-n_pred:].reset_index(drop=True)

X_test = df_test.iloc[:,:-n_pred].reset_index(drop=True)
X_test_mean = X_test.mean()
X_test_std = X_test.std()
X_test = (X_test-X_test.mean())/X_test.std()

y_test = df_test.iloc[:,-n_pred:].reset_index(drop=True)

In [62]:
MLPR = MLPRegressor(activation='identity',solver='lbfgs',learning_rate='adaptive',verbose=False,tol=1e-6,
                    max_iter=1e3,hidden_layer_sizes=(100,))

In [64]:
for i in range(n_pred):
    MLPR.fit(X_train,y_train.iloc[:,i])
    score = MLPR.score(X_test,y_test.iloc[:,i])
    print('Forecasting',target,i+1,'days ahead with',n_hist,'days of history:')
    print('\tR^2\t\t=',score)
    
    baseline_score = r2_score((y_test.iloc[:,i]-X_test_mean.loc[target+'_'+str(n_hist)])/X_test_std.loc[target+'_'+str(n_hist)],
                          X_test.loc[:,target+'_'+str(n_hist)])
    print('\tBaseline\t=',baseline_score)
    if score > baseline_score:
        print('\tGood!')
    else:
        print('\tBad!')

Forecasting temp_F_avg 1 days ahead with 2 days of history:
	R^2		= 0.8447373791764776
	Baseline	= 0.8418055003993304
	Good!
Forecasting temp_F_avg 2 days ahead with 2 days of history:
	R^2		= 0.7740556608143783
	Baseline	= 0.7323228633260281
	Good!
Forecasting temp_F_avg 3 days ahead with 2 days of history:
	R^2		= 0.7652597416990539
	Baseline	= 0.7258770919585871
	Good!
Forecasting temp_F_avg 4 days ahead with 2 days of history:
	R^2		= 0.824945027348181
	Baseline	= 0.8180116118421895
	Good!
Forecasting temp_F_avg 5 days ahead with 2 days of history:
	R^2		= 0.7490764081915607
	Baseline	= 0.7484594437411815
	Good!
Forecasting temp_F_avg 6 days ahead with 2 days of history:
	R^2		= 0.6953087033815893
	Baseline	= 0.731952297453305
	Bad!
Forecasting temp_F_avg 7 days ahead with 2 days of history:
	R^2		= 0.7305020152993636
	Baseline	= 0.7395619642134557
	Bad!


In [53]:
y_test.iloc[:,0].head()
X_test_mean.loc[target+'_'+str(n_hist)]

59.094285714285704

In [54]:
i=0
baseline_score = r2_score((y_test.iloc[:,i]-X_test_mean.loc[target+'_'+str(n_hist)])/X_test_std.loc[target+'_'+str(n_hist)],
                          X_test.loc[:,target+'_'+str(n_hist)])
baseline_score

0.9017459280379531