In [None]:
# library needed for web crawling
!pip install BeautifulSoup4

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import time

# version downgrade needed for easier .npy file IO
!pip install numpy==1.16.1
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

import copy
from datetime import date

In [None]:
# Per-month accumulators filled by the scraper: one list per measured series
# plus the matching date strings. Each name gets its own independent list.
t_max, t_min, t_avg, rain, dates = [], [], [], [], []

# Source of the weather data: the target date is appended to the base address.
webpage_base_addr = 'https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date='
# Complete example address for a single date.
webpage_test = 'https://www.metnet.hu/napi-adatok?sub=4&pid=10602&date=2017-10-25'

In [None]:
# Scraper dedicated to the daily-data pages of metnet.hu.
# Every monthly page keeps its series in JavaScript arrays inside a <script> tag,
# so the script tag is located first and the arrays are then extracted from it.

FIRST_YEAR = 2015  # fetching data of the last 5 years
LAST_YEAR = 2019
# The data was collected in October 2019; requesting later months of that year
# failed because metnet.hu had no values for them yet.
LAST_MONTH_OF_LAST_YEAR = 10


def _fetch_month_series(url):
  """Download one monthly page and return its numeric series.

  Args:
    url: full page address, i.e. the base address followed by a date.

  Returns:
    A list of float lists in page order; according to the original parser the
    first four are max, min, avg temperature and rain.

  Raises:
    ValueError: if fewer than four series could be extracted from the page.
    IndexError: if the page has no eighth <script> tag.
  """
  # the context manager closes the HTTP response even if parsing fails
  with urlopen(url) as page_html:
    soup = BeautifulSoup(page_html, 'html.parser')  # parse the whole html source
  js_part = soup.find_all('script')[7]  # the script tag that holds the data arrays
  js_text = js_part.get_text().replace('"', '')  # dropping quotes eases the conversion
  # every "data: [...]" entry of the JS source; the last two matches are dropped
  # (presumably they are not weather series - TODO confirm against the page)
  raw_series = re.findall(r"(?<=data: ).*?(?=,\n)", js_text)[:-2]
  series = [[float(value) for value in raw[1:-1].split(',')] for raw in raw_series]
  if len(series) < 4:
    raise ValueError('expected at least 4 data series at %s, found %d' % (url, len(series)))
  return series


# Start from empty accumulators so that re-running this cell does not append
# every month a second time (clear() keeps the existing list objects).
for accumulator in (t_max, t_min, t_avg, rain, dates):
  accumulator.clear()

for year in range(FIRST_YEAR, LAST_YEAR + 1):
  last_month = 12 if year < LAST_YEAR else LAST_MONTH_OF_LAST_YEAR
  for month in range(1, last_month + 1):
    # the addresses end with the requested date; the first of the month selects the month
    act_date = str(year) + '-' + str(month) + '-' + '1'
    print(webpage_base_addr + act_date)

    month_max, month_min, month_avg, month_rain = _fetch_month_series(webpage_base_addr + act_date)[:4]
    t_max.append(month_max)
    t_min.append(month_min)
    t_avg.append(month_avg)
    rain.append(month_rain)
    print('%s: %d daily values' % (act_date, len(month_max)))

    # one date string per parsed day, used as the timestamp column
    for day in range(1, len(month_max) + 1):
      dates.append(str(year) + '-' + str(month) + '-' + str(day))

# Saving arrays to .npy files. Months differ in length, so the per-month lists
# are ragged; dtype=object states that explicitly (recent NumPy versions refuse
# to build a ragged array implicitly) and keeps the on-disk layout unchanged.
np.save('t_max', np.array(t_max, dtype=object))
np.save('t_min', np.array(t_min, dtype=object))
np.save('t_avg', np.array(t_avg, dtype=object))
np.save('rain', np.array(rain, dtype=object))
np.save('dates', dates)

In [None]:
# Loading back from .npy files and flattening the per-month lists into one
# continuous daily series each.
# The four measurement files hold ragged per-month lists, which NumPy stores as
# pickled object arrays. Newer NumPy releases refuse to unpickle by default, so
# allow_pickle=True is passed explicitly; the argument is accepted by older
# releases as well.
# NOTE: unpickling executes code from the file - only load files written by
# this notebook.
t_max=[item for sublist in np.load('t_max.npy', allow_pickle=True) for item in sublist]
t_min=[item for sublist in np.load('t_min.npy', allow_pickle=True) for item in sublist]
t_avg=[item for sublist in np.load('t_avg.npy', allow_pickle=True) for item in sublist]
rain=[item for sublist in np.load('rain.npy', allow_pickle=True) for item in sublist]
# the dates were saved from a flat list of strings, so no pickling is involved
dates=np.load('dates.npy')

In [None]:
# Creating the pandas DataFrame: one row per day; zip() pairs the columns
# element-wise and stops at the shortest one.
df = pd.DataFrame(
    data=list(zip(dates, t_max, t_min, t_avg, rain)),
    columns=['Date', 'T_max', 'T_min', 'T_avg', 'Rain'],
)

In [None]:
# Saving to .csv. index=False keeps the automatic row index out of the file;
# otherwise it comes back as a spurious 'Unnamed: 0' column when the file is
# read again and shows up in the summary statistics.
df.to_csv('weather_data.csv', index=False)

In [None]:
# Reload the saved data set so that the following cells can run without scraping again.
df = pd.read_csv(filepath_or_buffer='weather_data.csv')

In [None]:
# summary statistics of the numeric columns, shown as the cell output
df.describe()

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.sequence import TimeseriesGenerator

from sklearn.preprocessing import StandardScaler

In [None]:
# Look-back window: 5 former values are used to predict a 6th one.
window_size = 5
# The generator builds the training pairs: the same series serves as input and
# as target, each sample being window_size consecutive values and the next one.
generator = TimeseriesGenerator(
    data=df['T_avg'].values,
    targets=df['T_avg'].values,
    length=window_size,
    batch_size=1,
)

In [None]:
# Show every training pair: the 5 former values and the 6th value assigned to them.
for sample_idx in range(len(generator)):
  window, target = generator[sample_idx]
  print(window, '=>', target)

In [None]:
# Creating the model: one hidden ReLU layer of 100 units fed with window_size
# inputs, followed by a single linear output unit.
model = Sequential([
    Dense(100, activation='relu', input_dim=window_size),
    Dense(1),
])
# Adam optimizer minimising the mean squared error.
model.compile(loss='mse', optimizer='adam')

In [None]:
# Training the model on the sliding-window generator.
# NOTE(review): steps_per_epoch=1 together with the generator's batch_size=1
# means that each epoch draws a single window, so the 1000 epochs amount to
# 1000 single-sample updates - confirm this is intended rather than one full
# pass over the generator per epoch.
model.fit_generator(
    generator=generator,
    steps_per_epoch=1,
    epochs=1000,
    verbose=1,
)

In [None]:
# Test input: the window_size values that precede the last row (the last row
# itself is excluded by the slice).
x_input = df['T_avg'].iloc[-(window_size + 1):-1]

In [None]:
# Shape the look-back window as one sample with window_size features and predict.
# np.asarray accepts both the pandas Series built for the test input and an
# already converted array, so running this cell a second time no longer fails
# on the missing `.values` attribute.
x_input = np.asarray(x_input).reshape((1, window_size))
yhat = model.predict(x_input, verbose=1)

In [None]:
# display the model's prediction for the test window as the cell output
yhat

In [None]:
# Three independent copies of the observed average temperatures, one for each
# target date, so that appending predictions to one leaves the others untouched.
df_extend, df_extend2, df_extend3 = (df['T_avg'].values.copy() for _ in range(3))

In [None]:
# dates wished to predict
date1 = date(2019, 10, 30)
date2 = date(2019, 11, 5)
date3 = date(2019, 11, 26)

# The horizons are anchored on the last day present in the data set instead of
# date.today(): the result then no longer depends on the day the notebook is
# run, and the number of predictions always matches the gap between the data
# and the target date.
last_observed = pd.to_datetime(df['Date'].iloc[-1]).date()

# number of daily predictions needed to reach each date; clamped at zero so a
# target that is already covered by the data never yields a negative horizon
days_left1 = max((date1 - last_observed).days, 0)
days_left2 = max((date2 - last_observed).days, 0)
days_left3 = max((date3 - last_observed).days, 0)

In [None]:
# The average temperature of a day is predicted from the average temperatures
# of the 5 days before it. Reaching the target date therefore needs every day
# in between predicted as well, i.e. one prediction per remaining day.
# NOTE(review): every refit uses steps_per_epoch=1, so each of the 200 epochs
# consumes a single batch - confirm this is intended.
for day_pred in range(days_left1):
  # rebuild the training pairs so that they include the values predicted so far
  generator = TimeseriesGenerator(data=df_extend, targets=df_extend, length=window_size, batch_size=1)
  model.fit_generator(generator=generator, steps_per_epoch=1, epochs=200, verbose=1)
  # the most recent window_size values, shaped as a single sample
  df_input = df_extend[-window_size:].copy().reshape((1, window_size))
  yhat = model.predict(df_input, verbose=1)  # prediction for the next day
  print(yhat)
  # the predicted value becomes part of the series used in the next iteration
  df_extend = np.concatenate((df_extend, yhat.ravel()))

In [None]:
# the last element of the extended series is the value for the first target date
print('Predicted temperature for 30th of October:', np.round(df_extend[-1], 2), 'Celsius')

In [None]:
# date2 prediction: the same day-by-day roll-forward on the second copy of the
# series. The shared `model` object is trained further here, it is not rebuilt.
# NOTE(review): every refit uses steps_per_epoch=1, so each of the 200 epochs
# consumes a single batch - confirm this is intended.
for day_pred in range(days_left2):
  # rebuild the training pairs so that they include the values predicted so far
  generator = TimeseriesGenerator(data=df_extend2, targets=df_extend2, length=window_size, batch_size=1)
  model.fit_generator(generator=generator, steps_per_epoch=1, epochs=200, verbose=1)
  # the most recent window_size values, shaped as a single sample
  df_input = df_extend2[-window_size:].copy().reshape((1, window_size))
  yhat = model.predict(df_input, verbose=1)
  print(yhat)
  # the predicted value becomes part of the series used in the next iteration
  df_extend2 = np.concatenate((df_extend2, yhat.ravel()))

In [None]:
# persist the series extended up to the second target date (written to model2.npy)
np.save(file='model2', arr=df_extend2)

In [None]:
# the last element of the extended series is the value for the second target date
print('Predicted temperature for 5th of November:', np.round(df_extend2[-1], 2), 'Celsius')

In [None]:
# Last 20 values of the extended series; the predicted values are highlighted in red.
plt.plot(df_extend2[-20:])
plt.axvspan(xmin=20-days_left2, xmax=20, alpha=0.5, color='red')
plt.show()

In [None]:
# date3 prediction: the same day-by-day roll-forward on the third copy of the
# series. The shared `model` object is trained further here, it is not rebuilt.
# NOTE(review): every refit uses steps_per_epoch=1, so each of the 200 epochs
# consumes a single batch - confirm this is intended.
for day_pred in range(days_left3):
  # rebuild the training pairs so that they include the values predicted so far
  generator = TimeseriesGenerator(data=df_extend3, targets=df_extend3, length=window_size, batch_size=1)
  model.fit_generator(generator=generator, steps_per_epoch=1, epochs=200, verbose=1)
  # the most recent window_size values, shaped as a single sample
  df_input = df_extend3[-window_size:].copy().reshape((1, window_size))
  yhat = model.predict(df_input, verbose=1)
  print(yhat)
  # the predicted value becomes part of the series used in the next iteration
  df_extend3 = np.concatenate((df_extend3, yhat.ravel()))

In [None]:
# persist the series extended up to the third target date (written to model3.npy)
np.save(file='model3', arr=df_extend3)

In [None]:
# the last element of the extended series is the value for the third target date
print('Predicted temperature for 26th of November:', np.round(df_extend3[-1], 2), 'Celsius')

In [None]:
# Last 50 values of the extended series; the red span covers the final
# days_left3 positions, i.e. the predicted values.
plt.plot(df_extend3[-50:])
plt.axvspan(xmin=50-days_left3, xmax=50, alpha=0.5, color='red')
plt.show()

# To be honest, I found this task ill-suited to a neural network. It could have been solved more easily and more precisely with other ML tools such as decision trees. Essentially, the NN had to learn the trend of the weather time series, which is possible but comes with the difficulty of choosing the window size correctly. As far as I can see, the most important hyperparameter here is the training window size, as the network can easily become overfitted. In this case I chose a relatively small window of 5 days; on the plots it is clearly visible that the NN learnt the trend from the last week, which was quite warm.