In [None]:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import numpy as np
import datetime
from datetime import datetime

In [None]:
#EXECUTE ONLY ONCE TO SCRAPE WEATHER DATA
# Loads the Weather Underground daily-history page for station KMDW for every
# day of 2017 in a Safari webdriver session, parses the rendered observation
# table with BeautifulSoup and collects every data row, tagged with its date,
# into `df`.
# NOTE(review): newer Selenium releases configure the driver location through a
# Service object - confirm the installed version still accepts `executable_path`.
driver = webdriver.Safari(executable_path ='/usr/bin/safaridriver')
weather_columns = ['time', 'temp', 'dew_point', 'humidity', 'wind', 'wind_speed', 'wind_gust', 'pressure', 'precip', 'condition', 'date']
scraped_rows = []

try:
    # date_range yields exactly the calendar days of the year, so no hand-kept
    # days-per-month table is needed.
    for day in pd.date_range(start='2017-01-01', end='2017-12-31', freq='D'):
        i, j = day.month, day.day
        date = f'{j}.{i}.2017'
        print(date)
        url = f'https://www.wunderground.com/history/daily/KMDW/date/2017-{i}-{j}'
        driver.get(url)
        # The table is rendered client-side; wait before reading the page source.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', attrs={'class': 'mat-table cdk-table mat-sort ng-star-inserted' })
        if table is None:
            # The page did not contain the table (yet); report the day instead of
            # crashing so the rest of the year is still collected.
            print(f'WARNING: no observation table found for {date} - skipped')
            continue
        for tr in table.find_all('tr'):
            cells = [td.text for td in tr.find_all('td')]
            # Rows without <td> cells (e.g. the header row) come back empty.
            if cells:
                scraped_rows.append(cells + [date])
finally:
    # Always release the browser session, even when a page load or parse fails.
    driver.quit()

# Build the frame once instead of appending row by row.
df = pd.DataFrame(scraped_rows, columns=weather_columns)
      


In [None]:
# Show the last rows of the scraped frame as a quick sanity check.
df.tail()

In [None]:
#save scraped data in csv
# index=False keeps the row index out of the file.
# NOTE(review): assumes the ./data directory already exists - confirm.
df.to_csv('./data/weather_data_date.csv', index = False)

In [None]:
#load scraped data for further processing
# Rebinds `df` to the contents of the CSV written by the save cell, so the
# cells below can run from the file without repeating the scrape.
df = pd.read_csv('./data/weather_data_date.csv')

In [None]:
#filter out all datapoints that have been collected outside of the hourly (XX:53 AM/PM) weather measurements
# The suffixes get their own name so the builtin `tuple` is not shadowed for
# the rest of the kernel session.
hourly_suffixes = ('53 AM', "53 PM")
# na=False treats rows with a missing time as "not hourly" so the mask stays
# purely boolean.
is_hourly = df['time'].str.endswith(hourly_suffixes, na=False)
df_hourly_measurement = df[is_hourly]
len(df_hourly_measurement)

In [None]:
# Renumber the filtered rows from 0; without drop=True the previous row labels
# are kept as a new 'index' column.
df_hourly_measurement = df_hourly_measurement.reset_index()

In [None]:
# Convert the 12-hour clock strings (e.g. '12:53 AM') to 24-hour 'HH:MM'.
df_hourly_measurement['time'] = (
    pd.to_datetime(df_hourly_measurement['time'], format='%I:%M %p')
    .dt.strftime('%H:%M')
)
# Convert the 'D.M.YYYY' date strings to ISO 'YYYY-MM-DD'.
df_hourly_measurement['date'] = (
    pd.to_datetime(df_hourly_measurement['date'], format='%d.%m.%Y')
    .dt.strftime('%Y-%m-%d')
)
# Move 'date' to position 1: pop() removes the column and returns it.
date = df_hourly_measurement.pop('date')
df_hourly_measurement.insert(1, 'date', date)
# Discard the 'index' column holding the old row labels.
df_hourly_measurement.drop(columns=['index'], inplace=True)
df_hourly_measurement.head()

In [None]:
#Identify Duplicates
# Count the rows per (date, time) pair and keep only the pairs seen more than once.
duplicate_counts = (
    df_hourly_measurement
    .groupby(['date', 'time'])
    .size()
    .reset_index(name='count')
    .loc[lambda counts: counts['count'] > 1]
)

print(duplicate_counts)

In [None]:
#Drop Duplicates
# Rows sharing a (date, time) pair are collapsed in place; the pandas default
# keep='first' retains the first occurrence of each pair.
df_hourly_measurement.drop_duplicates(['date', 'time'], inplace=True)

In [None]:
#Check for duplicates again, to see if it worked
# Same per-(date, time) count as before; an empty frame means every pair is unique.
duplicate_counts = (
    df_hourly_measurement
    .groupby(['date', 'time'])
    .size()
    .reset_index(name='count')
    .loc[lambda counts: counts['count'] > 1]
)
print(duplicate_counts)

In [None]:
# Order the rows chronologically: the 00:53 reading was listed at the end of
# each day although it belongs at the start. Renumber the rows afterwards,
# because sorting keeps the old labels.
df_hourly_measurement = (
    df_hourly_measurement
    .sort_values(by=['date', 'time'])
    .reset_index(drop=True)
)
df_hourly_measurement.head()

In [None]:
# 365*24 is the number of hourly readings expected for a 365-day year; the
# difference to the rows present is the number of missing hours.
print('Number of missing Hours: ',(365*24)- len(df_hourly_measurement))

In [None]:
# Create reference dataframes for date and time: They contain 24 hours for each day and will be used to compare with our dataframe
# Thus helping to identify missing hours

ref_dates = pd.date_range(start = "2017-01-01", end = "2017-12-31", freq = 'D')
# A Timedelta step is used instead of the 'H' alias, which recent pandas
# versions deprecate; the result is still the 24 times 00:53 ... 23:53.
ref_times = pd.date_range(start = "00:53", end = "23:53", freq = pd.Timedelta(hours=1)).time
#Create 24 date-time couples, for every date
# The loop variables are named so they cannot be confused with the `time`
# module or the notebook's `date` variable.
ref_df = pd.DataFrame(
    [(ref_date, ref_time) for ref_date in ref_dates for ref_time in ref_times],
    columns=['date', 'time'],
)
#convert columns for later combination
ref_df['time'] = ref_df['time'].astype(str)
ref_df['date'] = ref_df['date'].dt.strftime('%Y-%m-%d')
# Explicit format: the strings are always 'YYYY-MM-DD HH:MM:SS', so parsing
# does not depend on format inference.
ref_df['date_time'] = pd.to_datetime(ref_df['date'] + ' ' + ref_df['time'], format='%Y-%m-%d %H:%M:%S')
ref_df.head()

In [None]:
# Join the 'YYYY-MM-DD' date and 'HH:MM' time strings (no separator) and parse
# the result into a single timestamp column.
df_hourly_measurement['date_time'] = pd.to_datetime(
    df_hourly_measurement['date'].str.cat(df_hourly_measurement['time']),
    format='%Y-%m-%d%H:%M',
)
df_hourly_measurement.head()

In [430]:
#convert the date_time columns of df_hourly_measurement and ref_df into lists, so the
#missing timestamps can be obtained with the .difference method of a set
current_dt = df_hourly_measurement['date_time'].tolist()
expected_dt = ref_df['date_time'].tolist()
# Expected timestamps that never occur in the measurements; a set is unordered,
# so the resulting rows are not in chronological order.
missing_dt = list(set(expected_dt).difference(current_dt))
df_missing = pd.DataFrame({'date_time': missing_dt})
print(df_missing)

             date_time
0  2017-10-29 00:53:00
1  2017-11-19 13:53:00
2  2017-10-28 20:53:00
3  2017-11-17 20:53:00
4  2017-08-17 04:53:00
5  2017-08-17 03:53:00
6  2017-08-17 01:53:00
7  2017-08-17 00:53:00
8  2017-06-18 23:53:00
9  2017-07-18 04:53:00
10 2017-08-17 02:53:00
11 2017-03-25 23:53:00
12 2017-10-02 23:53:00
13 2017-03-12 00:53:00
14 2017-06-18 22:53:00
15 2017-03-26 02:53:00


In [465]:
# Outer-merge the measurements with the missing timestamps: the result has one
# row per hour, and the rows contributed by df_missing carry only `date_time`
# (every other column is NaN and still has to be filled). Rows are then put in
# chronological order and renumbered.
df_merged = (
    df_hourly_measurement
    .merge(df_missing, on='date_time', how='outer')
    .sort_values(by='date_time')
    .reset_index(drop=True)
)

In [470]:
# get all rows that have at least one NaN value: this covers the inserted
# missing hours (only date_time defined) as well as measurements that lack a
# single value
nan_rows = df_merged[df_merged.isna().any(axis=1)]
print(nan_rows)

#there are rows with one NaN, so only rows with more than one NaN are treated
#as the inserted placeholder hours
nan_counts = df_merged.isna().sum(axis=1)
inserted_rows = df_merged[nan_counts > 1]

nan_indices = inserted_rows.index

# Positions of every inserted row together with its direct neighbours.
# The positions come from df_merged, so they must be looked up in df_merged
# (not in the raw `df`, whose rows are numbered differently).
context_positions = np.concatenate([nan_indices - 1, nan_indices, nan_indices + 1])
# Discard positions outside the frame (a gap in the first or last row would
# otherwise wrap around or raise) and sort/deduplicate them so each gap is
# shown next to its neighbours.
context_positions = np.unique(
    context_positions[(context_positions >= 0) & (context_positions < len(df_merged))]
)
rows_to_view = df_merged.iloc[context_positions]

print(rows_to_view)

            date   time   temp dew_point humidity wind wind_speed wind_gust  \
178   2017-01-08  10:53   9 °F     -6 °F    51 °%  NaN     0 °mph    0 °mph   
180   2017-01-08  12:53  14 °F     -6 °F    41 °%  NaN     0 °mph    0 °mph   
1608  2017-03-09  00:53  37 °F     17 °F    44 °%  NaN     0 °mph    0 °mph   
1680         NaN    NaN    NaN       NaN      NaN  NaN        NaN       NaN   
2012  2017-03-25  20:53  47 °F     46 °F    97 °%  NaN     0 °mph    0 °mph   
2015         NaN    NaN    NaN       NaN      NaN  NaN        NaN       NaN   
2018         NaN    NaN    NaN       NaN      NaN  NaN        NaN       NaN   
2183  2017-04-01  23:53  45 °F     21 °F    39 °%  NaN     0 °mph    0 °mph   
4054         NaN    NaN    NaN       NaN      NaN  NaN        NaN       NaN   
4055         NaN    NaN    NaN       NaN      NaN  NaN        NaN       NaN   
4756         NaN    NaN    NaN       NaN      NaN  NaN        NaN       NaN   
5472         NaN    NaN    NaN       NaN      NaN  N