In [1]:
# Reference: https://stackoverflow.com/questions/51756775/scraping-table-from-website-timeanddate-com
# Reference: https://stackoverflow.com/questions/29858752/error-message-chromedriver-executable-needs-to-be-available-in-the-path
# Reference: https://stackoverflow.com/questions/379906/how-do-i-parse-a-string-to-a-float-or-int-in-python


import re
import numpy as np
import pandas as pd
from datetime import datetime


from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.request import urlopen
from webdriver_manager.chrome import ChromeDriverManager


def str2float(value, as_int=False):
    try:
        number = float(re.sub('[^.\-\d]', '', value))
        if as_int:
            return int(number + 0.5)
        else:
            return number
    except ValueError:
        return float('nan')  # or None if you wish

In [2]:
df = None


month_limit = 2
year_incl_limit = 2019


for year in [2017, 2018, 2019]:
    for month in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
        if (month > month_limit) and (year >= year_incl_limit): break
        url = "https://www.timeanddate.com/weather/bangladesh/dhaka/historic?month={0}&year={1}".format(month, year)

        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.get(url); _d = {}

        for i in driver.find_element_by_id('wt-his-select').find_elements_by_tag_name('option'):
            
            tabledata = []
            
            i.click(); page = urlopen(url);
            print('Collecting data of :', i.text)
            
            if i.text == 'Previous 24 hours': continue
            soup = BeautifulSoup(page, "html.parser")

            table = soup.find('table', attrs = {'id' : 'wt-his'})
            for tr in table.find('tbody').find_all('tr'):
                
                dict = {}
                all_td = tr.find_all('td')
                dict['temp'] = all_td[1].text

                dict['wind'] = all_td[3].text
                dict['weather'] = all_td[2].text

                dict['humidity'] = all_td[5].text
                dict['barometer'] = all_td[6].text
                dict['visibility'] = all_td[7].text
                dict['timestamp'] = tr.find('th').text.strip()

                tabledata.append(dict)

            tmp_df = pd.DataFrame(tabledata)
            tmp_df['date_timestamp'] = i.text + ' ' + tmp_df['timestamp'].str[0:5]
            tmp_df['date_timestamp'] = pd.to_datetime(tmp_df['date_timestamp'])
            tmp_df.drop('timestamp', axis = 1, inplace = True)
            
            try:
                if sum(df.date_timestamp.dt.date.unique() == tmp_df.date_timestamp.dt.date.unique()[0]) < 1:
                    df = pd.concat([df, tmp_df])
            except: df = pd.concat([df, tmp_df])

    if (month > month_limit) and (year >= year_incl_limit): break
        

df.dropna(inplace = True)
df['visibility'] = df['visibility'].fillna('').apply(str2float)
df['barometer'] = df['barometer'].fillna('').apply(str2float)
df['humidity'] = df['humidity'].fillna('').apply(str2float)
df['temp'] = df['temp'].fillna('').apply(str2float)


df.sort_values(by = ['date_timestamp'], inplace = True, ascending = True)


df.head()
df.to_csv('dhaka_weather_data.csv')


Checking for linux64 chromedriver:2.45 in cache
Driver found in /home/ehsan/.wdm/chromedriver/2.45/linux64/chromedriver
Collecting data of : 1 January 2017
Collecting data of : 2 January 2017
Collecting data of : 3 January 2017
Collecting data of : 4 January 2017
Collecting data of : 5 January 2017
Collecting data of : 6 January 2017
Collecting data of : 7 January 2017
Collecting data of : 8 January 2017
Collecting data of : 9 January 2017
Collecting data of : 10 January 2017
Collecting data of : 11 January 2017
Collecting data of : 12 January 2017
Collecting data of : 13 January 2017
Collecting data of : 14 January 2017
Collecting data of : 15 January 2017
Collecting data of : 16 January 2017
Collecting data of : 17 January 2017
Collecting data of : 18 January 2017
Collecting data of : 19 January 2017
Collecting data of : 20 January 2017
Collecting data of : 21 January 2017
Collecting data of : 22 January 2017
Collecting data of : 23 January 2017
Collecting data of : 24 January 2017


Collecting data of : 31 July 2017

Checking for linux64 chromedriver:2.45 in cache
Driver found in /home/ehsan/.wdm/chromedriver/2.45/linux64/chromedriver
Collecting data of : 1 August 2017
Collecting data of : 2 August 2017
Collecting data of : 3 August 2017
Collecting data of : 4 August 2017
Collecting data of : 5 August 2017
Collecting data of : 6 August 2017
Collecting data of : 7 August 2017
Collecting data of : 8 August 2017
Collecting data of : 9 August 2017
Collecting data of : 10 August 2017
Collecting data of : 11 August 2017
Collecting data of : 12 August 2017
Collecting data of : 13 August 2017
Collecting data of : 14 August 2017
Collecting data of : 15 August 2017
Collecting data of : 16 August 2017
Collecting data of : 17 August 2017
Collecting data of : 18 August 2017
Collecting data of : 19 August 2017
Collecting data of : 20 August 2017
Collecting data of : 21 August 2017
Collecting data of : 22 August 2017
Collecting data of : 23 August 2017
Collecting data of : 24 Au

Collecting data of : 14 February 2018
Collecting data of : 15 February 2018
Collecting data of : 16 February 2018
Collecting data of : 17 February 2018
Collecting data of : 18 February 2018
Collecting data of : 19 February 2018
Collecting data of : 20 February 2018
Collecting data of : 21 February 2018
Collecting data of : 22 February 2018
Collecting data of : 23 February 2018
Collecting data of : 24 February 2018
Collecting data of : 25 February 2018
Collecting data of : 26 February 2018
Collecting data of : 27 February 2018
Collecting data of : 28 February 2018

Checking for linux64 chromedriver:2.45 in cache
Driver found in /home/ehsan/.wdm/chromedriver/2.45/linux64/chromedriver
Collecting data of : 1 March 2018
Collecting data of : 2 March 2018
Collecting data of : 3 March 2018
Collecting data of : 4 March 2018
Collecting data of : 5 March 2018
Collecting data of : 6 March 2018
Collecting data of : 7 March 2018
Collecting data of : 8 March 2018
Collecting data of : 9 March 2018
Col

Collecting data of : 14 September 2018
Collecting data of : 15 September 2018
Collecting data of : 16 September 2018
Collecting data of : 17 September 2018
Collecting data of : 18 September 2018
Collecting data of : 19 September 2018
Collecting data of : 20 September 2018
Collecting data of : 21 September 2018
Collecting data of : 22 September 2018
Collecting data of : 23 September 2018
Collecting data of : 24 September 2018
Collecting data of : 25 September 2018
Collecting data of : 26 September 2018
Collecting data of : 27 September 2018
Collecting data of : 28 September 2018
Collecting data of : 29 September 2018
Collecting data of : 30 September 2018

Checking for linux64 chromedriver:2.45 in cache
Driver found in /home/ehsan/.wdm/chromedriver/2.45/linux64/chromedriver
Collecting data of : 1 October 2018
Collecting data of : 2 October 2018
Collecting data of : 3 October 2018
Collecting data of : 4 October 2018
Collecting data of : 5 October 2018
Collecting data of : 6 October 2018
