In [1]:
import pandas as pd #for data manipulation and analysis
import bs4 #library for pulling data out of HTML
import requests #will allow you to send HTTP/1.1 requests
from tqdm import tqdm #instantly make your loops show a smart progress meter
import numpy as np #for advanced mathematical operations 
import datetime #for manipulating dates and times in both simple and complex ways

In [2]:
# Listing-page URL without the trailing `limitstart` offset; the numeric offset is
# appended per page when the listing pages are fetched.
base_url = "http://case.doe.gov.bd/index.php?option=com_content&view=category&id=8&Itemid=32&limitstart="

If we look at the main page, we see that it lists only 30 entries (links to the pages holding the actual data); the next 30 days of data are on the following page. Comparing a few page URLs reveals a pattern:

• Page01 - http://case.doe.gov.bd/index.php?option=com_content&view=category&id=8&Itemid=32&limitstart= <br>
• Page02 - http://case.doe.gov.bd/index.php?option=com_content&view=category&id=8&Itemid=32&limitstart=30 <br>
• Page03 - http://case.doe.gov.bd/index.php?option=com_content&view=category&id=8&Itemid=32&limitstart=60 <br>
• Page67 - http://case.doe.gov.bd/index.php?option=com_content&view=category&id=8&Itemid=32&limitstart=1980 <br>

So we need to generate these URLs programmatically, so that we can loop through them and parse each page.

In [3]:
# Every listing-page URL ends in a numeric `limitstart` offset that grows in fixed
# steps; build the list of offsets that will be appended to the base URL later.

first = 0    # offset of the first listing page
last = 1980  # offset of the last listing page
inc = 30     # step between consecutive page offsets

# range() excludes its stop value, so stop at last + inc to keep `last` in the list.
page_numbers = list(range(first, last + inc, inc))

In [4]:
# Visit every primary (listing) page and collect the secondary URLs it links to.

REQUEST_TIMEOUT = 30  # seconds; without a timeout a single stalled response hangs the whole crawl

link_list = []

# One Session for all listing pages so the underlying connection is reused.
with requests.Session() as session:
    for item in tqdm(page_numbers):
        extraction_url = base_url + str(item)

        r = session.get(extraction_url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()

        page = bs4.BeautifulSoup(r.text, "html.parser")

        for li in page.find_all('a'):
            href = li.get('href')
            # Anchors without an href return None, which cannot be concatenated; skip them.
            if href and "Air Quality Index (AQI)_" in li.text:
                link_list.append("http://case.doe.gov.bd" + href)

100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [01:14<00:00,  1.21s/it]


In [5]:
# Total number of secondary URLs collected in link_list
len(link_list)

1997

In [6]:
def find_date(cur_page):
    """Return the date label found on a parsed page.

    Scans every ``<span>`` of ``cur_page`` in document order and returns the
    whitespace-stripped text of the first one whose text, once lower-cased
    and stripped, starts with ``"date"``. Returns ``None`` when no span matches.
    """
    span_texts = (span.text for span in cur_page.find_all("span"))
    matches = (
        text.strip()
        for text in span_texts
        if text.lower().strip().startswith("date")
    )
    return next(matches, None)

In [7]:
def get_data_from_website(url, timeout=30):
    """Download one page and return its table rows as a DataFrame.

    Every ``<tr>`` that contains at least one ``<td>`` becomes one row, so a
    header row built from ``<td>`` cells is kept as data. Cell text is stored
    as-is (not stripped). The page date, as returned by ``find_date``, and the
    source ``url`` are repeated on every row.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled response cannot hang the caller indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns: Location, Air Quality Index (AQI), AQI Category, AQI Range,
        Date, URL. The frame is empty when the page has no ``<td>`` rows.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a row does not have exactly one cell per data column.
    """
    data_cols = ["Location", "Air Quality Index (AQI)", "AQI Category", "AQI Range"]

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    page = bs4.BeautifulSoup(r.text, "html.parser")

    # Collect plain lists first and build the frame once; growing a DataFrame
    # one row at a time copies data on every insert.
    rows = []
    for tr in page.find_all('tr'):
        cells = [td.text for td in tr.find_all("td")]
        if not cells:
            continue  # rows without <td> cells carry no data
        if len(cells) != len(data_cols):
            # Fail loudly, naming the page, rather than padding or truncating the row.
            raise ValueError(
                "Expected %d cells per table row but found %d at %s"
                % (len(data_cols), len(cells), url)
            )
        rows.append(cells)

    df = pd.DataFrame(rows, columns=data_cols)
    df["Date"] = find_date(page)
    df["URL"] = url

    return df

In [8]:
# Scrape every collected link; each call returns one DataFrame, stored in link order.
# An explicit loop (rather than a comprehension) keeps the frames gathered so far
# available in `dfs` if a request fails part-way through the long run.
dfs = []
for link in tqdm(link_list):
    page_table = get_data_from_website(link)
    dfs.append(page_table)

100%|██████████████████████████████████████████████████████████████████████████████| 1997/1997 [16:48<00:00,  2.75it/s]


In [9]:
# Build the final DataFrame by stacking the tables collected from each link.
# DataFrame.append was removed in pandas 2.0 and, used in a loop, re-copies the
# accumulated frame on every iteration; a single pd.concat does the same job once.
# Each table keeps its own row labels (no ignore_index), as the append loop did.
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
DF = pd.concat(dfs) if dfs else pd.DataFrame()

In [10]:
# Total number of rows in the combined frame
len(DF)

18472

In [11]:
# Preview the first rows of the combined frame
DF.head()

Unnamed: 0,Location,Air Quality Index (AQI),AQI Category,AQI Range,Date,URL
0,LOCATION,Air Quality Index (AQI),AQI CATEGORY,AQI RANGE,Date: 16/09/2019,http://case.doe.gov.bd/index.php?option=com_co...
1,DHAKAb,82,MODERATE,,Date: 16/09/2019,http://case.doe.gov.bd/index.php?option=com_co...
2,GAZIPURc,84,MODERATE,,Date: 16/09/2019,http://case.doe.gov.bd/index.php?option=com_co...
3,NARAYANGANJc,54,MODERATE,,Date: 16/09/2019,http://case.doe.gov.bd/index.php?option=com_co...
4,CHITTAGONGc,55,MODERATE,37-95,Date: 16/09/2019,http://case.doe.gov.bd/index.php?option=com_co...


### This merged dataset needs further cleaning, since the scraped tables also contain garbage values (for example, the table header appears as a data row, as in row 0 above).