In [1]:
import bs4 as bs
import urllib.request
import pandas as pd
import requests
from requests.exceptions import HTTPError
import os

from csv import DictReader, DictWriter

In [2]:
# Station pages handed to certified_links().
urls = [
    'https://geographic.org/global_weather/pakistan/hyderabad_airport_417640_99999.html',
]

def url_checker(url, timeout=10):
    """Check whether a URL can be fetched successfully.

    Parameter
    ---------
    url: str
        Address requested with an HTTP GET.
    timeout: float
        Seconds to wait for the server before giving up (default 10).

    Return
    ------
    200: if the server answered with status code 200.
    None: for any other status code, or if the request itself failed
          (connection error, timeout, malformed URL, ...).
    """
    try:
        # Without a timeout a single stalled server would hang the whole crawl.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # An unreachable site is reported the same way as a missing one.
        return None
    return 200 if response.status_code == 200 else None

def certified_links(urls,
                    target=r'/home/hamza/Desktop/LEARNING/webScraping/data',
                    file_name="Hyderabad_Airport.csv"):
    """Write a .csv file listing the year/month data pages that exist.

    For every url the year select box of the page is scraped, then each
    (year, month) combination is probed with url_checker() and the pages
    that answer with status 200 are recorded.

    NOTE: the probed link is hard-coded to station 417640-99999
    (Hyderabad Airport) and the same output file is reopened in 'w' mode
    for every url, so with several urls only the last one survives.

    Parameter
    ---------
    urls: list of str
        Station pages that contain the year select box.
    target: str
        Directory the .csv file is written to.
    file_name: str
        Name of the .csv file.

    Return
    ------
    None:
    * Save data into .csv file with the columns year, month, link, status.
    """
    months = {
        "January": "01",
        "February": "02",
        "March": "03",
        "April": "04",
        "May": "05",
        "June": "06",
        "July": "07",
        "August": "08",
        "September": "09",
        "October": "10",
        "November": "11",
        "December": "12",
    }

    for url in urls:
        # Close the HTTP response as soon as the HTML has been read.
        with urllib.request.urlopen(url) as response:
            source = response.read()
        soup = bs.BeautifulSoup(source, 'lxml')

        # Reset for every url so a page without a year select box yields no
        # years instead of a NameError (or stale years from the previous url).
        years = []
        for select_box in soup.find_all(id="selectBox", class_="year"):
            # Keep only the 4-character entries, i.e. the available years.
            years = [entry for entry in select_box.text.split("\n") if len(entry) == 4]

        # newline='' is what the csv module expects for files it writes.
        with open(os.path.join(target, file_name), 'w', newline='') as f:
            burnData = DictWriter(f, fieldnames=['year', 'month', 'link', 'status'])
            burnData.writeheader()

            for year in years:
                # Not every month exists for every year, so each one is probed.
                # monthKey = "January", monthValue = "01"
                for monthKey, monthValue in months.items():
                    link = f"https://geographic.org/global_weather/weather_data_2.php?month={monthValue}&year={year}&id=417640-99999&path=weather_stations/415730_471962/417640-99999.txt&name=Hyderabad%20Airport&country=Pakistan"
                    if url_checker(link) == 200:
                        print(f"{year}: {monthKey}")
                        burnData.writerow(
                            {'year': year, 'month': monthKey, 'link': link, 'status': 'Undone'}
                        )
            
            
# certified_links(urls)

2016: January
2016: February
2016: March
2016: April
2016: May
2016: June
2016: July
2016: Augst
2016: September
2016: October
2016: November
2016: December
2015: January
2015: February
2015: March
2015: April
2015: May
2015: June
2015: July
2015: Augst
2015: September
2015: October
2015: November
2015: December
2014: January
2014: February
2014: March
2014: April
2014: May
2014: June
2014: July
2014: Augst
2014: September
2014: October
2014: November
2014: December
2013: January
2013: February
2013: March
2013: April
2013: May
2013: June
2013: July
2013: Augst
2013: September
2013: October
2013: November
2013: December
2012: January
2012: February
2012: March
2012: April
2012: May
2012: June
2012: July
2012: Augst
2012: September
2012: October
2012: November
2012: December
2011: January
2011: February
2011: March
2011: April
2011: May
2011: June
2011: July
2011: Augst
2011: September
2011: October
2011: November
2011: December
2010: January
2010: February
2010: March
2010: April
2010:

In [3]:
def fetchTo_xlsx(csv_path=r'/home/hamza/Desktop/LEARNING/webScraping/data/Hyderabad_Airport.csv'):
    """Download the tables listed in the links .csv into one workbook per year.

    The .csv file must have the columns 'year', 'month' and 'link'. For every
    year a file '<year>.xlsx' is written to the current working directory,
    with one sheet per month whose link could be read.

    Parameter
    ---------
    csv_path: str
        Path of the .csv file to read.

    Return
    ------
    None:
    * Save data into .xlsx files.
    """
    # Group the links by year first, so each workbook is opened exactly once
    # and only ever receives sheets that belong to its own year.
    links_by_year = {}
    with open(csv_path, newline='') as rf:
        for row in DictReader(rf):
            links_by_year.setdefault(row['year'], []).append((row['month'], row['link']))

    for year, month_links in links_by_year.items():
        sheets = {}
        for month, link in month_links:
            try:
                dfs = pd.read_html(link, header=0)
            except Exception as err:
                # Best effort: one unreadable page must not stop the others,
                # but say so instead of swallowing the failure silently.
                print(f"{year} : {month} skipped ({err})")
                continue
            print(f"{year} : {month}")
            if dfs:
                # When a page holds several tables the last one is kept.
                sheets[month] = dfs[-1]

        if not sheets:
            # Nothing could be read for this year; do not create a workbook.
            continue

        # The context manager saves and closes the workbook, even on error.
        with pd.ExcelWriter(f'{year}.xlsx', engine='xlsxwriter') as writer:
            for sheet_name, df in sheets.items():
                df.to_excel(writer, sheet_name=sheet_name, index=False)

# fetchTo_xlsx()

In [None]:
class Crawler:
    """Collect the year/month data links reachable from station pages.

    Parameter
    ---------
    place_url: list of str (a single str is accepted as well)
        Station pages that contain the year select box.
    """

    def __init__(self, place_url):
        self.place_url = place_url

        # Month name -> two-digit value used in the link's query string.
        self.months = {
            "January": "01",
            "February": "02",
            "March": "03",
            "April": "04",
            "May": "05",
            "June": "06",
            "July": "07",
            "August": "08",
            "September": "09",
            "October": "10",
            "November": "11",
            "December": "12",
        }

    def mkdir(self):
        """Create the 'data' directory if needed and make it the working directory."""
        # exist_ok lets the crawler run again without failing on the old folder.
        os.makedirs(r"data", exist_ok=True)
        os.chdir(r'data')

    def url_checker(self, url, timeout=10):
        """Check whether a URL can be fetched successfully.

        Parameter
        ---------
        url: str
            Address requested with an HTTP GET.
        timeout: float
            Seconds to wait for the server before giving up (default 10).

        Return
        ------
        200: if the server answered with status code 200.
        None: for any other status code, or if the request itself failed.
        """
        try:
            # Without a timeout a single stalled server would hang the crawl.
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            return None
        return 200 if response.status_code == 200 else None

    def certified_links(self, file_name="Hyderabad_Airport.csv"):
        """Write a .csv file listing the year/month data pages that exist.

        The year select box of every page in self.place_url is scraped, then
        each (year, month) combination is probed with self.url_checker() and
        the pages that answer with status 200 are recorded.

        NOTE: the probed link is hard-coded to station 417640-99999
        (Hyderabad Airport) and the same output file is reopened in 'w' mode
        for every url, so with several urls only the last one survives.

        Parameter
        ---------
        file_name: str
            Name of the .csv file, created in the current working directory.

        Return
        ------
        None:
        * Save data into .csv file with the columns year, month, link, status.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(self.place_url, str):
            place_urls = [self.place_url]
        else:
            place_urls = self.place_url

        for url in place_urls:
            # Close the HTTP response as soon as the HTML has been read.
            with urllib.request.urlopen(url) as response:
                source = response.read()
            soup = bs.BeautifulSoup(source, 'lxml')

            # Reset for every url so a page without a year select box yields
            # no years instead of a NameError (or stale years).
            years = []
            for select_box in soup.find_all(id="selectBox", class_="year"):
                # Keep only the 4-character entries, i.e. the available years.
                years = [entry for entry in select_box.text.split("\n") if len(entry) == 4]

            # newline='' is what the csv module expects for files it writes.
            with open(file_name, 'w', newline='') as f:
                burnData = DictWriter(f, fieldnames=['year', 'month', 'link', 'status'])
                burnData.writeheader()

                for year in years:
                    # Not every month exists for every year, so each is probed.
                    # monthKey = "January", monthValue = "01"
                    for monthKey, monthValue in self.months.items():
                        link = f"https://geographic.org/global_weather/weather_data_2.php?month={monthValue}&year={year}&id=417640-99999&path=weather_stations/415730_471962/417640-99999.txt&name=Hyderabad%20Airport&country=Pakistan"
                        if self.url_checker(link) == 200:
                            print(f"{year}: {monthKey}")
                            burnData.writerow(
                                {'year': year, 'month': monthKey, 'link': link, 'status': 'Undone'}
                            )
                            
    


In [None]:
# Station pages to crawl.
urls = [
    'https://geographic.org/global_weather/pakistan/hyderabad_airport_417640_99999.html',
]
