In [81]:
# import necessary libraries
import csv
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup



In [82]:
# Scrape the Napa County crop-report archive page and record every
# English-language report as a (County, Year, Links) row in farm_smoke.csv.
# This cell opens the CSV in 'w' mode, so it starts the file from scratch
# and writes the header row.

url = 'https://www.countyofnapa.org/Archive.aspx?AMID=39'
napa_base_url = 'https://www.countyofnapa.org/'

# A timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status() aborts on an HTTP error *before* the CSV is truncated,
# instead of silently producing a header-only file from an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# 'html' is a markup type, not a parser name: BeautifulSoup picks whichever
# HTML parser is installed. It is left unchanged because the table indexing
# below depends on how the page was parsed.
soup = BeautifulSoup(response.text, 'html')
tables = soup.find_all('table')

# Parallel lists: valid_years_napa[i] is the year of pdf_links_napa[i].
pdf_links_napa = []
valid_years_napa = []

with open('farm_smoke.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    headers = ["County", "Year", "Links"]
    writer.writerow(headers)

    # Skip the first three <table> tags: they were not relevant links.
    for table in tables[3:]:

        # Each archive entry is expected to look like
        # <span class="archive"> ... <span>YEAR ...</span> ... <a href=...>.
        # Tables without that structure are skipped instead of raising
        # AttributeError on None.
        table_info = table.find('span', attrs={'class': 'archive'})
        if table_info is None:
            continue

        year_span = table_info.find('span')
        anchor = table_info.find('a', href=True)
        if year_span is None or anchor is None:
            continue

        # Spanish-language editions duplicate the English report for the
        # same year, so only entries without "Spanish" in the label are kept.
        year_tag = year_span.text
        if "Spanish" in year_tag:
            continue

        # The label starts with the four-digit year.
        valid_year = year_tag[:4]
        valid_years_napa.append(valid_year)

        # urljoin resolves relative hrefs against the site root exactly as
        # the previous string concatenation did, and additionally handles
        # root-relative ("/x") and already-absolute hrefs correctly.
        pdf_link = urljoin(napa_base_url, anchor['href'])
        pdf_links_napa.append(pdf_link)

        # One CSV row per retained report.
        writer.writerow(['Napa', valid_year, pdf_link])


# len(valid_years_napa)
# len(pdf_links_napa)

# Quick count check: 2014 - 1921 = 93, +1 because the range is inclusive = 94.
# Starting in 2015 each year has a Spanish and an English report, so 2 * 7 = 14.
# This gives us 94 + 14 = 108 links in total.
# Dropping the 7 Spanish reports leaves 108 - 7 = 101 links.



In [83]:
# Scrape the Mendocino County agriculture page and append one
# (County, Year, Links) row per crop-report link to farm_smoke.csv.
# The CSV is opened in 'a' mode, so the file (and its header row) is
# expected to exist already.

url = 'https://www.mendocinocounty.org/government/agriculture'

# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() replaces the manual status-code check: an HTTP error
# now stops the cell instead of an error page being parsed.
response = requests.get(url, timeout=30)
response.raise_for_status()

# 'html' is a markup type: BeautifulSoup picks whichever HTML parser is installed.
soup = BeautifulSoup(response.text, 'html')

# Despite its name, `tables` holds the <tr> rows of the table whose class
# is "subtitle".
tables = soup.find('table', attrs={'class':'subtitle'}).find_all('tr')

# Parallel lists: valid_years_mendocino[i] is the label of
# pdf_links_mendocino[i]. The 1999 placeholder row below is written to the
# CSV only and is not added to these lists.
pdf_links_mendocino = []
valid_years_mendocino = []

with open('farm_smoke.csv', 'a', newline='') as f:
    writer = csv.writer(f)

    # The first row is skipped (presumably a header row -- TODO confirm).
    for table in tables[1:]:
        for link in table.find_all('a', href=True):
            # Strip surrounding whitespace so the CSV holds a clean year and
            # the '2000' comparison below cannot silently miss.
            valid_year = link.text.strip()
            # NOTE(review): the href is stored as-is; this assumes the page
            # uses absolute URLs -- confirm.
            pdf_link = link.get('href')

            valid_years_mendocino.append(valid_year)
            pdf_links_mendocino.append(pdf_link)
            writer.writerow(['Mendocino', valid_year, pdf_link])

            # The page has no 1999 report, so a placeholder row is written
            # right after the 2000 row to keep the year sequence complete.
            # NOTE(review): this placement assumes the links are listed in
            # descending year order -- confirm.
            if valid_year == '2000':
                print('Added 1999 missing link')
                writer.writerow(['Mendocino', 1999, 'Link not exist'])



# len(valid_years_mendocino)
# len(pdf_links_mendocino)

# Quick count check: 2021 - 1985 = 36, +1 because the range is inclusive = 37.
# The 1999 link does not exist, so 37 - 1 = 36.
# This gives us 36 links.

Added 1999 missing link


In [84]:
# Scrape the Sonoma County crop-reports page and append one
# (County, Year, Links) row per report to farm_smoke.csv.
# The CSV is opened in 'a' mode, so the file (and its header row) is
# expected to exist already.

url = 'https://sonomacounty.ca.gov/natural-resources/agricultural-weights-and-measures/crop-reports'
sonoma_base_url = 'https://sonomacounty.ca.gov/'

# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() replaces the manual status-code check: an HTTP error
# now stops the cell instead of an error page being parsed.
response = requests.get(url, timeout=30)
response.raise_for_status()

# 'html' is a markup type: BeautifulSoup picks whichever HTML parser is installed.
soup = BeautifulSoup(response.text, 'html')
links = soup.find('div', attrs={'class':'formatted-list'}).find_all('li', attrs={'class':'list-item-icon'})

# First run of four consecutive digits in a report title is taken as its year.
year_pattern = re.compile(r'\d{4}')

# Parallel lists: valid_years_sonoma[i] is the year of pdf_links_sonoma[i].
pdf_links_sonoma = []
valid_years_sonoma = []

with open('farm_smoke.csv', 'a', newline='') as f:
    writer = csv.writer(f)

    # links[:-1] drops the LAST list item on the page, and the remaining
    # items are then visited in reverse page order.
    # NOTE(review): the original note says the dropped entry is
    # '2021-Sonoma-County-Crop-Report-Addendum' and calls it the "first"
    # link. If the addendum is actually the first item on the page, this
    # slice removes a different report and the addendum is written as a
    # second 2021 row -- confirm against the live page.
    for link in reversed(links[:-1]):
        title_tag = link.find('span', attrs={'class':'name'})
        anchor = link.find('a', href=True)

        year_match = None
        if title_tag is not None:
            year_match = year_pattern.search(title_tag.text)

        # Previously an entry without a year still wrote a row, reusing the
        # year of the preceding entry (or raising NameError on the first
        # one) and leaving the two lists with different lengths. Such
        # entries are now skipped, with a message so the gap is visible.
        if year_match is None or anchor is None:
            print('Skipped Sonoma entry without a year or link')
            continue

        valid_year = year_match.group()
        valid_years_sonoma.append(valid_year)

        # urljoin resolves relative hrefs against the site root exactly as
        # the previous string concatenation did, and additionally handles
        # root-relative ("/x") and already-absolute hrefs without producing
        # a doubled slash or a mangled URL.
        pdf_link = urljoin(sonoma_base_url, anchor['href'])
        pdf_links_sonoma.append(pdf_link)

        writer.writerow(['Sonoma', valid_year, pdf_link])


# len(valid_years_sonoma)
# len(pdf_links_sonoma)

# Quick count check: 2021 - 1928 = 93, +1 because the range is inclusive = 94 (excluding the 2021 addendum).
# This gives us 94 links.
         