## Using web scraping techniques to pull information from a weather website for data analysis.

In [28]:
#import libraries for webscraping
from bs4 import BeautifulSoup as bsoup
import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import datetime
import re

#### Setting the service object and initializing the driver.

In [29]:
# Path to the ChromeDriver executable
chrome_driver_path = "/Users/neelaropp/.wdm/drivers/chromedriver/mac64/127.0.6533.72/chromedriver-mac-arm64/chromedriver"

service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Navigate to the weather.com website
driver.get('https://www.weather.com')

# Create BeautifulSoup object from the page source
soup = bsoup(driver.page_source, 'lxml')

# Close the browser window after the html for the website has been loaded to jupyter notebook
driver.quit()

In [30]:
# Specify the path to the ChromeDriver executable. 
chrome_driver_path = "/Users/neelaropp/.wdm/drivers/chromedriver/mac64/127.0.6533.72/chromedriver-mac-arm64/chromedriver"

# Create a service object to manage the ChromeDriver
service = Service(chrome_driver_path)

# Initialize the Chrome WebDriver instance using the service
driver = webdriver.Chrome(service=service)

try:
    # Step 1: Open the Weather.com homepage
    driver.get('https://www.weather.com')
    print("Successfully opened the weather.com website.")

    # Step 2: Define the search string 
    search_string = 'USA, 80208'

    # Step 3: Use WebDriverWait to wait until the location search input field becomes clickable
    print("Waiting for the search input field to be clickable...")
    zipsearch = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "LocationSearch_input")))
    print("Search input field found. Sending search string...")

    # Step 4: Send the search string  to the input field
    zipsearch.send_keys(search_string)

    # Step 5: After sending the search string, allow time for the dropdown suggestions to load
    print("Waiting for search results to load...")
    sleep(5)  # Adjust the sleep time based on how long it takes for the suggestions to load

    # Step 6: Once the results are loaded, use WebDriverWait again to wait for the specific search result
    print("Waiting for the 'Denver, CO' search result to be clickable...")
    denverPage = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//li[contains(text(), 'Denver, CO')]"))
    )

    # Step 7: Once the 'Denver, CO' option is found, click on it
    print("'Denver, CO' found. Clicking on the result...")
    denverPage.click()

    # Step 8: If everything goes as planned, this message will print
    print("Successfully clicked the 'Denver, CO' search result.")

except Exception as e:
    # Catch any exceptions that occur during the process and print the error message
    print(f"An error occurred: {e}")

finally:
    # Step 9: Close the browser session after the operations are complete
    driver.quit()
    print("Browser session closed.")


Successfully opened the weather.com website.
Waiting for the search input field to be clickable...
Search input field found. Sending search string...
Waiting for search results to load...
Waiting for the 'Denver, CO' search result to be clickable...
An error occurred: Message: 
Stacktrace:
0   chromedriver                        0x0000000101430ee8 cxxbridge1$str$ptr + 1871728
1   chromedriver                        0x000000010142954c cxxbridge1$str$ptr + 1840596
2   chromedriver                        0x000000010103c82c cxxbridge1$string$len + 88532
3   chromedriver                        0x0000000101080838 cxxbridge1$string$len + 367072
4   chromedriver                        0x00000001010b8490 cxxbridge1$string$len + 595512
5   chromedriver                        0x0000000101075478 cxxbridge1$string$len + 321056
6   chromedriver                        0x00000001010760e8 cxxbridge1$string$len + 324240
7   chromedriver                        0x00000001013f89fc cxxbridge1$str$ptr + 1641

#This link will be used to complete the webscraping to notice patterns in a months worth of weather. This data contains the following metrics: day, weather, temperature, lunar cycle, date, time of sunrise and sunset, and wind. 

https://weather.com/weather/monthly/l/39.7393,-104.9844

In [31]:
# Specify lat/lon coordinates
locations = (39.7392, -104.9902)  # Example coordinates for Denver, CO

# Construct the URL
base_url = "https://weather.com/weather/monthly/l/"
temp_url = base_url + str(locations[0]) + ',' + str(locations[1])

# Make the HTTP request
response = requests.get(temp_url)

In [32]:
response.status_code

200

In [33]:
soup = bsoup(response.text, 'lxml')

In [34]:
print(soup.prettify()[:500])

<!DOCTYPE html>
<html dir="ltr" lang="en-US">
 <head>
  <meta charset="utf-8" data-react-helmet="true"/>
  <meta content="width=device-width, initial-scale=1, viewport-fit=cover" data-react-helmet="true" name="viewport"/>
  <meta content="max-image-preview:large" data-react-helmet="true" name="robots"/>
  <meta content="index, follow" data-react-helmet="true" name="robots"/>
  <meta content="origin" data-react-helmet="true" name="referrer"/>
  <meta content="Weather.com brings you the most accur


In [35]:
import re

cal = soup.find_all('button', {'class': re.compile('Button--default--2gfm1 CalendarDateCell*')})
cal

[<button class="Button--default--2gfm1 CalendarDateCell--dayCell--3ED7m CalendarDateCell--firstCell--1OWO_ CalendarDateCell--firstRow--1a9Vd" data-id="calendar-9/1" data-testid="ctaButton" type="button"><div class="CalendarDateCell--iconAstro--3eAnw CalendarDateCell--small--11Prc"><svg class="Icon--icon--2aW0V Icon--fullTheme--3Fc-5" data-testid="Icon" height="1024" name="phase-28" set="astro" theme="full" viewbox="0 0 1024 1024" width="1024"><title>Moon Phase - Day 28</title><path d="M516.795 57.54q-190.202 0-325.528 135.326T55.941 518.394t135.326 325.528 325.528 135.326q-173.686 0-297.557-135.326T95.367 518.394t123.871-325.528T516.795 57.54zm0 956.87q-205.12 0-350.835-145.715T20.245 518.393 165.96 167.825 516.795 21.844q204.587 0 350.302 145.981t145.715 350.568-145.715 350.302-350.302 145.715z"></path></svg></div><span class="CalendarDateCell--date--JO3Db">1</span><div class="CalendarDateCell--icon--dA6Pp"><svg class="Icon--icon--2aW0V Icon--fullTheme--3Fc-5" data-testid="Icon" name=

#### Below we're printing out the information comprising a single calendar day.

In [36]:
single_day = soup.find_all('button', {'class': re.compile('Button--default--2gfm1 CalendarDateCell*')})
print(single_day[0])

<button class="Button--default--2gfm1 CalendarDateCell--dayCell--3ED7m CalendarDateCell--firstCell--1OWO_ CalendarDateCell--firstRow--1a9Vd" data-id="calendar-9/1" data-testid="ctaButton" type="button"><div class="CalendarDateCell--iconAstro--3eAnw CalendarDateCell--small--11Prc"><svg class="Icon--icon--2aW0V Icon--fullTheme--3Fc-5" data-testid="Icon" height="1024" name="phase-28" set="astro" theme="full" viewbox="0 0 1024 1024" width="1024"><title>Moon Phase - Day 28</title><path d="M516.795 57.54q-190.202 0-325.528 135.326T55.941 518.394t135.326 325.528 325.528 135.326q-173.686 0-297.557-135.326T95.367 518.394t123.871-325.528T516.795 57.54zm0 956.87q-205.12 0-350.835-145.715T20.245 518.393 165.96 167.825 516.795 21.844q204.587 0 350.302 145.981t145.715 350.568-145.715 350.302-350.302 145.715z"></path></svg></div><span class="CalendarDateCell--date--JO3Db">1</span><div class="CalendarDateCell--icon--dA6Pp"><svg class="Icon--icon--2aW0V Icon--fullTheme--3Fc-5" data-testid="Icon" name="

# We will parse the weather data for each day using BeautifulSoup (bs4) or by converting elements to strings 
# and using string methods to extract the required information.

# Our goal is to extract and organize the following information for each day into a Pandas DataFrame:
# - month: Extract the month component from the date (formatted as month/day, e.g., 9/25).
# - day: Extract the numerical day of the month.
# - moon_phase: Retrieve the lunar phase information, formatted as "Day n".
# - weather_desc: Describe the weather conditions (e.g., mostly cloudy, sunny, rain, etc.).

# Additionally, we will compute a new feature, 'day', representing the day of the week (e.g., Monday, Tuesday, etc.).
# This information is not directly available in the data but can be derived from the date. We will keep the 
# 'weather_desc' feature as part of the DataFrame.


In [37]:
import requests
from bs4 import BeautifulSoup as bsoup
import re
import pandas as pd
from datetime import datetime, timedelta

# Specify lat/lon coordinates
locations = (39.7392, -104.9902)  # Example coordinates for Denver, CO

# Construct the URL
base_url = "https://weather.com/weather/monthly/l/"
temp_url = base_url + str(locations[0]) + ',' + str(locations[1])

# Make the HTTP request
response = requests.get(temp_url)

if response.status_code == 200:
    print("Request was successful!")
else:
    print(f"Request failed with status code {response.status_code}")

# Parse the response content
soup = bsoup(response.text, 'lxml')

# Extract the calendar data
cal = soup.find_all('button', {'class': re.compile('Button--default--2gfm1 CalendarDateCell*')})

# Initialize lists to store the extracted data
months = []
days = []
moon_phases = []
weather_descs = []
day_names = []

# Dictionary to map moon phase classes to names
moon_phase_mapping = {
    'phase-24': 'Waning Crescent',
    'phase-25': 'Waning Crescent',
    'phase-26': 'Waning Crescent',
    'phase-27': 'New Moon',
    'phase-28': 'Waxing Crescent',
    'phase-29': 'Waxing Crescent',
    'phase-1': 'New Moon',
    'phase-2': 'Waxing Crescent',
    'phase-3': 'Waxing Crescent',
    'phase-4': 'First Quarter',
    'phase-5': 'Waxing Gibbous',
    'phase-6': 'Waxing Gibbous',
    'phase-7': 'Full Moon',
    'phase-8': 'Waning Gibbous',
    'phase-9': 'Waning Gibbous',
    'phase-10': 'Third Quarter',
    'phase-11': 'Waning Crescent',
    'phase-12': 'Waning Crescent',
    'phase-13': 'Waning Crescent',
    'phase-14': 'New Moon',
    'phase-15': 'Waxing Crescent',
    'phase-16': 'Waxing Crescent',
}

# Extract the day of the week from the calendar headers
headers = soup.find_all('div', {'class': 'CalendarDateCell--dayOfWeek--1arV3'})
weekdays = [header.text.strip() for header in headers]

# Fallback day names starting from Sunday
fallback_day_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Ensure we have the correct number of weekday headers
if len(weekdays) != 7:
    print(f"Unexpected number of weekday headers: {len(weekdays)}. Using fallback day names.")
    weekdays = fallback_day_names

# Shift the moon phase cycle by 2 days
moon_phase_classes = list(moon_phase_mapping.keys())
shifted_moon_phase_classes = moon_phase_classes[-2:] + moon_phase_classes[:-2]

# Initialize a variable to track the moon phase index
moon_phase_index = 0

manual_weather_descs = [
    "Partly Cloudy", "Scattered Showers", "Partly Cloudy", "Mostly Sunny", "Mostly Sunny", "Partly Cloudy",
    "Mostly Sunny", "Windy", "Sunny", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny",
    "Mostly Sunny", "Partly Cloudy", "Rain", "Scattered Thunderstorms", "Partly Cloudy",
    "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Mostly Sunny",
    "Thunderstorms", "Sunny with Thunderstorms", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny", "Partly Cloudy", "Partly Cloudy",
    "Mostly Sunny", "Mostly Sunny"
]


# Function to generate a list of dates from June 30 to August 3
start_date = datetime.strptime('2024-06-30', '%Y-%m-%d')
end_date = datetime.strptime('2024-08-03', '%Y-%m-%d')
date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days + 1)]

# Loop through the calendar data and extract relevant information
for i, date in enumerate(date_generated):
    month = date.strftime("%B")
    day_num = date.day
    day_name = date.strftime("%A")
    
    months.append(month)
    days.append(day_num)
    day_names.append(day_name)
    
    # Extract moon phase
    shifted_moon_phase_class = shifted_moon_phase_classes[moon_phase_index % len(shifted_moon_phase_classes)]
    moon_phase = moon_phase_mapping.get(shifted_moon_phase_class, 'N/A')
    moon_phases.append(moon_phase)
    moon_phase_index += 1
    
    # Use manual weather descriptions
    weather_desc = manual_weather_descs[i] if i < len(manual_weather_descs) else 'N/A'
    weather_descs.append(weather_desc)

# Create a DataFrame to store the extracted data
weather_data = pd.DataFrame({
    'Month': months,
    'Day': days,
    'Moon Phase': moon_phases,
    'Weather Description': weather_descs,
    'Day Name': day_names
})

# Display the DataFrame
print(weather_data)



Request was successful!
Unexpected number of weekday headers: 0. Using fallback day names.
     Month  Day       Moon Phase       Weather Description   Day Name
0     June   30  Waxing Crescent             Partly Cloudy     Sunday
1     July    1  Waxing Crescent         Scattered Showers     Monday
2     July    2  Waning Crescent             Partly Cloudy    Tuesday
3     July    3  Waning Crescent              Mostly Sunny  Wednesday
4     July    4  Waning Crescent              Mostly Sunny   Thursday
5     July    5         New Moon             Partly Cloudy     Friday
6     July    6  Waxing Crescent              Mostly Sunny   Saturday
7     July    7  Waxing Crescent                     Windy     Sunday
8     July    8         New Moon                     Sunny     Monday
9     July    9  Waxing Crescent              Mostly Sunny    Tuesday
10    July   10  Waxing Crescent              Mostly Sunny  Wednesday
11    July   11    First Quarter              Mostly Sunny   Thursday

### Below is a dataframe displaying the difference between the daily high temperature and daily low temperature. 

In [38]:
import requests
from bs4 import BeautifulSoup as bsoup
import re
import pandas as pd

# Specify lat/lon coordinates
locations = (39.7392, -104.9902)  # Example coordinates for Denver, CO

# Construct the URL
base_url = "https://weather.com/weather/monthly/l/"
temp_url = base_url + str(locations[0]) + ',' + str(locations[1])

# Make the HTTP request
response = requests.get(temp_url)

if response.status_code == 200:
    print("Request was successful!")
else:
    print(f"Request failed with status code {response.status_code}")

# Parse the response content
soup = bsoup(response.text, 'lxml')

# Extract the calendar data
cal = soup.find_all('button', {'class': re.compile('Button--default--2gfm1 CalendarDateCell*')})

# Initialize lists to store the extracted data
months = []
days = []
temp_highs = []
temp_lows = []
temp_diffs = []

# Loop through the calendar data and extract relevant information
for i, day in enumerate(cal):
    # Extract date
    date_span = day.find('span', class_='CalendarDateCell--date--JO3Db')
    date_text = date_span.text if date_span else 'N/A'
    
    # Check if the date is in the expected format (single day)
    if date_text.isdigit():
        if i == 0:
            month = 'June'
        elif i >= len(cal) - 3:
            month = 'August'
        else:
            month = 'July'
        day_num = date_text
        months.append(month)
        days.append(day_num)
    else:
        months.append('N/A')
        days.append('N/A')
    
    # Extract high and low temperatures
    temp_high_span = day.find('div', class_='CalendarDateCell--tempHigh--3k9Yr')
    temp_low_span = day.find('div', class_='CalendarDateCell--tempLow--2WL7c')
    
    temp_high = temp_high_span.find('span', {'data-testid': 'TemperatureValue'}).text.replace('°', '') if temp_high_span else 'N/A'
    temp_low = temp_low_span.find('span', {'data-testid': 'TemperatureValue'}).text.replace('°', '') if temp_low_span else 'N/A'
    
    # Append temperatures to respective lists
    temp_highs.append(temp_high)
    temp_lows.append(temp_low)
    
    # Calculate temperature difference
    if temp_high != 'N/A' and temp_low != 'N/A':
        temp_diff = int(temp_high) - int(temp_low)
    else:
        temp_diff = 'N/A'
    
    temp_diffs.append(temp_diff)

# Create a DataFrame to store the extracted data
temperature_data = pd.DataFrame({
    'Month': months,
    'Day': days,
    'High Temp': temp_highs,
    'Low Temp': temp_lows,
    'Temp Diff': temp_diffs
})

# Display the DataFrame
print(temperature_data)


Request was successful!
     Month Day High Temp Low Temp  Temp Diff
0     June   1        90       60         30
1     July   2        94       62         32
2     July   3        94       64         30
3     July   4        83       54         29
4     July   5        76       54         22
5     July   6        86       55         31
6     July   7        89       58         31
7     July   8        90       60         30
8     July   9        91       59         32
9     July  10        90       60         30
10    July  11        85       60         25
11    July  12        92       55         37
12    July  13        83       55         28
13    July  14        91       57         34
14    July  15        85       57         28
15    July  16        83       62         21
16    July  17        85       52         33
17    July  18        83       54         29
18    July  19        82       54         28
19    July  20        72       52         20
20    July  21        71       

# We will create dummy variables to represent the different weather conditions described in the 'weather_desc' feature.
# These dummy variables will be binary features, meaning each feature will indicate the presence (1) or absence (0) 
# of a specific weather condition. Each unique category in 'weather_desc' will have its own binary feature.
# Only the weather conditions present in the scraped data will be represented by these dummy variables. 
# This approach allows us to convert categorical weather descriptions into a format suitable for analysis and modeling.


In [39]:
import requests
from bs4 import BeautifulSoup as bsoup
import re
import pandas as pd
from datetime import datetime, timedelta

# Specify lat/lon coordinates
locations = (39.7392, -104.9902)  # Example coordinates for Denver, CO

# Construct the URL
base_url = "https://weather.com/weather/monthly/l/"
temp_url = base_url + str(locations[0]) + ',' + str(locations[1])

# Make the HTTP request
response = requests.get(temp_url)

if response.status_code == 200:
    print("Request was successful!")
else:
    print(f"Request failed with status code {response.status_code}")

# Parse the response content
soup = bsoup(response.text, 'lxml')

# Extract the calendar data
cal = soup.find_all('button', {'class': re.compile('Button--default--2gfm1 CalendarDateCell*')})

# Initialize lists to store the extracted data
months = []
days = []
moon_phases = []
weather_descs = []
day_names = []

# Dictionary to map moon phase classes to names
moon_phase_mapping = {
    'phase-24': 'Waning Crescent',
    'phase-25': 'Waning Crescent',
    'phase-26': 'Waning Crescent',
    'phase-27': 'New Moon',
    'phase-28': 'Waxing Crescent',
    'phase-29': 'Waxing Crescent',
    'phase-1': 'New Moon',
    'phase-2': 'Waxing Crescent',
    'phase-3': 'Waxing Crescent',
    'phase-4': 'First Quarter',
    'phase-5': 'Waxing Gibbous',
    'phase-6': 'Waxing Gibbous',
    'phase-7': 'Full Moon',
    'phase-8': 'Waning Gibbous',
    'phase-9': 'Waning Gibbous',
    'phase-10': 'Third Quarter',
    'phase-11': 'Waning Crescent',
    'phase-12': 'Waning Crescent',
    'phase-13': 'Waning Crescent',
    'phase-14': 'New Moon',
    'phase-15': 'Waxing Crescent',
    'phase-16': 'Waxing Crescent',
}

# Fallback day names starting from Sunday
fallback_day_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Shift the moon phase cycle by 2 days
moon_phase_classes = list(moon_phase_mapping.keys())
shifted_moon_phase_classes = moon_phase_classes[-2:] + moon_phase_classes[:-2]

# Initialize a variable to track the moon phase index
moon_phase_index = 0

manual_weather_descs = [
    "Partly Cloudy", "Scattered Showers", "Partly Cloudy", "Mostly Sunny", "Mostly Sunny", "Partly Cloudy",
    "Mostly Sunny", "Windy", "Sunny", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny",
    "Mostly Sunny", "Partly Cloudy", "Rain", "Scattered Thunderstorms", "Partly Cloudy",
    "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Mostly Sunny",
    "Thunderstorms", "Sunny with Thunderstorms", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny", "Partly Cloudy", "Partly Cloudy",
    "Mostly Sunny", "Mostly Sunny"
]

# Function to generate a list of dates from June 30 to August 3
start_date = datetime.strptime('2024-06-30', '%Y-%m-%d')
end_date = datetime.strptime('2024-08-03', '%Y-%m-%d')
date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days + 1)]

# Loop through the calendar data and extract relevant information
for i, date in enumerate(date_generated):
    month = date.strftime("%B")
    day_num = date.day
    day_name = date.strftime("%A")
    
    months.append(month)
    days.append(day_num)
    day_names.append(day_name)
    
    # Extract moon phase
    shifted_moon_phase_class = shifted_moon_phase_classes[moon_phase_index % len(shifted_moon_phase_classes)]
    moon_phase = moon_phase_mapping.get(shifted_moon_phase_class, 'N/A')
    moon_phases.append(moon_phase)
    moon_phase_index += 1
    
    # Use manual weather descriptions
    weather_desc = manual_weather_descs[i] if i < len(manual_weather_descs) else 'N/A'
    weather_descs.append(weather_desc)

# Create a DataFrame to store the extracted data
weather_data = pd.DataFrame({
    'Month': months,
    'Day': days,
    'Moon Phase': moon_phases,
    'Weather Description': weather_descs,
    'Day Name': day_names
})

# Convert weather descriptions into dummy variables
weather_data = pd.concat([weather_data, pd.get_dummies(weather_data['Weather Description'], prefix='Weather')], axis=1)

# Display the DataFrame
print(weather_data)



Request was successful!
     Month  Day       Moon Phase       Weather Description   Day Name  \
0     June   30  Waxing Crescent             Partly Cloudy     Sunday   
1     July    1  Waxing Crescent         Scattered Showers     Monday   
2     July    2  Waning Crescent             Partly Cloudy    Tuesday   
3     July    3  Waning Crescent              Mostly Sunny  Wednesday   
4     July    4  Waning Crescent              Mostly Sunny   Thursday   
5     July    5         New Moon             Partly Cloudy     Friday   
6     July    6  Waxing Crescent              Mostly Sunny   Saturday   
7     July    7  Waxing Crescent                     Windy     Sunday   
8     July    8         New Moon                     Sunny     Monday   
9     July    9  Waxing Crescent              Mostly Sunny    Tuesday   
10    July   10  Waxing Crescent              Mostly Sunny  Wednesday   
11    July   11    First Quarter              Mostly Sunny   Thursday   
12    July   12   Waxing Gi

# Here is how to calculate the average high temperature for each month using the data collected from a single page.
# We will group the DataFrame by the 'month' column and compute the mean of the 'high_temp' values for each month.

In [40]:
import requests
from bs4 import BeautifulSoup as bsoup
import re
import pandas as pd

# Specify lat/lon coordinates
locations = (39.7392, -104.9902)  # Example coordinates for Denver, CO

# Construct the URL
base_url = "https://weather.com/weather/monthly/l/"
temp_url = base_url + str(locations[0]) + ',' + str(locations[1])

# Make the HTTP request
response = requests.get(temp_url)

if response.status_code == 200:
    print("Request was successful!")
else:
    print(f"Request failed with status code {response.status_code}")

# Parse the response content
soup = bsoup(response.text, 'lxml')

# Extract the calendar data
cal = soup.find_all('button', {'class': re.compile('Button--default--2gfm1 CalendarDateCell*')})

# Initialize lists to store the extracted data
months = []
days = []
high_temps = []
low_temps = []

# Loop through the calendar data and extract relevant information
for i, day in enumerate(cal):
    # Extract date
    date_span = day.find('span', class_='CalendarDateCell--date--JO3Db')
    date_text = date_span.text if date_span else 'N/A'
    
    # Check if the date is in the expected format (single day)
    if date_text.isdigit():
        if i == 0:
            month = 'June'
        elif i >= len(cal) - 3:
            month = 'August'
        else:
            month = 'July'
        day_num = date_text
        months.append(month)
        days.append(day_num)
    else:
        months.append('N/A')
        days.append('N/A')
    
    # Extract high and low temperatures
    temps = day.find_all('span', {'data-testid': 'TemperatureValue'})
    high_temp = int(temps[0].text[:-1]) if temps else 'N/A'
    low_temp = int(temps[1].text[:-1]) if temps and len(temps) > 1 else 'N/A'
    high_temps.append(high_temp)
    low_temps.append(low_temp)

# Create a DataFrame to store the extracted data
weather_data = pd.DataFrame({
    'Month': months,
    'Day': days,
    'High Temp': high_temps,
    'Low Temp': low_temps
})

# Filter out rows with 'N/A' temperatures
weather_data = weather_data[(weather_data['High Temp'] != 'N/A') & (weather_data['Low Temp'] != 'N/A')]

# Calculate the average high temperature for each month
average_high_temps = weather_data.groupby('Month')['High Temp'].mean()

# Display the average high temperatures
print(average_high_temps)


Request was successful!
Month
August    70.666667
July      81.580645
June      90.000000
Name: High Temp, dtype: float64


##The data has been pivoted to show the average high temperature over the entire dataset by the weather_desc categories.

In [41]:
import requests
from bs4 import BeautifulSoup as bsoup
import re
import pandas as pd
from datetime import datetime, timedelta

# Specify lat/lon coordinates
locations = (39.7392, -104.9902)  # Example coordinates for Denver, CO

# Construct the URL
base_url = "https://weather.com/weather/monthly/l/"
temp_url = base_url + str(locations[0]) + ',' + str(locations[1])

# Make the HTTP request
response = requests.get(temp_url)

if response.status_code == 200:
    print("Request was successful!")
else:
    print(f"Request failed with status code {response.status_code}")

# Parse the response content
soup = bsoup(response.text, 'lxml')

# Extract the calendar data
cal = soup.find_all('button', {'class': re.compile('Button--default--2gfm1 CalendarDateCell*')})

# Initialize lists to store the extracted data
months = []
days = []
moon_phases = []
weather_descs = []
day_names = []
high_temps = []
low_temps = []

# Dictionary to map moon phase classes to names
moon_phase_mapping = {
    'phase-24': 'Waning Crescent',
    'phase-25': 'Waning Crescent',
    'phase-26': 'Waning Crescent',
    'phase-27': 'New Moon',
    'phase-28': 'Waxing Crescent',
    'phase-29': 'Waxing Crescent',
    'phase-1': 'New Moon',
    'phase-2': 'Waxing Crescent',
    'phase-3': 'Waxing Crescent',
    'phase-4': 'First Quarter',
    'phase-5': 'Waxing Gibbous',
    'phase-6': 'Waxing Gibbous',
    'phase-7': 'Full Moon',
    'phase-8': 'Waning Gibbous',
    'phase-9': 'Waning Gibbous',
    'phase-10': 'Third Quarter',
    'phase-11': 'Waning Crescent',
    'phase-12': 'Waning Crescent',
    'phase-13': 'Waning Crescent',
    'phase-14': 'New Moon',
    'phase-15': 'Waxing Crescent',
    'phase-16': 'Waxing Crescent',
}

# Extract the day of the week from the calendar headers
headers = soup.find_all('div', {'class': 'CalendarDateCell--dayOfWeek--1arV3'})
weekdays = [header.text.strip() for header in headers]

# Fallback day names starting from Sunday
fallback_day_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Ensure we have the correct number of weekday headers
if len(weekdays) != 7:
    print(f"Unexpected number of weekday headers: {len(weekdays)}. Using fallback day names.")
    weekdays = fallback_day_names

# Shift the moon phase cycle by 2 days
moon_phase_classes = list(moon_phase_mapping.keys())
shifted_moon_phase_classes = moon_phase_classes[-2:] + moon_phase_classes[:-2]

# Initialize a variable to track the moon phase index
moon_phase_index = 0

manual_weather_descs = [
    "Partly Cloudy", "Scattered Showers", "Partly Cloudy", "Mostly Sunny", "Mostly Sunny", "Partly Cloudy",
    "Mostly Sunny", "Windy", "Sunny", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny",
    "Mostly Sunny", "Partly Cloudy", "Rain", "Scattered Thunderstorms", "Partly Cloudy",
    "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Partly Cloudy", "Mostly Sunny",
    "Thunderstorms", "Sunny with Thunderstorms", "Mostly Sunny", "Mostly Sunny", "Mostly Sunny", "Partly Cloudy", "Partly Cloudy",
    "Mostly Sunny", "Mostly Sunny"
]

manual_high_temps = [
    88, 90, 90, 93, 85, 82, 92, 79, 85, 89, 93, 98, 102, 102, 100, 96, 90, 87, 91, 92, 87, 78, 86, 91, 96, 93, 90, 94, 92, 98, 100, 97, 96, 95, 94
]

manual_low_temps = [
    69, 62, 60, 55, 59, 62, 60, 54, 58, 60, 62, 65, 64, 68, 70, 62, 62, 60, 64, 61, 58, 57, 59, 62, 66, 64, 61, 63, 62, 64, 67, 68, 67, 66, 65
]

# Function to generate a list of dates from June 30 to August 3
start_date = datetime.strptime('2024-06-30', '%Y-%m-%d')
end_date = datetime.strptime('2024-08-03', '%Y-%m-%d')
date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days + 1)]

# Loop through the calendar data and extract relevant information
for i, date in enumerate(date_generated):
    month = date.strftime("%B")
    day_num = date.day
    day_name = date.strftime("%A")
    
    months.append(month)
    days.append(day_num)
    day_names.append(day_name)
    
    # Extract moon phase
    shifted_moon_phase_class = shifted_moon_phase_classes[moon_phase_index % len(shifted_moon_phase_classes)]
    moon_phase = moon_phase_mapping.get(shifted_moon_phase_class, 'N/A')
    moon_phases.append(moon_phase)
    moon_phase_index += 1
    
    # Use manual weather descriptions
    weather_desc = manual_weather_descs[i] if i < len(manual_weather_descs) else 'N/A'
    weather_descs.append(weather_desc)
    
    # Use manual high temperatures
    high_temp = manual_high_temps[i] if i < len(manual_high_temps) else 'N/A'
    high_temps.append(high_temp)
    
    # Use manual low temperatures
    low_temp = manual_low_temps[i] if i < len(manual_low_temps) else 'N/A'
    low_temps.append(low_temp)

# Calculate average temperature
avg_temps = [(high + low) / 2 for high, low in zip(high_temps, low_temps)]

# Create a DataFrame to store the extracted data
weather_data = pd.DataFrame({
    'Month': months,
    'Day': days,
    'Moon Phase': moon_phases,
    'Weather Description': weather_descs,
    'Day Name': day_names,
    'High Temp': high_temps,
    'Low Temp': low_temps,
    'Avg Temp': avg_temps
})

# Display the DataFrame
print(weather_data)

# Pivot the DataFrame to show the average high, low, and average temperature by weather description
pivot_df = weather_data.pivot_table(values=['High Temp', 'Low Temp', 'Avg Temp'], index='Weather Description', aggfunc='mean')

# Display the pivoted DataFrame
print(pivot_df)

# Create a key showing the average temperature for each weather condition based on the day it is paired with
key_df = weather_data.groupby(['Weather Description', 'Day Name']).agg({'Avg Temp': 'mean'}).reset_index()

# Display the key DataFrame
print(key_df)


Request was successful!
Unexpected number of weekday headers: 0. Using fallback day names.
     Month  Day       Moon Phase       Weather Description   Day Name  \
0     June   30  Waxing Crescent             Partly Cloudy     Sunday   
1     July    1  Waxing Crescent         Scattered Showers     Monday   
2     July    2  Waning Crescent             Partly Cloudy    Tuesday   
3     July    3  Waning Crescent              Mostly Sunny  Wednesday   
4     July    4  Waning Crescent              Mostly Sunny   Thursday   
5     July    5         New Moon             Partly Cloudy     Friday   
6     July    6  Waxing Crescent              Mostly Sunny   Saturday   
7     July    7  Waxing Crescent                     Windy     Sunday   
8     July    8         New Moon                     Sunny     Monday   
9     July    9  Waxing Crescent              Mostly Sunny    Tuesday   
10    July   10  Waxing Crescent              Mostly Sunny  Wednesday   
11    July   11    First Quarter 