In [2]:
!python -m pip install requests
!python -m pip install beautifulsoup4
!python -m pip install pandas













In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

In [4]:
def clean_url_for_filename(url):
    # Extract year and season from URL
    year_match = re.search(r'year=(\d{4})', url)
    season_match = re.search(r'season=([^&]*)', url)

    year = year_match.group(1) if year_match else 'unknown_year'
    season = season_match.group(1) if season_match and season_match.group(1) else 'all_seasons'

    return f"monarch_data_{year}_{season}"

In [5]:
def scrape_table_data(url):
    try:
        # Send a GET request to the URL
        print(f"Fetching data from {url}")
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table
        table = soup.find('table')
        if not table:
            print(f"No table found at {url}")
            return None

        # Find all table rows
        rows = table.find_all('tr')
        if not rows:
            print(f"No rows found in table at {url}")
            return None

        # Lists to store the data
        data = {
            'Date': [],
            'Town': [],
            'State/Province': [],
            'Latitude': [],
            'Longitude': [],
            'Number': [],
        }

        # Iterate through rows and extract data
        for row in rows:
            cols = row.find_all(['th', 'td'])
            if len(cols) >= 7:  # Ensure row has enough columns
                data['Date'].append(cols[1].get_text(strip=True))
                data['Town'].append(cols[2].get_text(strip=True))
                data['State/Province'].append(cols[3].get_text(strip=True))
                data['Latitude'].append(cols[4].get_text(strip=True))
                data['Longitude'].append(cols[5].get_text(strip=True))
                data['Number'].append(cols[6].get_text(strip=True))


        # Create DataFrame
        df = pd.DataFrame(data)
        return df

    except requests.RequestException as e:
        print(f"Failed to fetch data from {url}: {e}")
        return None
    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")
        return None


In [6]:
urls = [
       
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2017&season=fall", # 2017 Fall
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2018&season=fall", # 2018 Fall
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2019&season=fall", # 2019 Fall
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2020&season=fall", # 2020 Fall
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2021&season=fall", # 2021 Fall
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2022&season=fall", # 2022 Fall
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2023&season=fall", # 2023 Fall
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2024&season=fall", # 2024 Fall

        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2017&season=spring", # 2017 Spring
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2018&season=spring", # 2018 Spring
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2019&season=spring", # 2019 Spring
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2020&season=spring", # 2020 Spring
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2021&season=spring", # 2021 Spring
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2022&season=spring", # 2022 Spring
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2023&season=spring", # 2023 Spring
        # "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2024&season=spring", # 2024 Spring        


        
        
        "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2012&season=spring", # 2012 spring
        "https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2012&season=fall", # 2012 spring
]

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

In [None]:
import os

# Check if 'saved_data/' is a valid path
if not os.path.exists('saved_data/'):
    os.makedirs('saved_data/')
    print("'saved_data/' directory created.")
else:
    print("'saved_data/' directory already exists.")

In [7]:



for url in urls:
        df = scrape_table_data(url)
        if df is not None and not df.empty:
            base_filename = clean_url_for_filename(url)
            filename = f'saved_data/{base_filename}_{timestamp}.csv'

            # Save to CSV
            df.to_csv(filename, index=False)    
            print(f"Data from {url} saved to {filename}")
            print(f"Records in this file: {len(df)}")


            # Display first few rows
            print(f"\nFirst few rows of data from {filename}:")
            print(df.head())
            print("\n" + "="*50 + "\n")
        else:
            print(f"No data was successfully scraped from {url}")

Fetching data from https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2012&season=spring
Data from https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2012&season=spring saved to saved_data/monarch_data_2012_spring_20241006_072643.csv
Records in this file: 183

First few rows of data from saved_data/monarch_data_2012_spring_20241006_072643.csv:
       Date           Town  State/Province  Latitude  Longitude  Number
0      Date           Town  State/Province  Latitude  Longitude  Number
1  07/28/12  Springerville              AZ      34.2     -109.4       1
2  07/26/12       Chandler              AZ      33.3     -111.9       1
3  07/26/12   Newport News              VA      37.1      -76.5       1
4  07/25/12      Lakeville              NY      42.8      -77.7       2


Fetching data from https://journeynorth.org/sightings/querylist.html?map=monarch-adult-fall&year=2012&season=fall
Data from https://journeynorth.org/sightings/queryli

In [8]:
# read all the csv files
import glob
import os

directory = 'saved_data/'

# Get a list of all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
print(csv_files)
# Check if there are any CSV files in the directory
if not csv_files:
    print(f"No CSV files found in {directory}")

for file in csv_files:
    input_path = os.path.join(directory, file)
        
    try:
        # Read the CSV file, skipping the first two rows
        df = pd.read_csv(input_path)
        df=df.iloc[1:,:]
        # Create output filename
        file_name, file_extension = os.path.splitext(file)
        output_file = f"{file_name}{file_extension}"
        output_path = os.path.join(directory, output_file)

        # Drop the 	Unnamed: 0 column
        df=df.drop(columns=['Unnamed: 0'], inplace=True)
        

        # Save the modified DataFrame to a new CSV file
        df.to_csv(output_path)
        print(f"Successfully processed: {file} -> {output_file}")
            
    except Exception as e:
        print(f"Error processing {file}: {e}")


['cleaned_merged.csv', 'combined_fall_data.csv', 'coordinates.csv', 'filtered_data.csv', 'filtered_data_2.csv', 'monarch_data_2012_fall_20241006_072643.csv', 'monarch_data_2012_spring_20241006_072643.csv', 'monarch_data_2017_fall_20241005_164848.csv', 'monarch_data_2017_spring_20241005_164848.csv', 'monarch_data_2018_fall_20241005_164848.csv', 'monarch_data_2018_spring_20241005_164848.csv', 'monarch_data_2019_fall_20241005_164848.csv', 'monarch_data_2019_spring_20241005_164848.csv', 'monarch_data_2020_fall_20241005_164848.csv', 'monarch_data_2020_spring_20241005_164848.csv', 'monarch_data_2021_fall_20241005_164848.csv', 'monarch_data_2021_spring_20241005_164848.csv', 'monarch_data_2022_fall_20241005_164848.csv', 'monarch_data_2022_spring_20241005_164848.csv', 'monarch_data_2023_fall_20241005_164848.csv', 'monarch_data_2023_spring_20241005_164848.csv', 'monarch_data_2024_fall_20241005_164848.csv', 'monarch_data_2024_spring_20241005_164848.csv', 'updated_dataset.csv']
Error processing cl

In [9]:

df