In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import datetime
import os

def _text_or_none(tag):
    """Return the stripped text of *tag*, or None when the tag is missing."""
    return tag.get_text().strip() if tag else None


def _parse_day_panel(block):
    """Turn one day panel into a record dict, or None if it has no day number."""
    day_of_month = _text_or_none(block.find('div', class_='date'))
    if not day_of_month:
        return None

    max_temp = min_temp = None
    temp_container = block.find('div', class_='temp')
    if temp_container:
        max_temp = _text_or_none(temp_container.find('div', class_='high'))
        min_temp = _text_or_none(temp_container.find('div', class_='low'))

    # The description is read from the icon's SVG `alt` attribute;
    # '-' is the placeholder when no such attribute is present.
    weather_description = '-'
    icon_container = block.find('div', class_='icon-container')
    if icon_container:
        svg_tag = icon_container.find('svg')
        if svg_tag and svg_tag.has_attr('alt'):
            weather_description = svg_tag['alt']

    return {
        'Day of the month': day_of_month,
        'Maximum predicted weather': max_temp if max_temp else '--',
        'Minimum predicted weather': min_temp if min_temp else '--',
        'General weather description': weather_description,
    }


def _print_temperature_analysis(weather_data):
    """Print average/range/rainy-day statistics for rows with usable temperatures."""
    if not weather_data:
        # An empty DataFrame has no columns, so column lookups would raise KeyError.
        return

    df = pd.DataFrame(weather_data)

    # Pull out the (optionally signed) integer part instead of stripping one
    # specific degree-sign spelling; placeholders such as '--' become NaN.
    max_numeric = pd.to_numeric(
        df['Maximum predicted weather'].str.extract(r'(-?\d+)', expand=False),
        errors='coerce',
    )
    min_numeric = pd.to_numeric(
        df['Minimum predicted weather'].str.extract(r'(-?\d+)', expand=False),
        errors='coerce',
    )

    # Only rows where BOTH temperatures parsed take part in the statistics.
    usable = max_numeric.notna() & min_numeric.notna()
    if not usable.any():
        return

    df_with_temps = df[usable]
    max_values = max_numeric[usable].astype(int)
    min_values = min_numeric[usable].astype(int)

    print('\n📊 Analysis:')
    print(f'   • Avg Max: {max_values.mean():.1f}°C | Avg Min: {min_values.mean():.1f}°C')
    print(f'   • Range: {min_values.min()}°C to {max_values.max()}°C')

    # Count rainy days (English and Romanian keywords).
    descriptions = df_with_temps['General weather description'].str.lower()
    rainy_days = df_with_temps[descriptions.str.contains('rain|shower|ploaie', na=False)]
    if len(rainy_days) > 0:
        day_list = ", ".join(rainy_days["Day of the month"].astype(str).tolist())
        print(f'   • Rainy days: {len(rainy_days)} ({day_list})')


def parse_weather_from_file(month_name, html_file_path):
    """
    Parse weather data from a local HTML file and save it to JSON.

    Every ``monthly-daypanel`` element (``<a>`` first, ``<div>`` as fallback)
    that carries a day number becomes one record. The records plus some
    metadata are written to ``<month_name lowercased>_weather_data.json`` in
    the current working directory, and a short temperature analysis is printed.

    Args:
        month_name: Name of the month (e.g., 'October')
        html_file_path: Path to the downloaded HTML file

    Returns:
        True when the JSON file was written; False when the HTML file could
        not be read or the JSON file could not be written.
    """
    print(f"\n{'='*60}")
    print(f"Processing {month_name} from local file...")
    print(f"{'='*60}")

    # Read the saved HTML file; report problems instead of raising.
    try:
        with open(html_file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except FileNotFoundError:
        print(f"✗ ERROR: File not found at: {html_file_path}")
        print("Please make sure the file path is correct.")
        return False
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non-UTF-8 files.
        print(f"✗ Error reading file: {e}")
        return False

    print(f"✓ Successfully loaded HTML file from: {html_file_path}")

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    page_title = soup.title.string if soup.title else 'No title found'
    print(f"✓ Page title: {page_title}")

    # Find all monthly day panels (anchor elements first, divs as fallback).
    article_blocks = (
        soup.find_all('a', class_='monthly-daypanel')
        or soup.find_all('div', class_='monthly-daypanel')
    )
    print(f"✓ Found {len(article_blocks)} weather data blocks")

    # NOTE(review): every panel with a day number is kept. If the page's
    # calendar grid also shows days of the neighbouring months, those days
    # end up in the data and the statistics too -- confirm whether they
    # should be filtered out.
    parsed_panels = (_parse_day_panel(block) for block in article_blocks)
    weather_data = [record for record in parsed_panels if record is not None]

    print("\n--- Sample of extracted data (first 3 days) ---")
    for data in weather_data[:3]:
        print(f"  Day {data['Day of the month']}: {data['Maximum predicted weather']}/{data['Minimum predicted weather']}")

    # Save to JSON
    json_filename = f'{month_name.lower()}_weather_data.json'
    now = datetime.datetime.now()  # single timestamp so year and date agree

    weather_json = {
        'month': month_name,
        'year': now.year,
        'scraped_date': now.strftime('%Y-%m-%d %H:%M:%S'),
        'source': 'Local HTML file',
        'source_file': os.path.basename(html_file_path),
        # Location and unit are fixed metadata; they are not read from the page.
        'location': 'Timisoara, Romania',
        'temperature_unit': 'Celsius',
        'total_days': len(weather_data),
        'data': weather_data
    }

    try:
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(weather_json, f, indent=4, ensure_ascii=False)
    except OSError as e:
        print(f"✗ Error writing {json_filename}: {e}")
        return False

    print(f"✓ Saved to {json_filename}")

    if not weather_data:
        print("⚠ No day panels with a day number were found; skipping analysis.")
        return True

    # Analysis (only for days with valid temperature data)
    _print_temperature_analysis(weather_data)

    return True


# Main execution
if __name__ == "__main__":
    # Script entry point: show the configured input files, wait for the user
    # to confirm, parse each month, then print a summary of what was written.
    print("\n" + "="*60)
    print("Weather Data Parser - Local HTML Files")
    print("Guaranteed Celsius temperatures from downloaded files!")
    print("="*60)

    # ============================================================
    # CONFIGURE YOUR FILE PATHS HERE
    # ============================================================
    # Update these paths to point to your downloaded HTML files

    months = [
        ('October', '/content/october-weather.html'),
        ('November', '/content/november-weather.html'),
        ('December', '/content/december-weather.html')
    ]

    # Example paths for different operating systems:
    # Windows: 'C:/Users/YourName/Downloads/october_weather.html'
    # Linux:   '/home/username/Downloads/october_weather.html'
    # Mac:     '/Users/username/Downloads/october_weather.html'

    print("\n⚠ IMPORTANT: Update the file paths in the script!")
    print("Current paths to check:")
    for month_name, file_path in months:
        exists = "✓ EXISTS" if os.path.exists(file_path) else "✗ NOT FOUND"
        print(f"  {month_name}: {file_path} [{exists}]")

    print("\nIf files are not found, edit the 'months' list in the script.")
    print("Press Enter to continue or Ctrl+C to exit and update paths...")
    try:
        input()
    except EOFError:
        # stdin is not interactive (piped or batch run): continue without waiting.
        pass

    # ============================================================

    # Remember WHICH months succeeded so the summary lists only files written
    # by this run, not stale JSON files left over from an earlier one.
    successful_months = []
    failed = 0

    for month_name, file_path in months:
        if parse_weather_from_file(month_name, file_path):
            successful_months.append(month_name)
        else:
            failed += 1

    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"✓ Successful: {len(successful_months)} | ✗ Failed: {failed}")

    if successful_months:
        print("\nJSON files created:")
        for month_name in successful_months:
            json_file = f'{month_name.lower()}_weather_data.json'
            if os.path.exists(json_file):
                print(f"  ✓ {json_file}")

        # Only claim success when at least one month was actually processed.
        print("\n✅ All data is in Celsius from your downloaded HTML files!")
    print("="*60)



Weather Data Parser - Local HTML Files
Guaranteed Celsius temperatures from downloaded files!

⚠ IMPORTANT: Update the file paths in the script!
Current paths to check:
  October: /content/october-weather.html [✓ EXISTS]
  November: /content/november-weather.html [✓ EXISTS]
  December: /content/december-weather.html [✓ EXISTS]

If files are not found, edit the 'months' list in the script.
Press Enter to continue or Ctrl+C to exit and update paths...


Processing October from local file...
✓ Successfully loaded HTML file from: /content/october-weather.html
✓ Page title: Vremea pe lună  Timișoara, Timiș, România | AccuWeather
✓ Found 35 weather data blocks

--- Sample of extracted data (first 3 days) ---
  Day 28: 22°/11°
  Day 29: 19°/7°
  Day 30: 16°/5°
✓ Saved to october_weather_data.json

📊 Analysis:
   • Avg Max: 17.8°C | Avg Min: 6.1°C
   • Range: 1°C to 24°C

Processing November from local file...
âœ“ Successfully loaded HTML file from: /conte