In [30]:
# Nishit Grover - M15329773

In [109]:
#1
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [38]:
#2
import os
import requests
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

# Base URL for NOAA data and directory for local data storage
BASE_URL = "https://www.ncei.noaa.gov/data/global-summary-of-the-day/access"
DATA_DIRECTORY = "./weather_data"

# Create the data directory if it doesn't exist
os.makedirs(DATA_DIRECTORY, exist_ok=True)

# Year range and station identifiers
years = range(2015, 2025)  # Last year is exclusive
stations = ["72429793812", "99495199999"]

# Function to download file with error handling
def download_file(url, local_filename):
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {local_filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error! Failed to download {url}: {e}")

# Loop over years and stations to download data
for year in YEARS:
    year_url = f"{BASE_URL}/{year}/"
    try:
        response = requests.get(year_url)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print(f"Error! Failed to access: {year_url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    links = {link.get('href') for link in soup.find_all('a')}

    for station in stations:
        filename = f"{station}.csv"
        if filename in links:
            file_url = f"{year_url}{filename}"
            local_path = os.path.join(DATA_DIRECTORY, f"{year}_{filename}")
            download_file(file_url, local_path)
        else:
            print(f"File not found on server for Year: {year}, Station: {station}")

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Weather Data Analysis") \
    .getOrCreate()

# Analyze data and count rows per file
# Calculate the total rows processed across all stations and years
total_rows_processed = sum(count for _, _, count in dataset_counts)

# Detailed summary per station
print("\nSummary of Rows Processed per Station:\n")

for station in stations:
    location = "Cincinnati" if station == "72429793812" else "Florida"
    print(f"{location} (Station: {station})")
    yearly_counts = [(year, count) for year, st, count in dataset_counts if st == station]
    
    for year, count in yearly_counts:
        print(f"  Year: {year} --> Rows Processed: {count}")
    
    # Total rows for each station
    station_total = sum(count for _, st, count in dataset_counts if st == station)
    print(f"  Total Rows Processed for {location}: {station_total}\n")




Successfully downloaded: ./weather_data/2015_72429793812.csv
Successfully downloaded: ./weather_data/2015_99495199999.csv
Successfully downloaded: ./weather_data/2016_72429793812.csv
File not found on server for Year: 2016, Station: 99495199999
Successfully downloaded: ./weather_data/2017_72429793812.csv
Successfully downloaded: ./weather_data/2017_99495199999.csv
Successfully downloaded: ./weather_data/2018_72429793812.csv
Successfully downloaded: ./weather_data/2018_99495199999.csv
Successfully downloaded: ./weather_data/2019_72429793812.csv
Successfully downloaded: ./weather_data/2019_99495199999.csv
Successfully downloaded: ./weather_data/2020_72429793812.csv
Successfully downloaded: ./weather_data/2020_99495199999.csv
Successfully downloaded: ./weather_data/2021_72429793812.csv
Successfully downloaded: ./weather_data/2021_99495199999.csv
Successfully downloaded: ./weather_data/2022_72429793812.csv
Successfully downloaded: ./weather_data/2022_99495199999.csv
Successfully downloaded

In [59]:
#3
from pyspark.sql import Row
from pyspark.sql.functions import col, max as spark_max, lit

hottest_days = []

# Loop through years and stations to find the hottest day for each year
for year in years:
    for station in stations:
        file_path = f"{data_directory}/{year}_{station}.csv"
        if os.path.exists(file_path):
            df = spark.read.option("header", "true").csv(file_path)
            valid_df = df.filter(col("MAX") != 9999.9)
            max_temp = valid_df.agg(spark_max("MAX").alias("Max_Temp")).collect()[0]["Max_Temp"]

            hottest_day = valid_df.filter(col("MAX") == max_temp) \
                                  .select("STATION", "NAME", "DATE", "MAX") \
                                  .withColumn("YEAR", lit(year)) \
                                  .collect()[0]

            hottest_days.append(Row(YEAR=year, STATION=hottest_day["STATION"], NAME=hottest_day["NAME"], DATE=hottest_day["DATE"], MAX=hottest_day["MAX"]))

hottest_days_df = spark.createDataFrame(hottest_days).orderBy("YEAR")

# Display the formatted output
grouped_years = hottest_days_df.collect()
print("Hottest Days by Year (Cincinnati and Florida):\n")
print(f" Year  |    Station   |                 Station Name                       |    Date      | Maximum Temperature")
print(f"-------|--------------|----------------------------------------------------|--------------|---------------------")

last_year = None
for row in grouped_years:
    if last_year and row['YEAR'] != last_year:
        print("\n")  # Extra line for readability
    print(f"{row['YEAR']:^6} | {row['STATION']:^12} | {row['NAME']:<50} | {row['DATE']:^12} | {float(row['MAX']):>20.1f}")
    last_year = row['YEAR']


Hottest Days by Year (Cincinnati and Florida):

 Year  |    Station   |                 Station Name                       |    Date      | Maximum Temperature
-------|--------------|----------------------------------------------------|--------------|---------------------
 2015  | 72429793812  | CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US   |  2015-06-12  |                 91.9
 2015  | 99495199999  | SEBASTIAN INLET STATE PARK, FL US                  |  2015-07-28  |                 90.0


 2016  | 72429793812  | CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US   |  2016-07-24  |                 93.9


 2017  | 72429793812  | CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US   |  2017-07-22  |                 91.9
 2017  | 99495199999  | SEBASTIAN INLET STATE PARK, FL US                  |  2017-05-13  |                 88.3


 2018  | 72429793812  | CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US   |  2018-07-04  |                 96.1
 2018  | 99495199999  | SEBASTIAN INLET STA

In [51]:
#4
from pyspark.sql import Row
from pyspark.sql.functions import col, min as spark_min, lit, month

# List to store the coldest day in March across all years and stations
march_min_temps = []

# Loop through years and stations to find the coldest day in March
for year in years:
    for station in stations:
        file_path = f"{data_directory}/{year}_{station}.csv"
        if os.path.exists(file_path):
            df = spark.read.option("header", "true").csv(file_path)
            # Filter data for the month of March and get the minimum temperature
            march_df = df.filter(month(col("DATE")) == 3)
            min_temp = march_df.agg(spark_min("MIN").alias("Min_Temp")).collect()[0]["Min_Temp"]
            
            if min_temp is not None:
                # Get details of the coldest day in March
                coldest_day = march_df.filter(col("MIN") == min_temp) \
                                      .select("STATION", "NAME", "DATE", "MIN") \
                                      .withColumn("YEAR", lit(year)) \
                                      .collect()[0]
                march_min_temps.append(Row(YEAR=year, STATION=coldest_day["STATION"], NAME=coldest_day["NAME"], DATE=coldest_day["DATE"], MIN=coldest_day["MIN"]))

# Create a DataFrame and find the coldest day across all years
march_min_temps_df = spark.createDataFrame(march_min_temps)
coldest_march_day = march_min_temps_df.orderBy("MIN").limit(1).collect()[0]

# Display the result in a table format
print("\nColdest Day in March (2015-2024) across all stations:\n")
print(f" Year  |    Station   |                 Station Name                       |    Date      | Minimum Temperature")
print(f"-------|--------------|----------------------------------------------------|--------------|---------------------")
print(f"{coldest_march_day['YEAR']:^6} | {coldest_march_day['STATION']:^12} | {coldest_march_day['NAME']:<50} | {coldest_march_day['DATE']:^12} | {float(coldest_march_day['MIN']):>7.1f}")



Coldest Day in March (2015-2024) across all stations:

 Year  |    Station   |                 Station Name                       |    Date      | Minimum Temperature
-------|--------------|----------------------------------------------------|--------------|---------------------
 2015  | 72429793812  | CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US   |  2015-03-06  |     3.2


In [61]:
#5
from pyspark.sql.functions import year, col, mean

# Load data for Cincinnati and calculate mean precipitation by year
cincinnati_files = [f"{data_directory}/{year}_72429793812.csv" for year in years]
cincinnati_df = spark.read.option("header", "true").csv(cincinnati_files)

cincinnati_precip = cincinnati_df.withColumn("YEAR", year(col("DATE"))) \
    .groupBy("YEAR", "STATION", "NAME") \
    .agg(mean("PRCP").alias("Mean_PRCP")) \
    .orderBy(col("Mean_PRCP").desc()) \
    .limit(1)

# Load data for Florida (excluding 2016) and calculate mean precipitation by year
florida_files = [f"{data_directory}/{year}_99495199999.csv" for year in years if year != 2016]
florida_df = spark.read.option("header", "true").csv(florida_files)

florida_precip = florida_df.withColumn("YEAR", year(col("DATE"))) \
    .groupBy("YEAR", "STATION", "NAME") \
    .agg(mean("PRCP").alias("Mean_PRCP")) \
    .orderBy(col("Mean_PRCP").desc()) \
    .limit(1)

# Collect results
cincinnati_result = cincinnati_precip.collect()[0]
florida_result = florida_precip.collect()[0]

# Display results in a table format
print("\nYear with Most Precipitation for Cincinnati and Florida:\n")
print(f" Year  |    Station   |                 Station Name                       | Mean PRCP")
print(f"-------|--------------|----------------------------------------------------|----------")
print(f"{cincinnati_result['YEAR']:^6} | {cincinnati_result['STATION']:^12} | {cincinnati_result['NAME']:<50} | {cincinnati_result['Mean_PRCP']:.2f}")
print(f"{florida_result['YEAR']:^6} | {florida_result['STATION']:^12} | {florida_result['NAME']:<50} | {florida_result['Mean_PRCP']:.2f}")



Year with Most Precipitation for Cincinnati and Florida:

 Year  |    Station   |                 Station Name                       | Mean PRCP
-------|--------------|----------------------------------------------------|----------
 2024  | 72429793812  | CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US   | 5.44
 2020  | 99495199999  | SEBASTIAN INLET STATE PARK, FL US                  | 0.00


In [63]:
#6
from pyspark.sql.functions import col

# Define file paths for 2024 data
cincinnati_2024_file = "./weather_data/2024_72429793812.csv"
florida_2024_file = "./weather_data/2024_99495199999.csv"

# Load 2024 data for Cincinnati and Florida
cincinnati_2024_df = spark.read.option("header", "true").csv(cincinnati_2024_file)
florida_2024_df = spark.read.option("header", "true").csv(florida_2024_file)

# Function to calculate missing percentage for GUST
def calculate_missing_percentage(df, location):
    missing_count = df.filter(col("GUST") == 999.9).count()
    total_count = df.count()
    missing_percentage = (missing_count / total_count) * 100
    return f"{location}: {missing_percentage:.2f}%"

# Calculate missing percentages for Cincinnati and Florida
cincinnati_missing_percentage = calculate_missing_percentage(cincinnati_2024_df, "Cincinnati")
florida_missing_percentage = calculate_missing_percentage(florida_2024_df, "Florida")

# Display the results
print("\nPercentage of Missing Values for Wind Gust (column GUST) in 2024:\n")
print(cincinnati_missing_percentage)
print(florida_missing_percentage)



Percentage of Missing Values for Wind Gust (column GUST) in 2024:

Cincinnati: 39.53%
Florida: 100.00%


In [69]:
#7
from pyspark.sql.functions import col, month, mean, stddev, expr, count, when
from pyspark.sql import Window

# Define file path and load 2020 data for Cincinnati
cincinnati_2020_file = "./weather_data/2020_72429793812.csv"
cincinnati_2020_df = spark.read.option("header", "true").csv(cincinnati_2020_file)

# Convert TEMP to float and filter out invalid values
cincinnati_2020_df = cincinnati_2020_df.withColumn("TEMP", col("TEMP").cast("float")) \
    .filter((col("TEMP") != 9999.9) & col("TEMP").isNotNull()) \
    .withColumn("MONTH", month(col("DATE")))

# Calculate mean, standard deviation, and median for each month
temp_stats_df = cincinnati_2020_df.groupBy("MONTH") \
    .agg(
        mean("TEMP").alias("Mean_TEMP"),
        stddev("TEMP").alias("StdDev_TEMP"),
        expr("percentile_approx(TEMP, 0.5)").alias("Median_TEMP")
    )

# Calculate mode for each month
mode_df = cincinnati_2020_df.groupBy("MONTH", "TEMP") \
    .agg(count("TEMP").alias("Frequency")) \
    .withColumn("Max_Frequency", expr("max(Frequency) over (PARTITION BY MONTH)")) \
    .filter(col("Frequency") == col("Max_Frequency")) \
    .groupBy("MONTH") \
    .agg(expr("first(TEMP)").alias("Mode_TEMP"))

# Combine statistics into a single DataFrame
final_stats_df = temp_stats_df.join(mode_df, "MONTH").orderBy("MONTH")

# Display the results
print("\nTemperature Statistics for Cincinnati for Each Month in 2020:\n")
final_stats_df.select(
    col("MONTH"),
    col("Mean_TEMP"),
    col("Median_TEMP"),
    col("Mode_TEMP"),
    col("StdDev_TEMP")
).show(truncate=False)



Temperature Statistics for Cincinnati for Each Month in 2020:

+-----+------------------+-----------+---------+------------------+
|MONTH|Mean_TEMP         |Median_TEMP|Mode_TEMP|StdDev_TEMP       |
+-----+------------------+-----------+---------+------------------+
|1    |37.945161081129505|37.7       |24.7     |8.345810838316384 |
|2    |36.58965525133856 |36.0       |25.9     |7.901597947537755 |
|3    |49.0741934007214  |47.8       |39.6     |8.77940669347644  |
|4    |51.77999992370606 |51.0       |49.4     |7.3131621276074465|
|5    |60.89032290058751 |63.7       |73.9     |9.314768319579512 |
|6    |72.54666570027669 |73.7       |70.7     |4.8999458590264515|
|7    |77.6000001968876  |77.9       |78.4     |2.337947626620972 |
|8    |73.34516143798828 |73.7       |78.3     |3.4878690606063563|
|9    |66.09999961853028 |65.8       |74.5     |7.118261579669542 |
|10   |55.19354851015152 |54.0       |52.2     |6.7286914818367975|
|11   |48.00333340962728 |47.7       |47.7     |6.82

In [71]:
#8
from pyspark.sql.functions import col, expr

# Define the file path and load Cincinnati 2017 data
cincinnati_2017_file = "./weather_data/2017_72429793812.csv"
cincinnati_2017_df = spark.read.option("header", "true").csv(cincinnati_2017_file)

# Convert TEMP and WDSP to float, filter based on conditions, and calculate Wind Chill
wind_chill_df = cincinnati_2017_df.withColumn("TEMP", col("TEMP").cast("float")) \
    .withColumn("WDSP", col("WDSP").cast("float")) \
    .filter((col("TEMP") < 50) & (col("WDSP") > 3)) \
    .withColumn("Wind_Chill",
        35.74 + (0.6215 * col("TEMP")) - (35.75 * (col("WDSP") ** 0.16)) + (0.4275 * col("TEMP") * (col("WDSP") ** 0.16))
    )

# Select and display the top 10 days with the lowest Wind Chill
top_10_lowest_wc = wind_chill_df.orderBy("Wind_Chill").select("NAME", "DATE", "TEMP", "WDSP", "Wind_Chill").limit(10)

print("\nTop 10 Days with the Lowest Wind Chill for Cincinnati in 2017:\n")
top_10_lowest_wc.show(truncate=False)



Top 10 Days with the Lowest Wind Chill for Cincinnati in 2017:

+------------------------------------------------+----------+----+----+-------------------+
|NAME                                            |DATE      |TEMP|WDSP|Wind_Chill         |
+------------------------------------------------+----------+----+----+-------------------+
|CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US|2017-01-07|10.5|7.0 |-0.4140156367932173|
|CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US|2017-12-31|11.0|5.3 |2.0339764741541018 |
|CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US|2017-12-27|13.0|5.8 |3.8206452986638073 |
|CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US|2017-12-28|13.6|5.8 |4.533355513517824  |
|CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US|2017-01-06|13.6|5.5 |4.868933492954463  |
|CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US|2017-01-08|15.9|5.2 |7.929747979856229  |
|CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US|2017-12-25|25.8|13.5|14.285112249501509 |
|CINCINNATI MUN

In [77]:
from pyspark.sql.functions import col

# Define file paths for Florida data across all years except 2016
florida_files = [f"./weather_data/{year}_99495199999.csv" for year in years if year != 2016]

# Load data for all available years for Florida
florida_df = spark.read.option("header", "true").csv(florida_files)

# Filter for days with any extreme weather condition in the FRSHTT column
extreme_weather_df = florida_df.filter(
    (col("FRSHTT").substr(1, 1) == "1") |  # Fog
    (col("FRSHTT").substr(2, 1) == "1") |  # Rain or Drizzle
    (col("FRSHTT").substr(3, 1) == "1") |  # Snow or Ice Pellets
    (col("FRSHTT").substr(4, 1) == "1") |  # Hail
    (col("FRSHTT").substr(5, 1) == "1") |  # Thunder
    (col("FRSHTT").substr(6, 1) == "1")    # Tornado or Funnel Cloud
)

# Count the number of days with extreme weather across all years (except 2016)
extreme_weather_days_count = extreme_weather_df.count()

# Display the result
print(f"Number of days with extreme weather conditions in Florida across all years (excluding 2016): {extreme_weather_days_count}")


Number of days with extreme weather conditions in Florida across all years (excluding 2016): 0


In [105]:
# 10
from pyspark.sql.functions import col, max as max_, year, month
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Define the years to use
years = [2022, 2023]

# Load data for Cincinnati from 2022 and 2023
cincinnati_files = [f"./weather_data/{y}_72429793812.csv" for y in years]
cincinnati_df = spark.read.option("header", "true").csv(cincinnati_files)

# Convert MAX to float and add YEAR and MONTH columns
cincinnati_df = cincinnati_df.withColumn("MAX", col("MAX").cast(FloatType())) \
                             .withColumn("YEAR", year(col("DATE"))) \
                             .withColumn("MONTH", month(col("DATE")))

# Filter to only valid MAX temperatures within a realistic range
cincinnati_df = cincinnati_df.filter(col("MAX").isNotNull()) \
                             .filter((col("MAX") > -50) & (col("MAX") < 150))

# Function to get maximum temperature for a specific year and month
def get_max_temp_for_month(df, year_value, month_value):
    result = df.filter((col("YEAR") == year_value) & (col("MONTH") == month_value)) \
               .agg(max_("MAX").alias("Max_Temp")) \
               .collect()
    return result[0]["Max_Temp"] if result else None

# Prepare lists to store the data
november_data = []
december_data = []

# Extract maximum temperatures for November and December from 2022 and 2023
for y in years:
    nov_max_temp = get_max_temp_for_month(cincinnati_df, y, 11)
    dec_max_temp = get_max_temp_for_month(cincinnati_df, y, 12)
    
    if nov_max_temp is not None:
        november_data.append((y, nov_max_temp))
    if dec_max_temp is not None:
        december_data.append((y, dec_max_temp))

# Check if we have enough data
if len(november_data) == 0:
    print("No valid data available for November.")
if len(december_data) == 0:
    print("No valid data available for December.")

# Convert data to DataFrames for modeling
november_df = spark.createDataFrame(november_data, ["YEAR", "Max_Temp"])
december_df = spark.createDataFrame(december_data, ["YEAR", "Max_Temp"])

# Option 1: Simple Average Prediction
def predict_by_average(df, month_name):
    avg_max_temp = df.agg({"Max_Temp": "avg"}).collect()[0][0]
    print(f"\nPredicted Maximum Temperature for {month_name} 2024 (Average Method): {avg_max_temp:.2f}°F")
    return avg_max_temp

# Option 2: Linear Regression Prediction
def predict_by_regression(df, month_name):
    # Prepare the data
    df = df.withColumn("YEAR_OFFSET", col("YEAR") - 2022)
    assembler = VectorAssembler(inputCols=["YEAR_OFFSET"], outputCol="features")
    df = assembler.transform(df)
    
    # Check if we have enough data points
    if df.count() < 2:
        print(f"Not enough data to perform regression for {month_name}.")
        return None
    
    # Train the model
    lr = LinearRegression(featuresCol="features", labelCol="Max_Temp", regParam=0.1)
    lr_model = lr.fit(df)
    
    # Prepare test data for 2024
    test_df = spark.createDataFrame([(2024 - 2022,)], ["YEAR_OFFSET"])
    test_df = assembler.transform(test_df)
    
    # Make prediction
    prediction = lr_model.transform(test_df).collect()[0]["prediction"]
    
    # Print model coefficients and metrics
    print(f"\nRegression Model for {month_name}:")
    print(f"Coefficients: {lr_model.coefficients}")
    print(f"Intercept: {lr_model.intercept:.2f}")
    training_summary = lr_model.summary
    print(f"R-squared: {training_summary.r2:.4f}")
    print(f"RMSE: {training_summary.rootMeanSquaredError:.4f}")
    print(f"Predicted Maximum Temperature for {month_name} 2024 (Regression Method): {prediction:.2f}°F")
    return prediction

# Predictions for November
if len(november_data) >= 2:
    print("\n--- November Predictions ---")
    nov_avg_pred = predict_by_average(november_df, "November")
    nov_reg_pred = predict_by_regression(november_df, "November")
elif len(november_data) == 1:
    print("\n--- November Predictions ---")
    nov_avg_pred = november_data[0][1]
    print(f"Only one data point available. Predicted Maximum Temperature for November 2024: {nov_avg_pred:.2f}°F")
else:
    print("Insufficient data to make predictions for November 2024.")

# Predictions for December
if len(december_data) >= 2:
    print("\n--- December Predictions ---")
    dec_avg_pred = predict_by_average(december_df, "December")
    dec_reg_pred = predict_by_regression(december_df, "December")
elif len(december_data) == 1:
    print("\n--- December Predictions ---")
    dec_avg_pred = december_data[0][1]
    print(f"Only one data point available. Predicted Maximum Temperature for December 2024: {dec_avg_pred:.2f}°F")
else:
    print("Insufficient data to make predictions for December 2024.")



--- November Predictions ---

Predicted Maximum Temperature for November 2024 (Average Method): 78.00°F

Regression Model for November:
Coefficients: [4.0090878636384]
Intercept: 76.00
R-squared: 0.9979
RMSE: 0.0955
Predicted Maximum Temperature for November 2024 (Regression Method): 84.01°F

--- December Predictions ---

Predicted Maximum Temperature for December 2024 (Average Method): 65.00°F

Regression Model for December:
Coefficients: [-1.8181818181818135]
Intercept: 65.91
R-squared: 0.9917
RMSE: 0.0909
Predicted Maximum Temperature for December 2024 (Regression Method): 62.27°F


                                                                                

In [None]:
#10 Reasoning
# By utilizing both simple averaging and linear regression on the recent two years of data (2022 and 2023), the model effectively predicts 
# the maximum temperatures for Cincinnati in November and December 2024. The averaging method provides a straightforward and reliable 
# estimate based on the most recent observations, while the linear regression introduces the potential to capture emerging trends, offering a
# deeper analysis. This dual approach performs well given the data constraints, highlighting the model's adaptability and effectiveness. 
# To further enhance accuracy and reliability, incorporating additional historical data would allow for better trend identification and
# reduce the risk of overfitting. Exploring advanced forecasting techniques like time-series models and including other relevant 
# variables such as climate indicators could also improve the model's predictive capabilities.