In [None]:
from pyspark.sql import SparkSession
import pandas as pd

# Build the notebook's Spark session: local master using all cores ("local[*]"),
# application name "612FinalProject"; getOrCreate() hands back an already
# running session instead of starting a second one.
spark = SparkSession.builder.master("local[*]").appName("612FinalProject").getOrCreate()

# Bare expression so the notebook renders the session summary as cell output.
spark

# Cleaning and Combining Weather Data
This step was perhaps the most time-consuming part of our data acquisition: although we tried to avoid it, getting the files into our database required some manual work.

https://acis.alberta.ca/weather-data-viewer.jsp  

To access the weather data, you choose a weather station and then download the data you need (daily, hourly, etc.). However, this website enforces a CAPTCHA, which, for our skill set, prevented an automated request. What was also interesting was that, because our AESO data had its own specific "regions", we had to be diligent in choosing a weather station that we believed existed within each defined AESO region.  

We referenced this AESO MAP, which shows the boundaries of each Region that is used for our AESO Load Data.  
https://www.arcgis.com/apps/View/index.html?appid=88859e0179b44b47b3e77b0b384a18a7  

This was super helpful as our Weather Data also had an interactive map on the data page which showed the location of each weather station so through overlaying them, we could effectively choose a weather station that fell within the region (this map can be seen in the weather data link provided above).

This still left us with the problem that a single weather data download could cover at most 184 days (while this was not ideal, it does give us a great opportunity to demonstrate our data parsing and combining skills, which will be done below).  

So, after identifying a station that fell into each of the AESO regions, we downloaded 3 files of HOURLY weather data for each station, corresponding to 2023-11-01 - 2024-05-05, 2024-05-06 - 2024-10-01, and finally 2024-10-02 - 2024-12-31. These dates are a part of the download file name for our WEATHER DATA, and together they cover the timeline of our AESO LOAD data, which runs from 2023-11-01 to 2024-12-31 and is also HOURLY. This was intentional, so that we can match our data sets perfectly.  

However, because assigning a weather station to an AESO region was a manual decision with no clear way to automate it, each weather station CSV was simply renamed by adding the number of the region we chose to the end of the CSV name. These files are found in our data folder under WeatherDataRaw; everything there is otherwise untouched and represents the data of 42 weather stations (one for each of the 42 AESO regions) x 3 (as we had to download the weather data in 3 chunks for each station).  

Below is our process of combining the 3 sets of data for the station into 1 dataframe for each station.

In [None]:
import os
import glob
import pandas as pd

weather_dir = "../data/WeatherDataRaw"

# Column in the downloaded weather data that holds the hourly date and time;
# it is parsed to datetimes so each station's rows can be put in time order.
date_col = "Date (Local Standard Time)"


def code_sort_key(code):
    """Return a sort key that orders numeric region codes by value.

    Plain string sorting puts '13' before '4'; this key puts '4' first.
    Codes that are not purely numeric sort after all numeric ones,
    alphabetically.
    """
    return (0, int(code), "") if code.isdecimal() else (1, 0, code)


def group_files_by_code(paths):
    """Group ACIS file paths by the region code at the end of the file name.

    File names look like ``ACISHourlyData-20231101-20240505-4.csv``: the
    second dash-separated piece is the start date and the last piece is the
    region code. Each name must contain at least one '-' (the glob pattern
    used below guarantees this).

    Returns a dict mapping code -> list of ``(start_date, path)`` tuples.
    """
    grouped = {}
    for path in paths:
        # splitext drops the extension whatever its length or letter case.
        stem, _ext = os.path.splitext(os.path.basename(path))
        parts = stem.split("-")          # ['ACISHourlyData','20231101','20240505','4']
        start_date = parts[1]            # '20231101'
        code = parts[-1]                 # '4', '6', '13', ...
        grouped.setdefault(code, []).append((start_date, path))
    return grouped


def combine_station_frames(frames, date_col):
    """Concatenate one station's chunks and order the rows by ``date_col``.

    If ``date_col`` is absent the rows are left in concatenation order and a
    warning is printed, so the problem is visible instead of silent.
    """
    combined = pd.concat(frames, ignore_index=True)
    if date_col in combined.columns:
        combined[date_col] = pd.to_datetime(combined[date_col])
        combined = combined.sort_values(date_col).reset_index(drop=True)
    else:
        print(f"Warning: column '{date_col}' not found; rows left in file order")
    return combined


# Find all the ACIS files
weather_files = glob.glob(os.path.join(weather_dir, "ACISHourlyData-*.csv"))
print(f"Found {len(weather_files)} weather CSV files")

# 1) Group file paths by station code (last piece before .csv)
files_by_code = group_files_by_code(weather_files)

# Numeric order, so the printed list can be copied straight into later cells
# and the per-station log below is reproducible from run to run.
station_codes = sorted(files_by_code, key=code_sort_key)
print("Codes found:", station_codes)

# 2) For each code, read its files in date order and make a combined DataFrame
for code in station_codes:
    # Start dates are zero-padded YYYYMMDD strings, so sorting them as text
    # is also chronological order.
    lst_sorted = sorted(files_by_code[code], key=lambda t: t[0])

    dfs = []
    for start_date, path in lst_sorted:
        print(f"Reading code {code} file:", os.path.basename(path))
        # latin1 avoids the UnicodeDecodeError hit with the default codec.
        # NOTE(review): if the files are really UTF-8, latin1 turns the
        # degree sign in the headers into two characters ('Â°'); downstream
        # column names must match whatever this decoding yields.
        dfs.append(pd.read_csv(path, encoding="latin1"))

    combined = combine_station_frames(dfs, date_col)

    # Later cells look these frames up by name (combined_data_<code>), so
    # they are published as notebook-level variables.
    var_name = f"combined_data_{code}"
    globals()[var_name] = combined

    # Sanity check: every station should report the same row count if all
    # downloads cover the same dates and nothing was lost while combining.
    print(f"Created variable '{var_name}' with {len(combined)} rows")

print("Done combining all station codes.")



In [None]:
# Preview the first five rows of the combined frame for region code 4
# (Medicine Hat weather station, which is also the AESO region name).
# The rows look good, so we can move on.
combined_data_4.head(n=5)

Okay, this is all working beautifully, as you can see above.  

This is exactly what we wanted: all the regions (and their codes) were found correctly, which means our data looks good, and every combined data set has the same number of rows, 10247, which means there are no rows MISSING! From the output, which we extended, we can see that the data is merged in the correct order, so we have a complete set for each weather station from November 1st 2023 to December 31st 2024. Again, this aligns beautifully with our AESO data, which we will work on next so that we can eventually combine both once it is cleaned (and joined to our Urban and Rural classifier). Awesome. There may still be NaN/NULL values, etc., in Air Temp. Inst. and Precip. (mm), but we can check for that with no problem.  

Let's now combine all of our data into one complete data set, so that we have the base for our PIPELINE to be executed on, as we learned in 612.

In [None]:

import numpy as np
import pandas as pd
# Using the exact loop that I printed above which gave me this list - we also of course have hard copies everywhere.
# Region codes of the combined station frames, in numeric order. Each one must
# already exist as a notebook variable named combined_data_<code>.
codes = ['4', '6', '13', '17', '18', '19', '20', '21', '22', '23',
         '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
         '34', '35', '36', '37', '38', '39', '40', '42', '43', '44',
         '45', '46', '47', '48', '49', '52', '53', '54', '55', '56',
         '57', '60']

# Accepted spellings of the precipitation column, in order of preference.
precip_candidates = (
    "Precip. (mm)",
    "Precip.(mm)",
    "Precip. Amount (mm)",
)

# Accepted spellings of the instantaneous air temperature column. The first is
# the header as it appears when the degree sign has been mis-decoded into two
# characters; the second is the same header with a correctly decoded sign.
temp_candidates = (
    "Air Temp. Inst. (Â°C)",
    "Air Temp. Inst. (°C)",
)


def find_first_column(columns, candidates):
    """Return the first name in ``candidates`` present in ``columns``, else None."""
    return next((name for name in candidates if name in columns), None)


def find_temp_column(columns):
    """Return the air temperature column name, tolerating header encoding.

    Exact spellings in ``temp_candidates`` win. Otherwise any column that
    starts with ``"Air Temp. Inst. ("`` and ends with ``"C)"`` is accepted,
    whatever characters the degree sign was decoded into. Returns None when
    nothing matches.
    """
    exact = find_first_column(columns, temp_candidates)
    if exact is not None:
        return exact
    for col in columns:
        name = str(col)
        if name.startswith("Air Temp. Inst. (") and name.endswith("C)"):
            return col
    return None


frames = []

for code in codes:
    # Work on a copy so the per-station frames stay untouched.
    df = globals()[f"combined_data_{code}"].copy()

    # attach area code
    df["area_code"] = int(code)

    # Temperature is required for modeling: fail with a message that names
    # the station instead of an opaque KeyError on the column selection below.
    temp_col = find_temp_column(df.columns)
    if temp_col is None:
        raise KeyError(
            f"No air temperature column found for area code {code}; "
            f"columns are: {list(df.columns)}"
        )

    # build rename map
    rename_map = {
        "Date (Local Standard Time)": "timestamp",
        temp_col: "temp_c",
    }

    # Precipitation is optional: rename it when the station reports it.
    precip_col = find_first_column(df.columns, precip_candidates)
    if precip_col is not None:
        rename_map[precip_col] = "precip_mm"

    df = df.rename(columns=rename_map)

    # Stations without any precipitation column get an all-NaN one so every
    # frame has the same four columns.
    if "precip_mm" not in df.columns:
        df["precip_mm"] = np.nan

    # keep only modeling columns
    frames.append(df[["area_code", "timestamp", "temp_c", "precip_mm"]])

# combine all stations
weather_all_pd = pd.concat(frames, ignore_index=True)

# ensure timestamp is datetime
weather_all_pd["timestamp"] = pd.to_datetime(weather_all_pd["timestamp"])

print("Area codes in weather:", sorted(weather_all_pd["area_code"].unique()))
print(weather_all_pd.head())

# Write everything back to a single CSV so it can be loaded into Spark.
weather_all_pd.to_csv("../data/weather_all_areas_hourly.csv", index=False)
print("Saved ../data/weather_all_areas_hourly.csv")
