In [39]:
# path = '/content/drive/MyDrive/response_1733276056602.json'
path = './raw_params.json'

In [40]:

import json

# Open and read the JSON file
with open(path, 'r') as file:
    data = json.load(file)

print(data)

{'PRECSNOLAND': {'type': 'METEOROLOGY', 'temporal': 'HOURLY', 'source': 'MERRA2', 'community': 'RE', 'calculated': False, 'inputs': 'null', 'units': 'mm/hour', 'name': 'Snow Precipitation Land', 'definition': 'The snow precipitation only over land at the surface of the earth.'}, 'PRECTOTCORR': {'type': 'METEOROLOGY', 'temporal': 'HOURLY', 'source': 'MERRA2', 'community': 'RE', 'calculated': False, 'inputs': 'null', 'units': 'mm/hour', 'name': 'Precipitation Corrected', 'definition': 'The bias corrected average of total precipitation at the surface of the earth in water mass (includes water content in snow).'}, 'PS': {'type': 'METEOROLOGY', 'temporal': 'HOURLY', 'source': 'MERRA2', 'community': 'RE', 'calculated': False, 'inputs': 'null', 'units': 'kPa', 'name': 'Surface Pressure', 'definition': 'The average of surface pressure at the surface of the earth.'}, 'QV10M': {'type': 'METEOROLOGY', 'temporal': 'HOURLY', 'source': 'MERRA2', 'community': 'RE', 'calculated': False, 'inputs': 'nul

In [41]:
## Extract ALLSKY_SFC_SW_DWN
solar_irradiance = {
    "ALLSKY_SFC_SW_DWN": data.pop("ALLSKY_SFC_SW_DWN")
}
print(solar_irradiance)

## Extract WS50M

wind_speed = {
    "WS50M": data.pop("WS50M")
}
print(wind_speed)

## Remove elevation parameters (they requires elevation input)
data.pop("WSC")
data.pop("PSC")

## Remove snow related parameters
data.pop("PRECSNOLAND")
data.pop("SNODP")

{'ALLSKY_SFC_SW_DWN': {'type': 'RADIATION', 'temporal': 'HOURLY', 'source': 'CERES', 'community': 'RE', 'calculated': False, 'inputs': 'null', 'units': 'Wh/m^2', 'name': 'All Sky Surface Shortwave Downward Irradiance', 'definition': 'The total solar irradiance incident (direct plus diffuse) on a horizontal plane at the surface of the earth under all sky conditions. An alternative term for the total solar irradiance is the "Global Horizontal Irradiance" or GHI.'}}
{'WS50M': {'type': 'METEOROLOGY', 'temporal': 'HOURLY', 'source': 'POWER', 'community': 'RE', 'calculated': True, 'inputs': ['U50M', 'V50M'], 'units': 'm/s', 'name': 'Wind Speed at 50 Meters', 'definition': 'The average of wind speed at 50 meters above the surface of the earth.'}}


{'type': 'METEOROLOGY',
 'temporal': 'HOURLY',
 'source': 'MERRA2',
 'community': 'RE',
 'calculated': False,
 'inputs': 'null',
 'units': 'cm',
 'name': 'Snow Depth',
 'definition': 'The snow depth on land at surface of the earth.'}

In [42]:
import random

keys = list(data.keys())

set_size = 3
sets = []
for i in range(0, len(keys), set_size):
    sets.append(keys[i:i+set_size])

batch_params = []
for idx,s in enumerate(sets):
  batch_params.append("")
  for key in s:
    batch_params[idx] = batch_params[idx] + "{},".format(key)
  batch_params[idx] = batch_params[idx][:-1]

batch_params

['PRECTOTCORR,PS,QV10M',
 'QV2M,RH2M,T2M',
 'TS,U10M,U2M',
 'U50M,V10M,V2M',
 'V50M,T2MDEW,T2MWET',
 'WD10M,WD2M,WD50M',
 'WS10M,WS2M,ALLSKY_SFC_LW_DWN',
 'ALLSKY_SFC_SW_DIFF,ALLSKY_SFC_UV_INDEX,ALLSKY_SFC_UVA',
 'ALLSKY_SFC_UVB,AOD_55,AOD_84',
 'CLOUD_AMT,CLOUD_OD,CLRSKY_SFC_LW_DWN',
 'CLRSKY_SFC_SW_DIFF,CLRSKY_SFC_SW_DWN,PW',
 'SZA,TOA_SW_DWN,ALLSKY_KT',
 'ALLSKY_NKT,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_SW_DNI',
 'ALLSKY_SRF_ALB,CLRSKY_KT,CLRSKY_NKT',
 'CLRSKY_SFC_PAR_TOT,CLRSKY_SFC_SW_DNI,CLRSKY_SRF_ALB',
 'TOA_SW_DNI']

In [43]:
import pandas as pd
import re
from io import StringIO

def read_data_with_dynamic_header(file_path):
    """Reads data from a file, identifying the header row and creating a DataFrame.

    Args:
        file_path (str): Path to the data file.

    Returns:
        pandas.DataFrame: DataFrame containing the data.
    """

    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Find the header row index
    header_row_index = None
    for i, line in enumerate(lines):
        if re.search(r'YEAR,MO,DY,HR\.*', line):
            header_row_index = i
            break

    if header_row_index is None:
        raise ValueError("Header row not found.")

    # Extract data from the header row and subsequent rows
    df = pd.read_csv(file_path, skiprows=header_row_index)
    # Create a datetime column by combining the year, month, day, and hour columns
    df['MIN'] = 0
    df['SEC'] = 0
        # Ensure consistent formatting for datetime parsing
    df['YEAR'] = df['YEAR'].astype(str)
    df['MO'] = df['MO'].astype(str).str.zfill(2)  # Pad month with leading zeros
    df['DY'] = df['DY'].astype(str).str.zfill(2)  # Pad day with leading zeros
    df['HR'] = df['HR'].astype(str).str.zfill(2)  # Pad hour with leading zeros


    # Create the datetime column
    df['DATETIME'] = pd.to_datetime(
        df[['YEAR', 'MO', 'DY']].astype(str).agg('-'.join, axis=1) + ' ' + df[['HR', 'MIN', 'SEC']].astype(str).agg(':'.join, axis=1),
        # format='%Y-%m-%d %H:%M:%S'
        ) #

    # Insert the DATETIME column as the first column
    df.insert(0, 'DATETIME', df.pop('DATETIME'))

    df.drop(['YEAR', 'MO', 'DY', 'HR', 'MIN', 'SEC'], axis = 1, inplace = True)

    return df

# # Example usage:
# file_path = '/content/drive/MyDrive/Q-RIEnTe Data Analysis/NASA POWER/Bauan Batangas/FEATURES/20130101_20231231_QV2M,RH2M,T2M.csv'  # Replace with your actual file path

# df = read_data_with_dynamic_header(file_path)

# print(df.head())



In [44]:
import requests

import os
import requests
from time import sleep

def request_data(params, user="neeeal", start=None, end=None, latitude=None, longitude=None):
    """Fetches hourly power data from NASA's Power API.

    Args:
        params (str): Comma-separated list of power parameters to retrieve.
        user (str, optional): NASA Power user ID (default: "neeeal").
        start (str, optional): Start date and time in YYYY-MM-DDTHH:MMZ format.
        end (str, optional): End date and time in YYYY-MM-DDTHH:MMZ format.
        latitude (float, optional): Latitude of the point of interest.
        longitude (float, optional): Longitude of the point of interest.

    Returns:
        requests.Response: The response object containing the data.

    Raises:
        ValueError: If required parameters (start, end, latitude, or longitude)
                    are missing.
    """

    base_url = "https://power.larc.nasa.gov/api/temporal/hourly/point"

    # Validate and handle missing required parameters
    if not all([start, end, latitude, longitude]):
        raise ValueError("Missing required parameters: start, end, latitude, or longitude.")

    query_params = {
        "start": start,
        "end": end,
        "latitude": latitude,
        "longitude": longitude,
        "community": "re",
        "parameters": params,
        "format": "csv",
        "user": user,
        "header": "true",
        "time-standard": "lst"
    }

    url = requests.compat.urljoin(base_url, "?" + requests.compat.urlencode(query_params))

    print(url)

    retries = 16
    for attempt in range(retries):
        try:
            response = requests.get(url)
            response.raise_for_status()  # Will raise an HTTPError for bad responses (4xx, 5xx)
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt+1} failed: {e}")
            if attempt < retries - 1:
                print("Retrying...")
                sleep(2)  # Wait for 2 seconds before retrying
            else:
                print("All attempts failed.")
                raise  # Re-raise the last exception after 3 failed attempts

def save_response_content(path, response):
  if response.status_code == 200:
    with open(path, "wb") as f:
      f.write(response.content)
    print("Data downloaded successfully!")
  else:
    print("Error:", response.content)

import pandas as pd
## edit merging to account many times

import os
import pandas as pd


def merge_dataframes(features_path, target_path, feature_filenames, target_filenames):
    """Merge data from multiple periods (2001–2008, 2008–2013, 2014–2024) into one final long dataset."""

    # Define explicit year periods
    periods = [
        # (2001, 2008),
        # (2008, 2013),
        (2024, 2025)
    ]

    all_periods_dfs = []

    for start_year, end_year in periods:
        period_features_df = None
        period_target_df = None

        # --- Target files ---
        for filename in target_filenames:
            print("target "+filename)
            year_match = re.search(r"\d{4}", filename)
            if year_match:
                file_year = int(year_match.group())
                if start_year <= file_year <= end_year:
                    df = read_data_with_dynamic_header(os.path.join(target_path, filename))
                    if period_target_df is None:
                        period_target_df = df
                    else:
                        period_target_df = pd.merge(period_target_df, df, on="DATETIME", how="outer")

        # --- Feature files ---
        for filename in feature_filenames:
            print("feature " + filename)
            
            year_match = re.search(r"\d{4}", filename)
            if year_match:
                file_year = int(year_match.group())
                if start_year <= file_year <= end_year:
                    df = read_data_with_dynamic_header(os.path.join(features_path, filename))
                    if period_features_df is None:
                        period_features_df = df
                    else:
                        period_features_df = pd.merge(period_features_df, df, on="DATETIME", how="outer")

        # Merge for this period
        if period_target_df is not None and period_features_df is not None:
            period_df = pd.merge(period_target_df, period_features_df, on="DATETIME", how="outer")
            all_periods_dfs.append(period_df)
    
    
    print("merging...")
    # Combine all periods
    final_merged_df = pd.concat(all_periods_dfs, axis=0)

    print("cleaning...")
    # Sort and clean
    final_merged_df = final_merged_df.sort_values("DATETIME").reset_index(drop=True)

    return final_merged_df


In [45]:
import pandas as pd

# Load the CSV
df = pd.read_csv("./quezon_city_barangay_centroids.csv")

# Convert to list of dictionaries
locations = [
    {
        "name": row["Barangay"],
        "latitude": row["Latitude"],
        "longitude": row["Longitude"]
    }
    for _, row in df.iterrows()
]

# Show the first few entries to verify
print(locations[:5])

## declare times dicts
times = [
    # {"start":20010101, "end":20081231},
    # {"start":20090101, "end":20131231},
    # {"start":20140101, "end":20241231},
    {"start":20240101, "end":20250831},
    ]



[{'name': 'Alicia', 'latitude': 14.660309550861587, 'longitude': 121.02300946967053}, {'name': 'Amihan', 'latitude': 14.63186565204424, 'longitude': 121.06640018366905}, {'name': 'Apolonio Samson', 'latitude': 14.65467348821497, 'longitude': 121.00587776990444}, {'name': 'Aurora', 'latitude': 14.616437108259282, 'longitude': 121.0059443453384}, {'name': 'Baesa', 'latitude': 14.668135268649293, 'longitude': 121.01026451705674}]


In [46]:
# base_path = "/content/drive/MyDrive/Q-RIEnTe Project/Datasets"
base_path = "./"

In [47]:
import os

def create_folder(
  new_folder, 
  base_path#=base_path
  ):
  new_path = os.path.join(base_path, new_folder) #"NASA POWER"
  os.makedirs(new_path, exist_ok=True)

  return new_path

In [48]:
nasa_path = create_folder("NASA Power QC 2001-2024", base_path)

In [49]:
# import pandas as pd
# import numpy as np
# from scipy.interpolate import interp1d

# def resample_df(df, resolution):
#   df = df.resample(resolution).max()
#   return df

# # Define a function to interpolate a single column
# def interpolate_df(df):
#   df = df.interpolate(method='time')
#   return df

# # Apply the interpolation function to each column

In [50]:
import pandas as pd
import numpy as np
from datetime import timedelta, datetime, time
from functools import lru_cache

# Set of parameters that should be zero or have special values at night
SOLAR_PARAMETERS = {
    'ALLSKY_SFC_SW_DIFF', 'ALLSKY_SFC_SW_DWN', 'ALLSKY_SFC_UV_INDEX',
    'ALLSKY_SFC_UVA', 'ALLSKY_SFC_UVB', 'CLRSKY_SFC_SW_DIFF',
    'CLRSKY_SFC_SW_DWN', 'TOA_SW_DWN', 'ALLSKY_KT', 'ALLSKY_NKT',
    'ALLSKY_SFC_PAR_TOT', 'ALLSKY_SFC_SW_DNI', 'CLRSKY_KT', 'CLRSKY_NKT',
    'CLRSKY_SFC_PAR_TOT', 'CLRSKY_SFC_SW_DNI', 'TOA_SW_DNI',
    'ALLSKY_SRF_ALB', 'CLRSKY_SRF_ALB'
}

# Special parameter that should have value > 90 at night
SZA_PARAMETER = 'SZA'
SZA_NIGHT_VALUE = 90.0  # Default value for solar zenith angle at night (> 90 degrees)

def is_nighttime(timestamp, latitude=None, longitude=None):
    """
    Determine if a timestamp is during nighttime based on a simple heuristic.
    For more accurate results, proper sun position calculations using latitude/longitude should be used.
    
    This simple version treats 8 PM - 6 AM as nighttime.
    """
    # Simple heuristic - consider 8 PM to 6 AM as nighttime
    hour = timestamp.hour
    return hour >= 20 or hour < 6

def interpolate_short_gaps(series, max_gap=8, param_name=None):
    is_nan = series.isna()
    group = (is_nan != is_nan.shift()).cumsum()
    gap_lengths = is_nan.groupby(group).transform('sum')
    to_interp = (is_nan) & (gap_lengths <= max_gap)
    
    # Only process if there are short gaps to fill
    if not to_interp.any():
        return series.copy()
    
    result = series.copy()
    
    # Set nighttime values for solar parameters before interpolation
    if param_name in SOLAR_PARAMETERS or param_name == SZA_PARAMETER:
        # Find timestamps that are nighttime and in the interpolation group
        night_times = pd.Series(
            [is_nighttime(ts) for ts in result.index],
            index=result.index
        )
        
        # For nighttime gaps in solar parameters, set appropriate values
        night_gaps = to_interp & night_times
        if night_gaps.any():
            if param_name in SOLAR_PARAMETERS:
                # Solar radiation parameters should be 0 at night
                result.loc[night_gaps] = 0.0
            elif param_name == SZA_PARAMETER:
                # Solar zenith angle should be > 90 at night
                result.loc[night_gaps] = SZA_NIGHT_VALUE
            
            # Update to_interp to exclude nighttime gaps that we've now filled
            to_interp = to_interp & ~night_times
    
    # If we still have gaps to interpolate (daytime or non-solar parameters)
    if to_interp.any():
        # Need to interpolate on the entire series, not just the gaps
        temp = result.copy()
        temp.loc[to_interp] = np.nan  # Ensure only target gaps are filled
        interpolated = temp.interpolate(method='time')
        
        # Only copy over the values for the short gaps we want to fill
        result.loc[to_interp] = interpolated.loc[to_interp]
    
    return result

@lru_cache(maxsize=128)
def _get_time_key(hour, minute=0):
    """Cached helper to create time objects for faster lookup"""
    return time(hour=hour, minute=minute)

def fill_local_climatology(series, window_days=15, min_samples=3, param_name=None):
    """
    Fill gaps using local climatology based on similar day-of-year and time-of-day.
    Handles solar parameters specifically for nighttime.
    """
    if series.isna().sum() == 0:
        return series
    
    filled = series.copy()
    
    # Handle special case for solar parameters at night first
    if param_name in SOLAR_PARAMETERS or param_name == SZA_PARAMETER:
        # Find nighttime missing values
        missing_night = pd.Series([is_nighttime(ts) for ts in filled[filled.isna()].index], 
                                  index=filled[filled.isna()].index)
        
        # Set appropriate values for nighttime
        if missing_night.any():
            night_idx = missing_night[missing_night].index
            if param_name in SOLAR_PARAMETERS:
                filled.loc[night_idx] = 0.0
            elif param_name == SZA_PARAMETER:
                filled.loc[night_idx] = SZA_NIGHT_VALUE
    
    # If all values are filled now, return early
    if filled.isna().sum() == 0:
        return filled
    
    # Pre-compute frequency once
    if len(series) > 0:
        freq = pd.infer_freq(series.index)
        if freq is None:
            # Try to estimate frequency from first few intervals
            if len(series) > 10:
                intervals = series.index[1:10] - series.index[0:9]
                most_common = pd.Series(intervals).mode()[0]
                if most_common == timedelta(minutes=5):
                    freq = '5min'
                elif most_common == timedelta(hours=1):
                    freq = 'H'
                else:
                    freq = 'H'
            else:
                freq = 'H'
    else:
        freq = 'H'
    
    # Pre-compute dayofyear for the entire index
    days_of_year = pd.Series(series.index.dayofyear, index=series.index)
    
    # Group missing values by time of day to process in batches
    missing_times = {}
    missing_idx = filled[filled.isna()].index
    
    for ts in missing_idx:
        tod = ts.time()
        if tod not in missing_times:
            missing_times[tod] = []
        missing_times[tod].append(ts)
    
    # Process batches of missing values with the same time of day
    for tod, timestamps in missing_times.items():
        hour = tod.hour
        minute = tod.minute
        
        # Special handling for solar parameters during daytime
        is_daytime_tod = not is_nighttime(datetime.combine(datetime.today().date(), tod))
        
        # If it's a solar parameter and nighttime, skip climatology (should already be set to 0)
        if not is_daytime_tod and (param_name in SOLAR_PARAMETERS or param_name == SZA_PARAMETER):
            continue
        
        # Pre-filter candidates by time of day once per batch
        if freq == '5min':
            time_filter = (series.index.hour == hour) & (abs(series.index.minute - minute) <= 5)
        else:  # Hourly
            time_filter = series.index.hour == hour
        
        time_filtered = series[time_filter]
        
        # Process each timestamp in this time-of-day batch
        for ts in timestamps:
            doy = ts.dayofyear
            
            # Define window around the target day
            window_start_doy = (doy - window_days) % 366
            window_end_doy = (doy + window_days) % 366
            
            # Find matching timestamps within the window
            if window_start_doy < window_end_doy:
                doy_mask = (days_of_year >= window_start_doy) & (days_of_year <= window_end_doy)
            else:  # Window wraps around the year
                doy_mask = (days_of_year >= window_start_doy) | (days_of_year <= window_end_doy)
            
            # Combine filters for final candidate selection
            candidates_idx = time_filtered.index[doy_mask.loc[time_filtered.index]]
            candidates = series.loc[candidates_idx].dropna()
            
            # Remove the target timestamp itself
            candidates = candidates[candidates.index != ts]
            
            # Set value if we have enough samples
            if len(candidates) >= min_samples:
                filled.at[ts] = candidates.median()
    
    # Handle remaining missing values with global climatology
    if filled.isna().any():
        # Compute global climatology by time of day
        climatology_dict = {}
        
        # Group by hour and minute for 5min data, just hour for hourly data
        if freq == '5min':
            for hr in range(24):
                # Skip nighttime hours for solar parameters
                if (param_name in SOLAR_PARAMETERS or param_name == SZA_PARAMETER) and (hr >= 20 or hr < 6):
                    continue
                    
                for mn in range(0, 60, 5):
                    mask = (series.index.hour == hr) & (series.index.minute == mn)
                    values = series.loc[mask].dropna()
                    if not values.empty:
                        climatology_dict[(hr, mn)] = values.median()
        else:
            for hr in range(24):
                # Skip nighttime hours for solar parameters
                if (param_name in SOLAR_PARAMETERS or param_name == SZA_PARAMETER) and (hr >= 20 or hr < 6):
                    continue
                    
                mask = series.index.hour == hr
                values = series.loc[mask].dropna()
                if not values.empty:
                    climatology_dict[(hr, 0)] = values.median()
        
        # Fill remaining missing values
        for ts in filled[filled.isna()].index:
            # For solar parameters at night, use 0 or appropriate value
            if (param_name in SOLAR_PARAMETERS or param_name == SZA_PARAMETER) and is_nighttime(ts):
                if param_name in SOLAR_PARAMETERS:
                    filled.at[ts] = 0.0
                else:  # SZA
                    filled.at[ts] = SZA_NIGHT_VALUE
                continue
                
            key = (ts.hour, ts.minute if freq == '5min' else 0)
            if key in climatology_dict:
                filled.at[ts] = climatology_dict[key]
    
    return filled

def impute_time_series(series, max_short_gap=12, window_days=15, param_name=None):
    print(f"Initial NaN count: {series.isna().sum()}")

    # Special handling for solar parameters - set nighttime NaNs to 0 first
    if param_name in SOLAR_PARAMETERS or param_name == SZA_PARAMETER:
        # Find nighttime values
        night_mask = pd.Series([is_nighttime(ts) for ts in series.index], index=series.index)
        missing_night = series.isna() & night_mask
        
        if missing_night.any():
            series_copy = series.copy()
            if param_name in SOLAR_PARAMETERS:
                series_copy.loc[missing_night] = 0.0
            elif param_name == SZA_PARAMETER:
                series_copy.loc[missing_night] = SZA_NIGHT_VALUE
            series = series_copy
            print(f"Set {missing_night.sum()} nighttime values to appropriate constants")

    partially_filled = interpolate_short_gaps(series, max_gap=max_short_gap, param_name=param_name)
    print(f"NaN count after short gap interpolation: {partially_filled.isna().sum()}")

    if partially_filled.isna().any():
        partially_filled = fill_local_climatology(partially_filled, window_days=window_days, param_name=param_name)
        print(f"NaN count after climatology filling: {partially_filled.isna().sum()}")

    return partially_filled

def resample_and_interpolate(df, resolution="5min", max_short_gap=8, window_days=15):
    """
    Resample a dataframe to desired resolution and fill gaps with appropriate methods.
    """
    print("Initial -999 count:", (df == -999).sum().sum())
    df = df.replace(-999, np.nan)
    print("Initial NaN count after replacement:", df.isna().sum().sum())
    
    # Convert index to datetime if needed
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)
    
    # Check if input is Series or DataFrame
    is_series = isinstance(df, pd.Series)
    
    # Convert Series to DataFrame for consistent processing
    if is_series:
        df = df.to_frame(name='value')
    
    # Resample to target resolution
    resampled = df.resample(resolution).mean()
    
    # Pre-allocate result dataframe
    result = pd.DataFrame(index=resampled.index, columns=df.columns)
    
    # Process each column
    for col in df.columns:
        print(resolution, " ", col)
        # Skip processing if column is all NaN
        if resampled[col].isna().all():
            result[col] = resampled[col]
            continue
            
        # Pass column name to imputation functions for parameter-specific handling
        result[col] = impute_time_series(
            resampled[col], 
            max_short_gap=max_short_gap,
            window_days=window_days,
            param_name=col
        )
    
    # Return Series if input was Series
    if is_series:
        return result['value']
    
    return result

In [51]:
# import pandas as pd
# import os

# path = "../datasets/NASA Power (Davao)/Structured"
# locations = os.listdir(path)
# true_path = [os.path.join(path, os.path.join(l, os.listdir(os.path.join(path, l))[0])) for l in locations]

# for p in true_path:

#     df = pd.read_csv(p)

#     df["SZA"] = df["SZA"].replace(100,90)

#     df.to_csv(p)

In [52]:
def save_features(location_path, time, params, location):
    """Saves feature data for a given location, time period, and parameters.

    Args:
        location_path (str): Path to the location directory.
        time (dict): Dictionary containing 'start' and 'end' timestamps.
        params (str): Comma-separated list of parameters.
    """

    response = request_data(params=params, start=time["start"], end=time["end"],
                            latitude=location["latitude"], longitude=location["longitude"])
    filename = f"{time['start']}_{time['end']}_{params}.csv"
    save_response_content(os.path.join(location_path, filename), response)

def save_target(location_path, time, target_type, location):
    """Saves target data (solar irradiance or wind speed) for a given location and time period.

    Args:
        location_path (str): Path to the location directory.
        time (dict): Dictionary containing 'start' and 'end' timestamps.
        target_type (str): 'solar_irradiance' or 'wind_speed'.
    """

    if target_type == 'ghi':
        params = list(solar_irradiance.keys())[0]
    elif target_type == 'wind_speed':
        params = list(wind_speed.keys())[0]
    else:
        raise ValueError("Invalid target type")

    response = request_data(params=params, start=time["start"], end=time["end"],
                            latitude=location["latitude"], longitude=location["longitude"])
    filename = f"{time['start']}_{time['end']}_{target_type}_{params}.csv"
    print(response)
    save_response_content(os.path.join(location_path, filename), response)

def merge_and_save(features_path, time, output_path, target_path, resolution = "5min"):
    """Merges feature files and saves the merged DataFrame.

    Args:
        location_path (str): Path to the location directory.
        time (dict): Dictionary containing 'start' and 'end' timestamps.
    """
    # Get filenames from directories
    feature_filenames = os.listdir(features_path)
    target_filenames = os.listdir(target_path)

    # Merge the DataFrames
    merged_df = merge_dataframes(features_path, target_path, feature_filenames, target_filenames)

    ## saving structured
    filename = f"{time['start']}_{time['end']}_STRUCTURED.csv"
    merged_df = merged_df.set_index("DATETIME")
    
    ##Hourly
    df_interpolated = resample_and_interpolate(merged_df, resolution="h")
    print("saving "+filename)
    df_interpolated.to_csv(os.path.join(output_path, filename))

    ## Only doing hourly first
    
    ## saving strctured resampled and interpolated
    ##5-minute
    # df_interpolated = resample_and_interpolate(merged_df)
    # filename_interpolated = f"{time['start']}_{time['end']}_STRUCTURED_{resolution}.csv"
    # # print(filename_interpolated)
    # print("saving "+filename_interpolated)
    # df_interpolated.to_csv(os.path.join(output_path, filename_interpolated))


In [53]:
## unstructured
def main(times, locations, unstructured_base_path = nasa_path, structured_base_path = nasa_path):
  # Main loop
  for location in locations:
    print(location)
    unstructured_path = create_folder("Unstructured",unstructured_base_path)
    structured_path = create_folder("Structured",structured_base_path)

    unstructured_location_path = create_folder(location["name"], unstructured_path)
    structured_location_path = create_folder(location["name"], structured_path)

    for time in times:
      features_location_path = create_folder("FEATURES", unstructured_location_path)
      for params in batch_params:
          save_features(features_location_path, time, params, location)

      target_location_path = create_folder("TARGET", unstructured_location_path)
      save_target(target_location_path, time, 'ghi', location)
      save_target(target_location_path, time, 'wind_speed', location)

      print(structured_location_path)
      # merge_and_save(features_location_path, time, structured_location_path, target_location_path)

In [54]:
main(times, locations)

{'name': 'Alicia', 'latitude': 14.660309550861587, 'longitude': 121.02300946967053}
https://power.larc.nasa.gov/api/temporal/hourly/point?start=20240101&end=20250831&latitude=14.660309550861587&longitude=121.02300946967053&community=re&parameters=PRECTOTCORR%2CPS%2CQV10M&format=csv&user=neeeal&header=true&time-standard=lst
Data downloaded successfully!
https://power.larc.nasa.gov/api/temporal/hourly/point?start=20240101&end=20250831&latitude=14.660309550861587&longitude=121.02300946967053&community=re&parameters=QV2M%2CRH2M%2CT2M&format=csv&user=neeeal&header=true&time-standard=lst
Data downloaded successfully!
https://power.larc.nasa.gov/api/temporal/hourly/point?start=20240101&end=20250831&latitude=14.660309550861587&longitude=121.02300946967053&community=re&parameters=TS%2CU10M%2CU2M&format=csv&user=neeeal&header=true&time-standard=lst
Data downloaded successfully!
https://power.larc.nasa.gov/api/temporal/hourly/point?start=20240101&end=20250831&latitude=14.660309550861587&longitude

In [57]:
## strcutured
def main(times, locations, unstructured_base_path = nasa_path, structured_base_path = nasa_path):
  # Main loop
  for location in locations:
    print(location)
    unstructured_path = create_folder("Unstructured",unstructured_base_path)
    structured_path = create_folder("Structured",structured_base_path)

    unstructured_location_path = create_folder(location["name"], unstructured_path)
    structured_location_path = create_folder(location["name"], structured_path)

    for time in times:
      features_location_path = create_folder("FEATURES", unstructured_location_path)
    #   for params in batch_params:
    #       save_features(features_location_path, time, params, location)

      target_location_path = create_folder("TARGET", unstructured_location_path)
    #   save_target(target_location_path, time, 'ghi', location)
    #   save_target(target_location_path, time, 'wind_speed', location)

      print(structured_location_path)
      merge_and_save(features_location_path, {"start":20010101,"end":20251231}, structured_location_path, target_location_path)
      break

In [58]:
main(times, locations)

{'name': 'Alicia', 'latitude': 14.660309550861587, 'longitude': 121.02300946967053}
./NASA Power QC 2001-2024\Structured\Alicia
target 20010101_20081231_ghi_ALLSKY_SFC_SW_DWN.csv
target 20010101_20081231_wind_speed_WS50M.csv
target 20090101_20131231_ghi_ALLSKY_SFC_SW_DWN.csv
target 20090101_20131231_wind_speed_WS50M.csv
target 20140101_20241231_ghi_ALLSKY_SFC_SW_DWN.csv
target 20140101_20241231_wind_speed_WS50M.csv
target 20240101_20250831_ghi_ALLSKY_SFC_SW_DWN.csv
target 20240101_20250831_wind_speed_WS50M.csv
feature 20010101_20081231_ALLSKY_NKT,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_SW_DNI.csv
feature 20010101_20081231_ALLSKY_SFC_SW_DIFF,ALLSKY_SFC_UV_INDEX,ALLSKY_SFC_UVA.csv
feature 20010101_20081231_ALLSKY_SFC_UVB,AOD_55,AOD_84.csv
feature 20010101_20081231_ALLSKY_SRF_ALB,CLRSKY_KT,CLRSKY_NKT.csv
feature 20010101_20081231_CLOUD_AMT,CLOUD_OD,CLRSKY_SFC_LW_DWN.csv
feature 20010101_20081231_CLRSKY_SFC_PAR_TOT,CLRSKY_SFC_SW_DNI,CLRSKY_SRF_ALB.csv
feature 20010101_20081231_CLRSKY_SFC_SW_DIFF,CLR