In [None]:
import tarfile
import pandas as pd
import io
import os
import numpy as np
import matplotlib.pyplot as plt

# Extract the 100 daily measurement series closest to the center of Austria
## Center Y (Latitude/Northing): 2750000, Center X (Longitude/Easting): 4550000

In [None]:
# Extract the daily CSV files of the gauges closest to a fixed center point.
#
# Steps: read the gauge attribute table from the archive, compute each gauge's
# Euclidean distance to the center, keep the `n_stations` nearest gauges, and
# re-save their daily CSV files (read with ';' as separator, written with the
# pandas default separator) into `output_dir`.
tar_path = "./2_LamaH-CE_daily.tar.gz"
output_dir = "./data"
os.makedirs(output_dir, exist_ok=True)

# Number of closest gauges to keep (single source of truth for the selection
# size and for the early-exit condition of the extraction loop).
n_stations = 100

# CENTER for Projected System (Meters)
center_lat_metric = 2750000.0  # Northing (Y)
center_lon_metric = 4550000.0  # Easting (X)

print(f"Targeting Center (Meters) -> Y: {center_lat_metric}, X: {center_lon_metric}")

successful_count = 0
final_metadata_subset = None

with tarfile.open(tar_path, "r:gz") as tar:
    # Only regular files can be read via extractfile(); directories and other
    # special members are skipped up front.
    file_members = [m for m in tar.getmembers() if m.isfile()]

    # 1. Find Gauge Attributes
    attr_member = next(
        (m for m in file_members if "Gauge_attributes" in m.name and m.name.endswith(".csv")),
        None,
    )
    if not attr_member:
        raise FileNotFoundError("Gauge_attributes.csv not found.")

    f_attr = tar.extractfile(attr_member)
    meta_df = pd.read_csv(io.BytesIO(f_attr.read()), sep=";", encoding="utf-8")
    meta_df.columns = [c.strip() for c in meta_df.columns]

    # 2. Identify Coordinate Columns
    # We look for columns that likely contain the projected metric values
    possible_lat = ['lat_gauge', 'y', 'Y', 'lat', 'LAT']
    possible_lon = ['lon_gauge', 'x', 'X', 'lon', 'LON']

    lat_col = next((c for c in possible_lat if c in meta_df.columns), None)
    lon_col = next((c for c in possible_lon if c in meta_df.columns), None)

    # Fail with an explicit message instead of the opaque `KeyError: None`
    # that indexing the frame with a missing column name would produce.
    if lat_col is None or lon_col is None:
        raise KeyError(
            "Could not identify coordinate columns in Gauge_attributes.csv; "
            f"available columns: {list(meta_df.columns)}"
        )

    print(f"Using columns: {lat_col} (Y), {lon_col} (X)")

    # 3. Calculate Distance in METERS
    # NOTE(review): assumes the coordinate columns share the projected metric
    # system of the center values above — confirm against the dataset docs.
    meta_df['distance'] = np.sqrt(
        (meta_df[lat_col] - center_lat_metric)**2 +
        (meta_df[lon_col] - center_lon_metric)**2
    )

    # Select the n_stations closest gauges
    target_cluster = meta_df.sort_values('distance').head(n_stations)
    final_metadata_subset = target_cluster.copy()

    # 4. Extract
    id_col = 'ID' if 'ID' in meta_df.columns else 'id'
    if id_col not in meta_df.columns:
        raise KeyError(
            "Could not identify the gauge ID column in Gauge_attributes.csv; "
            f"available columns: {list(meta_df.columns)}"
        )
    target_ids = set(target_cluster[id_col].astype(str))

    print(f"Extracting {len(target_ids)} files...")

    # IDs already written. Output files are named by basename only, so a second
    # archive member with the same basename would overwrite the first and be
    # counted twice; the first match per ID is kept instead.
    # NOTE(review): confirm whether the archive holds several "daily" folders
    # with identical ID_<n>.csv basenames and which folder is the intended one.
    extracted_ids = set()

    for member in file_members:
        # Stop as soon as every requested gauge has been written.
        if successful_count >= len(target_ids):
            break
        if "daily" not in member.name or not member.name.endswith(".csv"):
            continue

        fname = os.path.basename(member.name)
        fid = fname.replace("ID_", "").replace(".csv", "")
        if fid not in target_ids or fid in extracted_ids:
            continue

        f = tar.extractfile(member)
        if f:
            pd.read_csv(io.BytesIO(f.read()), sep=";").to_csv(os.path.join(output_dir, fname), index=False)
            extracted_ids.add(fid)
            successful_count += 1

print(f" Saved {successful_count} files to {output_dir}")

# Make a partial extraction visible instead of silently saving fewer files.
missing_ids = sorted(target_ids - extracted_ids)
if missing_ids:
    print(f"Warning: no daily file found for {len(missing_ids)} selected IDs: {missing_ids}")

## Sanity check plot for data loading

In [None]:
# SANITY CHECK PLOT
# Draws the fixed target center and the selected stations on one set of axes
# so the spatial selection can be checked by eye.
fig, ax = plt.subplots(figsize=(10, 6))

# Target center: red cross
ax.plot(
    center_lon_metric,
    center_lat_metric,
    'rx',
    markersize=15,
    markeredgewidth=3,
    label="Target Center (Fixed)",
)

# Selected stations: blue dots with black edges
station_x = final_metadata_subset[lon_col]
station_y = final_metadata_subset[lat_col]
ax.scatter(station_x, station_y, c='blue', alpha=0.6, edgecolors='k', label="Selected Stations")

ax.set_title("Geographic Distribution (Metric Coordinates)")
ax.set_xlabel(f"Easting / Longitude ({lon_col})")
ax.set_ylabel(f"Northing / Latitude ({lat_col})")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.5)
# Equal scaling on both axes so distances are not visually distorted.
ax.axis('equal')
plt.show()

### From now on, load the data only from the pickle file

| Attribute               | Description                                                                                                                |
|-------------------------|----------------------------------------------------------------------------------------------------------------------------|
| YYYY                    | Year of observation (e.g., 1981)                                                                                           |
| MM                      | Month (1 to 12)                                                                                                            |
| DD                      | Day of month (1 to 31)                                                                                                     |
| DOY                     | Day of year (1 to 365/366); allows for seasonal analysis                                                                  |
| 2m_temp_max             | Maximum air temperature at 2 meters above ground (°C)                                                                      |
| 2m_temp_mean            | Mean air temperature at 2 meters above ground (°C)                                                                         |
| 2m_temp_min             | Minimum air temperature at 2 meters above ground (°C)                                                                      |
| 2m_dp_temp_max          | Maximum dew point temperature at 2 meters (°C); temperature at which air becomes saturated                                  |
| 2m_dp_temp_mean         | Mean dew point temperature at 2 meters (°C)                                                                                 |
| 2m_dp_temp_min          | Minimum dew point temperature at 2 meters (°C)                                                                             |
| 10m_wind_u              | East-west wind speed component at 10 meters (m/s)                                                                           |
| 10m_wind_v              | North-south wind speed component at 10 meters (m/s)                                                                        |
| fcst_alb                | Forecasted surface albedo (reflectivity; unitless or percent)                                                               |
| lai_high_veg            | Leaf Area Index for high vegetation; proxy for vegetation cover (dimensionless)                                             |
| lai_low_veg             | Leaf Area Index for low vegetation (dimensionless)                                                                          |
| swe                     | Snow Water Equivalent (mm); total water content in snowpack                                                                |
| surf_net_solar_rad_max  | Maximum net solar radiation at surface (W/m²); key for surface heating and energy balance                                    |
| surf_net_solar_rad_mean | Mean net solar radiation at surface (W/m²)                                                                                  |
| surf_net_therm_rad_max  | Maximum net thermal (infrared) radiation at surface (W/m²); affects nighttime cooling                                        |
| surf_net_therm_rad_mean | Mean net thermal radiation at surface (W/m²)                                                                                |
| surf_press              | Surface atmospheric pressure (hPa)                                                                                           |
| total_et                | Total evapotranspiration (mm); water flux from land/vegetation to atmosphere                                                |
| prec                    | Precipitation (mm); daily total                                                                                             |
| volsw_123               | Volumetric soil water content in soil layers 1+2+3 (mm or %)                                                               |
| volsw_4                 | Volumetric soil water content in soil layer 4 (mm or %)                                                                     |
