In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Load the station observations and parse the timestamp column so that the
# year and month of each observation can be compared later on.
data = pd.read_csv("Data/weather_station_data_updated.csv")
data["time"] = pd.to_datetime(data["time"])

# Measurement columns whose missing values are filled from nearby stations.
columns_to_average = ["tavg", "tmin", "tmax", "prcp", "wspd", "pres", "elevation"]

# Maximum great-circle distance, in kilometres, at which another observation
# still counts as "nearby".  240 km is about 150 miles (100 miles would be
# ~161 km).
distance_threshold = 240
# Scale factor from haversine distances (radians on the unit sphere) to km.
# NOTE(review): 6378 km is the equatorial radius; the mean Earth radius
# (~6371 km) is the more common choice for haversine and differs by ~0.1 %.
EARTH_RADIUS_KM = 6378.0

# scikit-learn's haversine metric expects [latitude, longitude] in radians.
data['latitude_rad'] = np.radians(data['latitude'])
data['longitude_rad'] = np.radians(data['longitude'])

# Not fitted here: neighbour searches below always run against a model fitted
# on same-month rows only, so a fit over the full dataset would build a tree
# that is never queried.
nn = NearestNeighbors(metric='haversine')

# ---------------------------------------------------------------------------
# Fill missing measurements from nearby observations of the same month.
#
# For every row that has at least one missing value in `columns_to_average`,
# look up the closest other rows recorded in the same calendar month (same
# year), keep those within `distance_threshold` km, and fill each missing
# column with the mean (rounded to 2 decimals) of the three nearest non-null
# values.  A column stays NaN when fewer than three such values exist.
#
# Design notes:
# * The distance limit is applied per neighbour.  Testing only the farthest
#   of up to 50 candidates would reject a row as soon as a single distant
#   candidate is returned, even though only the three nearest are averaged.
# * One neighbour model is fitted per month and reused, and rows with nothing
#   missing are skipped, instead of re-filtering the whole frame and
#   refitting a model for every row.
# * All row access is positional, so the result does not depend on `data`
#   having a default RangeIndex.
# * Values are read from `data` at query time, so cells filled earlier in
#   this pass can contribute to later averages.
#   NOTE(review): confirm that averaging over already-imputed values is wanted.
# ---------------------------------------------------------------------------
max_candidates = 50      # neighbours fetched per query, excluding the row itself
values_per_average = 3   # non-null neighbour values required and averaged

coords = data[["latitude_rad", "longitude_rad"]].to_numpy()
years = data["time"].dt.year.to_numpy()
months = data["time"].dt.month.to_numpy()
time_missing = data["time"].isna().to_numpy()
missing_cells = data[columns_to_average].isna().to_numpy()
column_positions = [data.columns.get_loc(col) for col in columns_to_average]

# (year, month) -> (row positions of that month, fitted model or None)
month_models = {}

for index, row_missing in enumerate(missing_cells):
    if index % 1000 == 0:
        print(f"Working with index: {index}")

    # Nothing to fill, or no month to match against.
    if time_missing[index] or not row_missing.any():
        continue

    month_key = (int(years[index]), int(months[index]))
    if month_key not in month_models:
        member_rows = np.flatnonzero(
            (years == years[index]) & (months == months[index])
        )
        month_model = None
        if len(member_rows) > 1:  # need at least one row besides this one
            month_model = NearestNeighbors(metric="haversine").fit(
                coords[member_rows]
            )
        month_models[month_key] = (member_rows, month_model)

    member_rows, month_model = month_models[month_key]
    if month_model is None:
        continue

    # The model contains the query row itself, so ask for one extra
    # neighbour and discard the row afterwards.
    distances, indices = month_model.kneighbors(
        coords[index].reshape(1, -1),
        n_neighbors=min(max_candidates + 1, len(member_rows)),
        return_distance=True,
    )
    candidate_rows = member_rows[indices[0]]
    distances_km = distances[0] * EARTH_RADIUS_KM

    # kneighbors returns candidates ordered by increasing distance, so the
    # filtered array is still nearest-first.
    usable = (candidate_rows != index) & (distances_km <= distance_threshold)
    neighbor_rows = candidate_rows[usable][:max_candidates]

    for col_pos, is_missing in zip(column_positions, row_missing):
        if not is_missing:
            continue
        neighbor_values = data.iloc[neighbor_rows, col_pos].dropna()
        if len(neighbor_values) >= values_per_average:
            data.iat[index, col_pos] = round(
                neighbor_values.head(values_per_average).mean(), 2
            )

updated_data = data.copy()
#updated_data.to_csv("Data/updated_weather_station_data.csv", index=False)


Working with index: 0
Working with index: 1000
Working with index: 2000
Working with index: 3000
Working with index: 4000
