In [None]:
import os
import requests
import csv
import codecs
import pandas as pd
import calendar
from datetime import datetime, timedelta

In [None]:
# Weather Data Collection Function
def fetch_weather_data(station_ids, start_date, end_date):
    """Download ASOS observations from the Iowa State mesonet and cache them as CSV.

    Rows whose ``tmpf`` field equals the missing marker ``'M'`` and rows whose
    field count differs from the header row are skipped while streaming.

    Args:
        station_ids: Iterable of station identifier strings.
        start_date: Start of the request; its year/month/day are sent as
            ``year1/month1/day1`` and it is formatted into the file name.
        end_date: End of the request; its year/month/day are sent as
            ``year2/month2/day2`` and it is formatted into the file name.

    Returns:
        The local file name ``weather_data_<start>_<end>_filtered.csv``. If that
        file already exists it is returned without contacting the server.

    Raises:
        RuntimeError: The server answered with a status code other than 200.
        ValueError: The response was empty or its header has no ``tmpf`` column.
    """
    localfn = f"weather_data_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}_filtered.csv"

    # NOTE(review): the cache key is built from the dates only, so a call with a
    # different station list but the same dates will reuse this file.
    if os.path.isfile(localfn):
        print(f"- File already exists: {localfn}")
        return localfn

    # Materialise once so generators/tuples can be joined twice below.
    station_ids = list(station_ids)
    print(f"+ Downloading data for stations {', '.join(station_ids)} from {start_date} to {end_date}")

    stations_str = ",".join(station_ids)
    uri = (
        "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
        f"station={stations_str}&"
        "data=tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,"
        "skyc1,skyc2,skyc3,skyc4,skyl1,skyl2,skyl3,skyl4,wxcodes,"
        "ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,"
        "peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,snowdepth&"
        f"year1={start_date.year}&month1={start_date.month}&day1={start_date.day}&"
        f"year2={end_date.year}&month2={end_date.month}&day2={end_date.day}&"
        "tz=UTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=yes"
    )

    # Stream into a side file and rename it only after the whole response has
    # been consumed; otherwise an interrupted download would leave a truncated
    # file that the cache check above would accept on the next call.
    partial_fn = f"{localfn}.part"
    try:
        with requests.get(uri, stream=True, timeout=300) as res:
            if res.status_code != 200:
                raise RuntimeError(f"Failed to download data: {res.status_code}")

            reader = csv.reader(codecs.iterdecode(res.iter_lines(), 'utf-8'))
            headers = next(reader, None)  # None when the body is empty
            if not headers or 'tmpf' not in headers:
                raise ValueError("Error: 'tmpf' column not found.")
            tmpf_index = headers.index('tmpf')

            row_count = 0
            with open(partial_fn, "w", newline='', encoding="utf-8") as fh:
                writer = csv.writer(fh)
                writer.writerow(headers)
                for row in reader:
                    # Skip malformed rows and rows with missing temperature.
                    if len(row) == len(headers) and row[tmpf_index] != 'M':
                        writer.writerow(row)
                        row_count += 1

        os.replace(partial_fn, localfn)
    finally:
        # Only present here if something above failed before the rename.
        if os.path.exists(partial_fn):
            os.remove(partial_fn)

    print(f"+ Data saved to {localfn} with {row_count} rows (excluding missing temperatures).")
    return localfn


In [None]:
# Weather Data Cleaning Function
def clean_weather_data(file_path):
    """Clean a filtered weather CSV and write the result next to it.

    Steps, in order:
      1. Read the CSV, treating the ``'M'`` marker as missing.
      2. Replace the ``'T'`` marker in ``p01i`` with 0.005 and parse ``valid``
         as datetimes.
      3. Coerce the numeric weather columns (including ``feel``) to numbers.
      4. Sort by station and time, then linearly interpolate selected columns
         within each station.
      5. Forward/backward fill ``mslp``, ``skyl1``, ``feel`` and ``skyc1``
         within each station; any value still missing afterwards (a station
         with no value at all for that column) is filled from neighbouring
         rows of the whole frame.
      6. Drop the columns that are not used downstream and save.

    Args:
        file_path: Path of the CSV to clean. Occurrences of ``"filtered"`` in
            it are replaced by ``"cleaned"`` to build the output path.

    Returns:
        Path of the cleaned CSV file.
    """
    # Parse the missing marker at read time; low_memory=False makes pandas
    # infer each column's dtype from the whole file rather than per chunk.
    df = pd.read_csv(file_path, na_values=['M'], low_memory=False)

    df['p01i'] = df['p01i'].replace('T', 0.005)
    df['valid'] = pd.to_datetime(df['valid'])
    # NOTE(review): the previous `skyl1.replace('NaN', 0)` matched only the
    # literal string 'NaN', which never occurs after parsing, so it did nothing.
    # Missing skyl1 values are handled by the interpolation/fill steps below.

    # Convert weather columns to numeric types. 'feel' is included because it
    # is interpolated below and interpolation needs a numeric dtype.
    weather_features = ['tmpf', 'dwpf', 'relh', 'drct', 'sknt', 'p01i', 'alti',
                        'mslp', 'vsby', 'gust', 'skyl1', 'feel']
    for feature in weather_features:
        df[feature] = pd.to_numeric(df[feature], errors='coerce')

    df = df.sort_values(by=['station', 'valid']).reset_index(drop=True)
    by_station = df.groupby('station')

    # Linear interpolation, applied to each station's rows separately.
    columns_to_interpolate = ['mslp', 'dwpf', 'relh', 'drct', 'sknt', 'p01i', 'vsby', 'skyl1', 'feel']
    for col in columns_to_interpolate:
        df[col] = by_station[col].transform(lambda group: group.interpolate(method='linear'))

    # Fill what interpolation could not reach (e.g. leading gaps) from the same
    # station first, so one station's values do not bleed into the next one at
    # the group boundary.
    columns_to_fill = ['mslp', 'skyl1', 'feel', 'skyc1']
    for col in columns_to_fill:
        df[col] = df.groupby('station')[col].transform(lambda group: group.ffill().bfill())

    # Last resort for stations with no value at all in a column: fall back to
    # the frame-wide fill so these columns stay free of NaN as before.
    df[columns_to_fill] = df[columns_to_fill].ffill().bfill()

    columns_to_drop = ['snowdepth', 'wxcodes', 'skyl2', 'skyl3', 'skyl4', 'skyc2', 'skyc3', 'skyc4',
                       'gust', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr',
                       'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'metar']
    df = df.drop(columns=columns_to_drop)

    # Save cleaned data
    cleaned_file = file_path.replace("filtered", "cleaned")
    df.to_csv(cleaned_file, index=False)
    print(f"Cleaned data saved to {cleaned_file}")

    return cleaned_file

In [None]:
def resample_to_15min(df):
    """Resample each station's observations onto a 15-minute grid.

    For every station a grid is built from its first to its last ``valid``
    timestamp in 15-minute steps. Numeric columns are interpolated in time
    using all of the station's observations (not only those that happen to
    fall on the grid), categorical columns carry the most recent observation
    forward, and any remaining gaps are forward/backward filled. Several
    columns are then rounded and the grid timestamps are rounded to the
    nearest quarter hour.

    Args:
        df: DataFrame with a ``station`` column, a ``valid`` column that
            ``pd.to_datetime`` can parse, the numeric columns listed in
            ``numeric_cols`` below and ``skyc1``. It is not modified.

    Returns:
        A new DataFrame whose first column ``resample15`` is the grid
        timestamp, followed by the input columns, with one row per
        (station, resample15) pair.
    """
    numeric_cols = ['tmpf', 'alti', 'mslp', 'dwpf', 'relh', 'drct', 'sknt', 'p01i', 'vsby', 'skyl1', 'feel']
    categorical_cols = ['skyc1', 'valid']

    # Work on a copy so the caller's frame keeps its columns and index.
    indexed = df.copy()
    indexed['resample15'] = pd.to_datetime(indexed['valid'])
    indexed = indexed.set_index('resample15')

    station_frames = []
    for station, station_data in indexed.groupby('station'):
        # Stable sort + keep='last' so that, for repeated timestamps, the row
        # that came last in the input wins; reindex() rejects duplicate labels.
        station_data = station_data.sort_index(kind='mergesort')
        station_data = station_data[~station_data.index.duplicated(keep='last')]

        grid = pd.date_range(start=station_data.index.min(),
                             end=station_data.index.max(),
                             freq='15min')

        # Keep the original observation times alongside the grid while
        # filling, so off-grid observations still inform the grid values.
        combined = station_data.reindex(station_data.index.union(grid))
        combined['station'] = station

        combined[numeric_cols] = (
            combined[numeric_cols]
            .apply(pd.to_numeric, errors='coerce')
            .interpolate(method='time')  # weights by actual time gaps
            .ffill()
            .bfill()
        )
        combined[categorical_cols] = combined[categorical_cols].ffill().bfill()

        station_frames.append(combined.loc[grid])

    if not station_frames:
        # No stations at all: return an empty frame with the expected columns.
        return indexed.iloc[0:0].reset_index()

    full_df = pd.concat(station_frames)

    # Rounding is done once on the combined frame.
    full_df['mslp'] = full_df['mslp'].round(1)
    full_df['feel'] = full_df['feel'].round(2)
    full_df['relh'] = full_df['relh'].round(2)
    # NOTE(review): 'alti' is rounded to a whole number, as before — confirm
    # that this precision is intended for the unit in use.
    full_df['alti'] = full_df['alti'].round()
    full_df['sknt'] = full_df['sknt'].round()
    # NOTE(review): 'drct' is interpolated like any other number above, with no
    # special handling at the 360/0 boundary — confirm this is acceptable.
    full_df['drct'] = (full_df['drct'] / 10).round() * 10

    # Nearest 100, vectorised so a NaN does not raise; stays integer when
    # every value is present.
    skyl1_rounded = (full_df['skyl1'] / 100).round() * 100
    if skyl1_rounded.notna().all():
        skyl1_rounded = skyl1_rounded.astype('int64')
    full_df['skyl1'] = skyl1_rounded

    # Bring the grid timestamp back as the leading 'resample15' column.
    full_df = full_df.rename_axis('resample15').reset_index()
    full_df['resample15'] = full_df['resample15'].dt.round('15min')

    # Rounding can map two grid points to the same quarter hour; keep the first.
    full_df = full_df.drop_duplicates(subset=['station', 'resample15']).reset_index(drop=True)

    return full_df


In [None]:
def main(station_ids=None, year=2024, month=1):
    """Run the download -> clean -> 15-minute resample pipeline for one month.

    Args:
        station_ids: Station identifiers to request. Defaults to the built-in
            sample list below.
        year: Calendar year of the month to process (default 2024).
        month: Calendar month, 1-12 (default 1).

    The resampled result is written to the filtered file's path with
    ``"filtered"`` replaced by ``"cleaned_15min"``.
    """
    if station_ids is None:
        # Sample station IDs, grouped by state.
        station_ids = [
            # Colorado
            'DEN', 'EGE', 'GUC', 'MTJ', 'COS', 'HDN', 'GJT', 'DRO', 'ASE',
            # New York
            'JFK', 'LGA', 'SYR', 'BUF', 'ALB', 'ROC', 'ITH', 'HPN', 'BGM',
            'ISP', 'IAG', 'PBG', 'ELM', 'SWF',
            # California
            'ACV', 'BFL', 'BIH', 'BUR', 'FAT', 'LAX', 'LGB', 'MRY', 'OAK', 'ONT', 'PSP', 'RDD',
            'SAN', 'SBA', 'SBP', 'SCK', 'SFO', 'SJC', 'SMF', 'SMX', 'SNA', 'STS',
            # Florida
            'DAB', 'ECP', 'EYW', 'FLL', 'GNV', 'JAX', 'MCO', 'MIA', 'MLB', 'PBI', 'PGD', 'PIE',
            'PNS', 'RSW', 'SFB', 'SRQ', 'TLH', 'TPA', 'VPS',
            # Texas
            'ABI', 'ACT', 'AMA', 'AUS', 'BPT', 'BRO', 'CLL', 'CRP', 'DAL', 'DFW', 'ELP', 'GGG',
            'GRK', 'HOU', 'HRL', 'IAH', 'LBB', 'LRD', 'MAF', 'MFE', 'SAT', 'SJT', 'SPS', 'TYR', 'VCT',
            # Illinois
            'BLV', 'BMI', 'CMI', 'DEC', 'MDW', 'MLI', 'ORD', 'PIA', 'RFD', 'SPI',
            # Georgia
            'ABY', 'AGS', 'ATL', 'BQK', 'CSG', 'SAV', 'VLD',
            # New Jersey
            'ACY', 'EWR', 'TTN',
            # Maryland
            'BWI', 'HGR',
            # Nevada
            'EKO', 'LAS', 'RNO',
        ]

    start_date = datetime(year, month, 1)
    _, num_days = calendar.monthrange(year, month)
    # Adding the month's length to its first day gives the first day of the
    # following month, which is used as the end of the request.
    end_date = start_date + timedelta(days=num_days)

    # Step 1: Collect weather data
    filtered_path = fetch_weather_data(station_ids, start_date, end_date)

    # Step 2: Clean the collected weather data (returns the cleaned file's path)
    cleaned_path = clean_weather_data(filtered_path)
    cleaned_df = pd.read_csv(cleaned_path)

    # Step 3: Resample to 15-minute intervals and interpolate
    resampled_df = resample_to_15min(cleaned_df)

    # Step 4: Save the final DataFrame to a CSV file
    resampled_path = filtered_path.replace("filtered", "cleaned_15min")
    resampled_df.to_csv(resampled_path, index=False)
    print(f"Cleaned and resampled data saved to {resampled_path}")

In [None]:
# Entry point: run the whole pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

- File already exists: weather_data_20240101_20240201_filtered.csv


  df = pd.read_csv(file_path)
  df[col] = df.groupby('station')[col].transform(lambda group: group.interpolate(method='linear'))


Cleaned data saved to weather_data_20240101_20240201_cleaned.csv
Cleaned and resampled data saved to weather_data_20240101_20240201_cleaned_15min.csv
