In [1]:
import os
import urllib.request
import logging

In [2]:
station_code_dict = {
    "PANC": "USW00026451", # Anchorage 
    "KBOI": "USW00024131", # Boise  
    "KORD": "USW00094846", # Chicago
    "KDEN": "USW00003017", # Denver 
    "KDTW": "USW00094847", # Detroit
    "PHNL": "USW00022521", # Honolulu 
    "KIAH": "USW00012960", # Houston
    "KMIA": "USW00012839", # Miami 
    "KMSP": "USW00014922", # Minneapolis 
    "KOKC": "USW00013967", # Oklahoma City 
    "KBNA": "USW00013897", # Nashville 
    "KJFK": "USW00094789", # New York 
    "KPHX": "USW00023183", # Phoenix 
    "KPWM": "USW00014764", # Portland ME
    "KPDX": "USW00024229", # Portland OR 
    "KSLC": "USW00024127", # Salt Lake City
    "KSAN": "USW00023188", # San Diego 
    "KSFO": "USW00023234", # San Francisco 
    "KSEA": "USW00024233", # Seattle 
    "KDCA": "USW00013743", # Washington DC
}

In [3]:
data_path_url = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/all/"

# Directory to save downloaded files
raw_noaa_cache = "data/noaa"

# Ensure the directory exists
os.makedirs(raw_noaa_cache, exist_ok=True)

# URL to download data from
data_path_url = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/all/"

# Setup logging
logging.basicConfig(level=logging.INFO)

# Loop through the station codes and download the corresponding files
for station_code, file_name in station_code_dict.items():
    url = f"{data_path_url}{file_name}.dly"
    try:
        # Download the file and save it
        urllib.request.urlretrieve(url, os.path.join(raw_noaa_cache, f"{station_code}.dly"))
        logging.info(f"Successfully scraped data for: {station_code}")
    except Exception as e:
        logging.error(f"Failed to download data for {station_code}: {e}")

INFO:root:Successfully scraped data for: PANC
INFO:root:Successfully scraped data for: KBOI
INFO:root:Successfully scraped data for: KORD
INFO:root:Successfully scraped data for: KDEN
INFO:root:Successfully scraped data for: KDTW
INFO:root:Successfully scraped data for: PHNL
INFO:root:Successfully scraped data for: KIAH
INFO:root:Successfully scraped data for: KMIA
INFO:root:Successfully scraped data for: KMSP
INFO:root:Successfully scraped data for: KOKC
INFO:root:Successfully scraped data for: KBNA
INFO:root:Successfully scraped data for: KJFK
INFO:root:Successfully scraped data for: KPHX
INFO:root:Successfully scraped data for: KPWM
INFO:root:Successfully scraped data for: KPDX
INFO:root:Successfully scraped data for: KSLC
INFO:root:Successfully scraped data for: KSAN
INFO:root:Successfully scraped data for: KSFO
INFO:root:Successfully scraped data for: KSEA
INFO:root:Successfully scraped data for: KDCA


In [6]:
import os
import logging
from data import download_noaa  
import predictor.utils as utils
import pandas as pd


# Ensure the logger is configured
logging.basicConfig(level=logging.INFO)

def read_noaa_data_file(filename, variables=None, include_flags=False, dropna='all'):
    """Reads in all data from a GHCN .dly data file

    :param filename: path to file
    :param variables: list of variables to include in output dataframe
        e.g. ['TMAX', 'TMIN', 'PRCP']
    :param include_flags: Whether to include data quality flags in the final output
    :returns: Pandas dataframe
    """

    df = pd.read_fwf(
        filename,
        colspecs=data_header_col_specs + data_col_specs,
        names=data_header_names + data_col_names,
        index_col=data_header_names,
        dtype=data_header_dtypes
        )

    if variables is not None:
        df = df[df.index.get_level_values('ELEMENT').isin(variables)]

    df.columns = data_replacement_col_names

    if not include_flags:
        df = df.loc[:, ('VALUE', slice(None))]
        df.columns = df.columns.droplevel('VAR_TYPE')

    df = df.stack(level='DAY').unstack(level='ELEMENT')

    if dropna:
        df.replace(-9999.0, pd.np.nan, inplace=True)
        df.dropna(how=dropna, inplace=True)

    # replace the entire index with the date.
    # This loses the station ID index column!
    # This will usuall fail if dropna=False, since months with <31 days
    # still have day=31 columns
    df.index = pd.to_datetime(
        df.index.get_level_values('YEAR') * 10000 +
        df.index.get_level_values('MONTH') * 100 +
        df.index.get_level_values('DAY'),
        format='%Y%m%d')

    return df

In [17]:
cd Weather-Forecast

/Users/kostina/Weather-Forecast


In [20]:
import pandas as pd

# Define the column names and their respective column positions
columns = ['ID', 'YEAR', 'MONTH', 'ELEMENT'] + [f'VALUE{i}' for i in range(1, 32)] + [f'MFLAG{i}' for i in range(1, 32)] + [f'QFLAG{i}' for i in range(1, 32)] + [f'SFLAG{i}' for i in range(1, 32)]

# Define the fixed column positions for each variable
column_positions = [
    (0, 11),  # ID (1-11)
    (11, 15),  # YEAR (12-15)
    (15, 17),  # MONTH (16-17)
    (17, 21)  # ELEMENT (18-21)
] + [(i*5+21, i*5+26) for i in range(31)] * 3  # VALUE1 to VALUE31, MFLAG1 to MFLAG31, QFLAG1 to QFLAG31, SFLAG1 to SFLAG31

# Function to parse a single line of the file
def parse_line(line):
    data = {}
    
    # Extract the values for each column based on their positions
    data['ID'] = line[0:11].strip()
    data['YEAR'] = int(line[11:15].strip())
    data['MONTH'] = int(line[15:17].strip())
    data['ELEMENT'] = line[17:21].strip()

    # Extract VALUE, MFLAG, QFLAG, SFLAG columns
    for i in range(31):
        value_str = line[21 + i*5:26 + i*5].strip()
        
        # Try to convert the value to an integer, but handle non-numeric values
        try:
            data[f'VALUE{i+1}'] = int(value_str) if value_str else None  # Assign None if the value is empty
        except ValueError:
            data[f'VALUE{i+1}'] = None  # If there's a conversion error, assign None
        
        # Extract MFLAG, QFLAG, SFLAG (which should be single characters)
        data[f'MFLAG{i+1}'] = line[26 + i*5:27 + i*5].strip()
        data[f'QFLAG{i+1}'] = line[27 + i*5:28 + i*5].strip()
        data[f'SFLAG{i+1}'] = line[28 + i*5:29 + i*5].strip()
    
    return data

# Read the file and parse all lines into a list of dictionaries
file_path = 'data/noaa/KBNA.dly'  # Replace with your .dly file path
records = []
with open(file_path, 'r') as f:
    for line in f:
        record = parse_line(line)
        records.append(record)

# Convert the list of records to a DataFrame
df = pd.DataFrame(records)

# Show the first few rows of the DataFrame
print(df.head())



            ID  YEAR  MONTH ELEMENT  VALUE1 MFLAG1 QFLAG1 SFLAG1  VALUE2  \
0  USW00013897  1948      1    TMAX     183                    0     0.0   
1  USW00013897  1948      1    TMIN     -11                    0     0.0   
2  USW00013897  1948      1    PRCP     678                    0     0.0   
3  USW00013897  1948      1    SNOW       0      T             X     NaN   
4  USW00013897  1948      1    SNWD       0                    X     NaN   

  MFLAG2  ... QFLAG29 SFLAG29  VALUE30 MFLAG30 QFLAG30 SFLAG30  VALUE31  \
0         ...               2     22.0               0              0.0   
1      -  ...       1       7   -178.0               0              NaN   
2         ...                      0.0               0              0.0   
3         ...                      0.0               X              NaN   
4         ...       1       0    102.0               X              NaN   

  MFLAG31 QFLAG31 SFLAG31  
0       7       8          
1       5       6          
2       

In [22]:
df

Unnamed: 0,ID,YEAR,MONTH,ELEMENT,VALUE1,MFLAG1,QFLAG1,SFLAG1,VALUE2,MFLAG2,...,QFLAG29,SFLAG29,VALUE30,MFLAG30,QFLAG30,SFLAG30,VALUE31,MFLAG31,QFLAG31,SFLAG31
0,USW00013897,1948,1,TMAX,183,,,0,0.0,,...,,2,22.0,,0,,0.0,7,8,
1,USW00013897,1948,1,TMIN,-11,,,0,0.0,-,...,1,7,-178.0,,0,,,5,6,
2,USW00013897,1948,1,PRCP,678,,,0,0.0,,...,,,0.0,,0,,0.0,,0,T
3,USW00013897,1948,1,SNOW,0,T,,X,,,...,,,0.0,,X,,,,0,T
4,USW00013897,1948,1,SNWD,0,,,X,,,...,1,0,102.0,,X,,,5,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17420,USW00013897,2024,11,WSF2,98,,,W,,,...,9,9,9999.0,,,-,-99.0,9,9,
17421,USW00013897,2024,11,WSF5,125,,,W,,,...,9,9,9999.0,,,-,-99.0,9,9,
17422,USW00013897,2024,11,WT01,1,,,W,,9,...,9,9,9999.0,,,-,-99.0,9,9,
17423,USW00013897,2024,11,WT03,-9999,,,,-9.0,9,...,9,9,9999.0,,,-,-99.0,9,9,


In [27]:
df.columns

Index(['ID', 'YEAR', 'MONTH', 'ELEMENT', 'VALUE1', 'MFLAG1', 'QFLAG1',
       'SFLAG1', 'VALUE2', 'MFLAG2',
       ...
       'QFLAG29', 'SFLAG29', 'VALUE30', 'MFLAG30', 'QFLAG30', 'SFLAG30',
       'VALUE31', 'MFLAG31', 'QFLAG31', 'SFLAG31'],
      dtype='object', length=128)

In [29]:
for column in df.columns:
    print(column, df[column].isnull().sum())
    

ID 0
YEAR 0
MONTH 0
ELEMENT 0
VALUE1 0
MFLAG1 0
QFLAG1 0
SFLAG1 0
VALUE2 10157
MFLAG2 0
QFLAG2 0
SFLAG2 0
VALUE3 724
MFLAG3 0
QFLAG3 0
SFLAG3 0
VALUE4 11704
MFLAG4 0
QFLAG4 0
SFLAG4 0
VALUE5 16791
MFLAG5 0
QFLAG5 0
SFLAG5 0
VALUE6 735
MFLAG6 0
QFLAG6 0
SFLAG6 0
VALUE7 11209
MFLAG7 0
QFLAG7 0
SFLAG7 0
VALUE8 13328
MFLAG8 0
QFLAG8 0
SFLAG8 0
VALUE9 0
MFLAG9 0
QFLAG9 0
SFLAG9 0
VALUE10 10179
MFLAG10 0
QFLAG10 0
SFLAG10 0
VALUE11 753
MFLAG11 0
QFLAG11 0
SFLAG11 0
VALUE12 11655
MFLAG12 0
QFLAG12 0
SFLAG12 0
VALUE13 16720
MFLAG13 0
QFLAG13 0
SFLAG13 0
VALUE14 732
MFLAG14 0
QFLAG14 0
SFLAG14 0
VALUE15 11231
MFLAG15 0
QFLAG15 0
SFLAG15 0
VALUE16 13435
MFLAG16 0
QFLAG16 0
SFLAG16 0
VALUE17 0
MFLAG17 0
QFLAG17 0
SFLAG17 0
VALUE18 10153
MFLAG18 0
QFLAG18 0
SFLAG18 0
VALUE19 751
MFLAG19 0
QFLAG19 0
SFLAG19 0
VALUE20 11700
MFLAG20 0
QFLAG20 0
SFLAG20 0
VALUE21 16795
MFLAG21 0
QFLAG21 0
SFLAG21 0
VALUE22 758
MFLAG22 0
QFLAG22 0
SFLAG22 0
VALUE23 11261
MFLAG23 0
QFLAG23 0
SFLAG23 0
VALUE24 13362
MFLA

In [None]:
def process_noaa(station):
    try:
        # Construct the path for the raw NOAA data file
        noaa_path = os.path.join(utils.raw_noaa_cache, f"{station}.dly")
        
        # Read the data from the file
        noaa_data = read_noaa_data_file(noaa_path)
        
        # Create the directory for processed data if it doesn't exist
        os.makedirs(utils.processed_noaa_cache, exist_ok=True)
        
        # Construct the output path for the processed data
        noaa_out_path = os.path.join(utils.processed_noaa_cache, f"{station}.csv")
        
        # Save the processed data to a CSV file
        noaa_data.to_csv(noaa_out_path)
        
        # Log the successful processing
        logging.info(f"Processed data for station: {station}")
    
    except Exception as e:
        # Log any errors encountered
        logging.error(f"Failed to process data for {station}: {e}")

# Iterate over the list of stations in utils.stations
for station in utils.stations_list:
    process_noaa(station)
