# Weather Prediction 

Authors: Feifan Jiang, Liza Kostina, Judy Wu, Daniel Zou

Date: November 2024

## Objective: 
The objective of this project is to accurately predict the minimum, average, and maximum daily temperatures (in Fahrenheit) for each of 20 selected cities in the United States. On each of nine consecutive submission days, forecasts are made for five future days. These predictions aim to minimize the mean squared error and are submitted daily at noon, starting November 26, 2024, and concluding on December 4, 2024. In total, 2,700 temperature predictions will be made (3 temperature variables × 20 cities × 5 prediction days × 9 submission days). The project focuses on implementing robust forecasting techniques to achieve precision across diverse geographic locations and climates, from Anchorage to Miami.


## Project timeline

* Monday, Nov 25: Model finalized, code committed to GitHub, Docker image uploaded to Docker Hub
* Tuesday, Nov 26: Begin making daily predictions, due at noon daily
* Tuesday, Dec 3: Presentations
* Wednesday, Dec 4: Final day making predictions; report due

## Data

### Collection and Preprocessing

To utilize the strengths of publicly available datasets, we sourced data from NOAA and an additional dataset, leveraging them as follows:

* **NOAA**: Used to obtain historical weather data dating back to 1960.
* **(Second data source — name to be filled in)**: Used to acquire recent data. Although in principle not restricted to any timespan, in the presented models it was used only to analyze the final year.

Acquiring data from NOAA was straightforward after converting airport codes to NOAA database codes, using a script detailed in the appendix. The downloaded data is stored in the data/original folder. For further analysis, this data was converted to .csv format, with the processed files available in the data/processed folder.



## Models

## Conclusion 

## Appendix

### Further work 

### Code

#### NOAA 

In [None]:
import os
import urllib.request
import logging

# Maps each airport code (used for the local file names) to the NOAA station
# identifier that names the corresponding .dly file on the server.
station_code_dict = {
    "PANC": "USW00026451",  # Anchorage
    "KBOI": "USW00024131",  # Boise
    "KORD": "USW00094846",  # Chicago
    "KDEN": "USW00003017",  # Denver
    "KDTW": "USW00094847",  # Detroit
    "PHNL": "USW00022521",  # Honolulu
    "KIAH": "USW00012960",  # Houston
    "KMIA": "USW00012839",  # Miami
    "KMSP": "USW00014922",  # Minneapolis
    "KOKC": "USW00013967",  # Oklahoma City
    "KBNA": "USW00013897",  # Nashville
    "KJFK": "USW00094789",  # New York
    "KPHX": "USW00023183",  # Phoenix
    "KPWM": "USW00014764",  # Portland ME
    "KPDX": "USW00024229",  # Portland OR
    "KSLC": "USW00024127",  # Salt Lake City
    "KSAN": "USW00023188",  # San Diego
    "KSFO": "USW00023234",  # San Francisco
    "KSEA": "USW00024233",  # Seattle
    "KDCA": "USW00013743",  # Washington DC
}

# Base URL to download data from (defined once; the station id and ".dly"
# suffix are appended per station below).
data_path_url = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/all/"

# Directory to save downloaded files
original_noaa_cache = "data/original"

# Ensure the directory exists
os.makedirs(original_noaa_cache, exist_ok=True)

# Setup logging
logging.basicConfig(level=logging.INFO)

# Loop through the station codes and download the corresponding files.
# Each file is stored under the airport code rather than the NOAA station id.
for station_code, file_name in station_code_dict.items():
    url = f"{data_path_url}{file_name}.dly"
    destination = os.path.join(original_noaa_cache, f"{station_code}.dly")
    try:
        # Download the file and save it
        urllib.request.urlretrieve(url, destination)
    except Exception as e:
        # Best effort: a failure for one station is logged and the loop
        # continues with the remaining stations.
        logging.error("Failed to download data for %s: %s", station_code, e)
    else:
        logging.info("Successfully scraped data for: %s", station_code)



In [None]:
import pandas as pd
import os
import pandas as pd

# Column names of the parsed table: four identification fields followed by
# VALUE1..VALUE31, MFLAG1..MFLAG31, QFLAG1..QFLAG31 and SFLAG1..SFLAG31
# (grouped by field kind, not by day).
columns = ['ID', 'YEAR', 'MONTH', 'ELEMENT'] + [f'VALUE{i}' for i in range(1, 32)] + [f'MFLAG{i}' for i in range(1, 32)] + [f'QFLAG{i}' for i in range(1, 32)] + [f'SFLAG{i}' for i in range(1, 32)]

# Fixed character positions as zero-based, end-exclusive (start, stop) pairs,
# one per entry of `columns` and in the same order.
# Each day occupies 8 characters starting at offset 21: a 5-character value
# followed by three 1-character flags (MFLAG, QFLAG, SFLAG), which is the same
# layout parse_line slices by hand.
column_positions = (
    [
        (0, 11),   # ID (1-11)
        (11, 15),  # YEAR (12-15)
        (15, 17),  # MONTH (16-17)
        (17, 21),  # ELEMENT (18-21)
    ]
    + [(21 + i * 8, 26 + i * 8) for i in range(31)]  # VALUE1 to VALUE31
    + [(26 + i * 8, 27 + i * 8) for i in range(31)]  # MFLAG1 to MFLAG31
    + [(27 + i * 8, 28 + i * 8) for i in range(31)]  # QFLAG1 to QFLAG31
    + [(28 + i * 8, 29 + i * 8) for i in range(31)]  # SFLAG1 to SFLAG31
)


# Function to parse a single line of the file
def parse_line(line):
    """
    Split one fixed-width .dly record into a flat dictionary.

    Parameters:
        line (str): A single line read from a .dly file.

    Returns:
        dict: 'ID', 'YEAR', 'MONTH' and 'ELEMENT', followed for each day
        1..31 by 'VALUE<d>', 'MFLAG<d>', 'QFLAG<d>' and 'SFLAG<d>'.
        A VALUE is an int, or None when the field is blank or not numeric;
        flags are whitespace-stripped strings (possibly empty).

    Raises:
        ValueError: If the YEAR or MONTH field cannot be converted to int.
    """
    record = {
        'ID': line[:11].strip(),
        'YEAR': int(line[11:15].strip()),
        'MONTH': int(line[15:17].strip()),
        'ELEMENT': line[17:21].strip(),
    }

    # Daily fields start at offset 21; every day takes 8 characters:
    # 5 for the value, then one each for the M, Q and S flags.
    for day in range(1, 32):
        offset = 21 + (day - 1) * 8
        chunk = line[offset:offset + 8]

        raw_value = chunk[:5].strip()
        value = None
        if raw_value:
            try:
                value = int(raw_value)
            except ValueError:
                # Non-numeric content is treated like a blank field.
                value = None

        record[f'VALUE{day}'] = value
        record[f'MFLAG{day}'] = chunk[5:6].strip()
        record[f'QFLAG{day}'] = chunk[6:7].strip()
        record[f'SFLAG{day}'] = chunk[7:8].strip()

    return record


def convert_dly_to_dataframe(input_dir, output_dir, parse_line, file_extension="csv"):
    """
    Converts all .dly files from the input directory to DataFrames and saves them in the output directory.

    Each output file keeps the base name of its .dly source and takes
    `file_extension` as its suffix. Files in `input_dir` that do not end in
    ".dly" are ignored, and blank lines inside a .dly file are skipped.

    Parameters:
        input_dir (str): Path to the directory containing .dly files.
        output_dir (str): Path to the directory where DataFrames will be saved.
        parse_line (function): Function that parses a line in the .dly file and returns a dictionary.
        file_extension (str): Format to save the DataFrame ('csv' or 'parquet'). Defaults to 'csv'.

    Raises:
        ValueError: If `file_extension` is neither 'csv' nor 'parquet'.
    """
    # Reject unsupported formats before doing any work; otherwise nothing
    # would be written while a "Saved ..." message is still printed.
    if file_extension not in ("csv", "parquet"):
        raise ValueError(
            f"Unsupported file_extension {file_extension!r}; expected 'csv' or 'parquet'"
        )

    os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist

    # Sorted so files are processed in a reproducible order.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".dly"):
            continue

        file_path = os.path.join(input_dir, filename)

        # Read and parse the file into a list of dictionaries, ignoring
        # blank lines (e.g. a trailing empty line) that carry no record.
        with open(file_path, 'r') as f:
            records = [parse_line(line) for line in f if line.strip()]

        # Convert to DataFrame
        df = pd.DataFrame(records)

        # Save the DataFrame
        output_file_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.{file_extension}")
        if file_extension == "csv":
            df.to_csv(output_file_path, index=False)
        else:
            df.to_parquet(output_file_path, index=False)

        print(f"Saved {output_file_path}")


# Folder holding the raw .dly files, and folder receiving the converted tables.
input_dir, output_dir = 'data/original', 'data/processed'

# Convert every .dly file using the default output format.
convert_dly_to_dataframe(input_dir=input_dir, output_dir=output_dir, parse_line=parse_line)

