In [83]:
import os
import glob
import pandas as pd

# Year of observations to process; also the name of the folder holding the raw station files.
year = 2022

# Every CSV inside the '<year>/' folder whose name starts with 'station'.
# glob returns paths in arbitrary, OS-dependent order, so sort them to make
# the row order of the concatenated season files reproducible across runs.
data_files = sorted(glob.glob(f'{year}/station*.csv'))

# Define a function to process a season's data
def process_season(data_files, season_name, months):
    """Collect the rows of every station file that fall in the given months.

    Parameters
    ----------
    data_files : iterable of str or file-like
        CSV sources accepted by ``pd.read_csv``. Each must contain a ``DATE``
        column as well as the ``VIS`` and ``HEIGHT_AGL`` columns, which are
        dropped from the result.
    season_name : str
        Label of the season. Not used by the computation; kept in the
        signature so existing calls keep working.
    months : list of int
        Calendar month numbers to keep (e.g. ``[3, 4, 5]``).

    Returns
    -------
    pandas.DataFrame
        The in-season rows of all files stacked in input order, each keeping
        its original row index. An empty DataFrame when ``data_files`` is
        empty.

    Raises
    ------
    KeyError
        If a file lacks the ``VIS`` or ``HEIGHT_AGL`` column.
    """
    # Gather the per-file pieces in a list and concatenate once at the end;
    # calling pd.concat inside the loop re-copies all accumulated rows on
    # every iteration.
    season_frames = []

    for file in data_files:
        # Parse 'DATE' as datetime so the month can be read via the .dt accessor
        df = pd.read_csv(file, parse_dates=['DATE'])
        in_season = df['DATE'].dt.month.isin(months)

        # Keep only the season's rows and discard the columns that are not needed
        season_frames.append(df.loc[in_season].drop(columns=['VIS', 'HEIGHT_AGL']))

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not season_frames:
        return pd.DataFrame()

    return pd.concat(season_frames)

# Season name -> calendar months belonging to that season
seasons = {
    'Spring': [3, 4, 5],  # March, April, May
    'Summer': [6, 7, 8]   # June, July, August
}

# Build one combined CSV per season in the working directory.
# The file name uses `year` (the same variable that selects the input folder)
# rather than a hard-coded literal, so changing the year in one place keeps
# inputs and outputs consistent.
for season_name, months in seasons.items():
    all_data = process_season(data_files, season_name, months)
    all_data.to_csv(f"{season_name}_{year}_atmospheric.csv", index=False)


In [84]:
import pandas as pd
from io import StringIO
import glob
import os

# List all the CSV files in a folder (you may need to change the file path)
file_path = "./*.csv"
csv_files = glob.glob(file_path)

# Process each CSV file separately
for file in csv_files:
    with open(file, 'r') as f:
        data = f.read()

    # Read CSV data into a DataFrame
    df = pd.read_csv(StringIO(data), parse_dates=['DATE'])
    
    # Modify DATE values to start from 00:00:00 and end in 00:23:59
    df['DATE'] = df['DATE'].dt.floor('H')
    
    # Group by DATE, LATITUDE, and LONGITUDE, and calculate the mean of the corresponding values
    result = df.groupby(['DATE', 'LATITUDE', 'LONGITUDE']).mean().reset_index()
    
    # Extract the filename without extension from the full path
    filename = os.path.splitext(os.path.basename(file))[0]
    
    # Save the result to a new CSV file with the filename as part of the output path
    output_file_path = f"{filename}_atmospheric_clean.csv"
    result.to_csv(output_file_path, index=False)
    
    print(f"Results for {file} saved to {output_file_path}")
    # Optionally, you can remove the original file after saving the result
    os.remove(file)


Results for .\Spring_2022_atmospheric.csv saved to Spring_2022_atmospheric_atmospheric_clean.csv
Results for .\Summer_2022_atmospheric.csv saved to Summer_2022_atmospheric_atmospheric_clean.csv


# Address missing values

In [85]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
import glob
import os

# Columns in which a sentinel value stands in for a missing measurement
suspicious_columns = ['TMP', 'DEW', 'SLP', 'DIR', 'SPD']

# Sentinel value to treat as missing, per column
unusually_large_values = {'TMP': 999.9, 'DEW': 999.9, 'SLP': 99999.0, 'DIR': 999.0, 'SPD': 9999.0}

# A single imputer instance is re-fitted for every column of every file
imputer = IterativeImputer()

# Every CSV file in the working directory is imputed and overwritten in place
csv_files = glob.glob('./*.csv')

for file in csv_files:
    # Load the data
    df = pd.read_csv(file)

    # Convert 'DATE' column to datetime format
    df['DATE'] = pd.to_datetime(df['DATE'])

    # Turn the sentinel values into NaN. Assign the result back instead of
    # calling replace(..., inplace=True) on df[column]: the chained in-place
    # form operates on a temporary under pandas Copy-on-Write and would leave
    # df unchanged.
    for column in suspicious_columns:
        df[column] = df[column].replace(unusually_large_values[column], np.nan)

    # Wind directions outside 0-360 are treated as missing as well
    df['DIR'] = df['DIR'].where(df['DIR'].between(0, 360))

    # Impute each column on its own.
    # NOTE(review): fitted on a single feature, IterativeImputer has no other
    # column to regress on, so this amounts to filling with its initial
    # (mean) estimate - confirm whether joint imputation across the columns
    # was intended.
    for column in suspicious_columns:
        # A column without any observed value gives the imputer nothing to
        # fit (it drops such features), so leave it as NaN rather than fail.
        if df[column].notna().any():
            df[[column]] = imputer.fit_transform(df[[column]])

    # Keep imputed wind directions inside the valid 0-360 range
    df['DIR'] = df['DIR'].clip(lower=0, upper=360)

    # Write the imputed data back over the input file (same name, working directory)
    df.to_csv(os.path.join('.', os.path.basename(file)), index=False)