## Cleaning Weather Dataset 
### Keep the wind, temperature, and visibility values as the final dataset to merge

In [5]:
import pandas as pd
import re

# Load weather data
df_weather = pd.read_csv("./data/weather_laguardia_airport_2024.csv")

# Select relevant columns for cleaned weather CSV.
# .copy() makes this an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning or write into a temporary selection.
df_weather = df_weather[['DATE', 'TMP', 'WND', 'REM', 'VIS']].copy()

# Extract date and hour for merging
df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
df_weather['date'] = df_weather['DATE'].dt.date
df_weather['hour'] = df_weather['DATE'].dt.hour

# Parse temperature from TMP
def parse_temperature(tmp_val):
    """Convert a raw TMP field into degrees Celsius.

    The field is expected to look like ``'+0020,1'``: a signed integer in
    tenths of a degree, then a comma and further flags that are ignored here.

    Args:
        tmp_val: Raw TMP value; it is stringified before parsing, so NaN and
            other non-string inputs are tolerated.

    Returns:
        The temperature as a float, or None when the leading part is not an
        integer (e.g. NaN) or is the missing-value sentinel.
    """
    temp_str = str(tmp_val).split(',')[0]  # '+0020'
    try:
        tenths = int(temp_str)
    except ValueError:
        return None
    # +9999 is the "missing" sentinel in the NOAA ISD encoding this column
    # appears to follow; passing it through would yield a bogus 999.9 reading
    # that skews any later average.
    if tenths == 9999:
        return None
    return tenths / 10

# Decode every raw TMP reading into a temperature_c column
tmp_readings = df_weather['TMP']
df_weather['temperature_c'] = tmp_readings.apply(parse_temperature)

def parse_wind_speed(wnd_val):
    """Extract the wind speed in m/s from a raw WND field.

    The field is a comma-separated string such as ``'999,9,V,0015,1'``; the
    speed sits at index 3 and is expressed in tenths of a metre per second.

    Args:
        wnd_val: Raw WND value; it is stringified before parsing.

    Returns:
        The speed as a float, or None when the field has fewer than four
        parts, the speed part is not numeric, or it is the missing sentinel.
    """
    parts = str(wnd_val).split(',')
    if len(parts) < 4:
        return None

    speed_field = parts[3]
    if not speed_field.isdigit():
        return None
    try:
        tenths = int(speed_field)
    except ValueError:
        # isdigit() accepts a few characters (e.g. superscripts) int() rejects.
        return None

    # 9999 is the "missing" sentinel in the NOAA ISD encoding this column
    # appears to follow; it must not be reported as a 999.9 m/s wind.
    if tenths == 9999:
        return None
    return tenths / 10  # Convert tenths of m/s to m/s
    
# Decode every raw WND reading into a wind_speed_mps column
wnd_readings = df_weather['WND']
df_weather['wind_speed_mps'] = wnd_readings.apply(parse_wind_speed)

def parse_visibility(vis_val):
    """Extract the visibility distance in metres from a raw VIS field.

    The field is a comma-separated string whose first part is the distance,
    e.g. ``'000550,1,9,9'`` -> 550.

    Args:
        vis_val: Raw VIS value; it is stringified before parsing.

    Returns:
        The distance as an int (0 is a valid reading), or None when the first
        part is not numeric or is the missing-value sentinel.
    """
    vis_str = str(vis_val).split(',')[0]  # e.g. '000550'
    if not vis_str.isdigit():
        return None
    try:
        metres = int(vis_str)
    except ValueError:
        # isdigit() accepts a few characters (e.g. superscripts) int() rejects.
        return None

    # 999999 is the "missing" sentinel in the NOAA ISD encoding this column
    # appears to follow; it must not be reported as a real distance.
    if metres == 999999:
        return None
    return metres  # visibility in meters

def _visibility_km(vis_val):
    """Return the visibility in kilometres, or None when VIS cannot be parsed."""
    metres = parse_visibility(vis_val)  # parsed once per value
    # Compare with None explicitly: a reading of 0 m is valid and must stay
    # 0.0 km, whereas a plain truthiness check would discard it as missing.
    return None if metres is None else metres / 1000

df_weather['visibility_km'] = df_weather['VIS'].apply(_visibility_km)

# Extract weather events from REM attribute
# Weather and sky-condition codes searched for in the REM text, compiled once.
_WEATHER_CODE_RE = re.compile(r'(FG|BR|SN|RA|DZ|SG|VCFG|HZ|VCTS|TS|SHRA|CAVOK|OVC|BKN|SCT|NCD|FEW|NSC|TCU)')


def extract_weather_events(rem_val):
    """Return the first weather or sky-condition code found in a REM string.

    The text is scanned left to right and only the first matching code is
    returned; any later codes in the same string are ignored.

    Args:
        rem_val: Raw REM value. Anything that is not a string (NaN, None,
            numbers) is treated as "no remark".

    Returns:
        The matched code (e.g. ``'FG'``), or None when the input is not a
        string or contains none of the known codes.
    """
    # Explicit type guard instead of a catch-all except, so real errors and
    # KeyboardInterrupt are no longer silently swallowed.
    if not isinstance(rem_val, str):
        return None
    match = _WEATHER_CODE_RE.search(rem_val)
    return match.group(1) if match else None

# Tag each observation with the first known code found in its REM text
rem_text = df_weather['REM']
df_weather['weather_event'] = rem_text.apply(extract_weather_events)

# Preview the first five rows of the enriched frame
df_weather.head(5)

Unnamed: 0,DATE,TMP,WND,REM,VIS,date,hour,temperature_c,wind_speed_mps,visibility_km,weather_event
0,2024-01-01 00:20:00,201,"999,9,V,0015,1",MET058METAR LSZC 010020Z AUTO VRB03KT 0550 FG ...,550199,2024-01-01,0,2.0,1.5,0.55,FG
1,2024-01-01 00:50:00,101,"999,9,V,0015,1",MET059METAR LSZC 010050Z AUTO VRB03KT 1800 BR ...,1800199,2024-01-01,0,1.0,1.5,1.8,BR
2,2024-01-01 01:20:00,201,"290,1,V,0015,1",MET064METAR LSZC 010120Z AUTO 29003KT 250V330 ...,7000199,2024-01-01,1,2.0,1.5,7.0,FEW
3,2024-01-01 01:50:00,101,"999,9,V,0010,1",MET073METAR LSZC 010150Z AUTO VRB02KT 3100 BR ...,3100199,2024-01-01,1,1.0,1.0,3.1,BR
4,2024-01-01 02:20:00,101,"999,9,V,0010,1",MET066METAR LSZC 010220Z AUTO VRB02KT 2400 BR ...,2400199,2024-01-01,2,1.0,1.0,2.4,BR


### Map weather event codes to readable labels:

In [6]:
# Columns carried forward into the cleaned dataset
necessary_columns = ['date', 'hour', 'temperature_c', 'wind_speed_mps', 'visibility_km', 'weather_event']

# .copy() makes the selection an independent frame, so adding weather_label
# below does not raise SettingWithCopyWarning.
df_weather = df_weather[necessary_columns].copy()

# Human-readable label for each code that extract_weather_events can return
weather_map = {
    'FG': 'Fog',
    'BR': 'Mist',
    'SN': 'Snow',
    'RA': 'Rain',
    'DZ': 'Drizzle',
    'SG': 'Snow Grains',
    'VCFG': 'Fog Nearby',
    'HZ': 'Haze',
    'VCTS': 'Thunderstorm Nearby',
    'TS': 'Thunderstorm',
    'SHRA': 'Rain Showers',
    'CAVOK': 'Clear',
    'NCD': 'Clear',
    'NSC': 'Clear',
    'FEW': 'Mostly Clear',
    'SCT': 'Partly Cloudy',
    'BKN': 'Cloudy',
    'OVC': 'Overcast',
    'TCU': 'Towering Cumulus (Storm risk)'
}

# Codes missing from weather_map (and rows with no event) become NaN
df_weather['weather_label'] = df_weather['weather_event'].map(weather_map)

### Group the data by date and hour, then keep January through July:

In [None]:
def _distinct_labels(labels):
    """Collect the non-null, de-duplicated weather labels of one group."""
    return list(labels.dropna().unique())


# Drop the raw event code, then collapse all observations that share a
# (date, hour) pair: labels are gathered into a list, numeric readings averaged.
df_weather = (
    df_weather
    .drop(columns=['weather_event'])
    .groupby(['date', 'hour'], as_index=False)
    .agg({
        'weather_label': _distinct_labels,
        'temperature_c': 'mean',
        'wind_speed_mps': 'mean',
        'visibility_km': 'mean',
    })
)

# Turn the date column into datetimes (unparseable values become NaT) so it
# can be compared against the date bounds below.
df_weather['date'] = pd.to_datetime(df_weather['date'], errors='coerce')

# Keep 2024-01-01 through 2024-07-31, both ends inclusive
mask = df_weather['date'].between('2024-01-01', '2024-07-31')
df_filtered = df_weather.loc[mask]

In [12]:
# df_weather = df_weather.groupby(['date', 'hour'], as_index=False).mean(numeric_only=True)

# Save the cleaned weather data separately
# df_weather.to_csv('./data/cleaned_weather_hourly.csv', index=False)

# Display the first 5 rows of the DataFrame
# df_weather.head()

### Write the cleaned dataset to a CSV file:

In [14]:
# Write the filtered hourly weather table to disk, leaving out the index column
cleaned_csv_path = './data/cleaned_weather_hourly.csv'
df_filtered.to_csv(cleaned_csv_path, index=False)