## Cleaning Weather Dataset 
### Extracting wind speed, temperature, and visibility values into a final hourly dataset for merging

In [4]:
import pandas as pd

# Read the raw weather observations into a DataFrame.
# NOTE(review): the file name says LaGuardia, but the preview output further
# down shows station "BUOCHS AIRPORT STANS, CH" -- confirm this is the
# intended dataset.
WEATHER_CSV_PATH = "./data/weather_laguardia_airport_2024.csv"
df_weather = pd.read_csv(WEATHER_CSV_PATH)

# Parse temperature from TMP
def parse_temperature(tmp_val):
    """Decode the air temperature from an encoded ``TMP`` value.

    The value is treated as a comma-separated string whose first element is
    a signed integer in tenths of a degree Celsius (e.g. ``'+0020,1'`` is
    2.0 degrees C).

    Args:
        tmp_val: Raw cell value from the ``TMP`` column. It is converted
            with ``str`` first, so NaN / None inputs are tolerated.

    Returns:
        The temperature in degrees Celsius as a float, or ``None`` when the
        first element is not an integer or equals the missing-data sentinel.
    """
    # In the NOAA ISD format '+9999' marks a missing observation; decoding it
    # would inject a bogus 999.9 degree reading into any later average.
    missing_sentinel = 9999

    temp_str = str(tmp_val).split(',')[0]  # '+0020'
    try:
        tenths = int(temp_str)
    except ValueError:
        # Non-numeric text such as 'nan' or 'None'.
        return None
    if tenths == missing_sentinel:
        return None
    return tenths / 10

# Decode the TMP column element-wise into the temperature_c column.
df_weather['temperature_c'] = df_weather['TMP'].map(parse_temperature)

def parse_wind_speed(wnd_val):
    """Decode the wind speed from an encoded ``WND`` value.

    The value is treated as a comma-separated string whose fourth element
    (index 3) is the speed in tenths of a metre per second.

    Args:
        wnd_val: Raw cell value from the ``WND`` column. It is converted
            with ``str`` first, so NaN / None inputs are tolerated.

    Returns:
        The wind speed in m/s as a float, or ``None`` when the value has
        fewer than four parts, the speed part is not made of digits, or it
        equals the missing-data sentinel.
    """
    # In the NOAA ISD format a speed of '9999' marks a missing observation;
    # decoding it would yield a bogus 999.9 m/s reading.
    missing_sentinel = 9999

    # Split the encoded string by commas
    parts = str(wnd_val).split(',')
    # Ensure it has at least 4 parts (SSSS is at index 3)
    if len(parts) < 4 or not parts[3].isdigit():
        return None  # Invalid or malformed
    try:
        tenths = int(parts[3])
    except ValueError:
        # str.isdigit() accepts a few characters (e.g. superscripts) that
        # int() rejects; treat those as malformed too.
        return None
    if tenths == missing_sentinel:
        return None
    return tenths / 10  # Convert tenths of m/s to m/s
    
# Decode the WND column element-wise into the wind_speed_mps column.
df_weather['wind_speed_mps'] = df_weather['WND'].map(parse_wind_speed)

def parse_visibility(vis_val):
    """Decode the visibility distance from an encoded ``VIS`` value.

    The value is treated as a comma-separated string whose first element is
    the distance in metres (e.g. ``'000550,1,9,9'`` is 550 m).

    Args:
        vis_val: Raw cell value from the ``VIS`` column. It is converted
            with ``str`` first, so NaN / None inputs are tolerated.

    Returns:
        The visibility in metres as an int (0 is a valid reading), or
        ``None`` when the first element is not made of digits or equals the
        missing-data sentinel.
    """
    # In the NOAA ISD format a distance of '999999' marks a missing
    # observation; keeping it would read as a 999.999 km visibility.
    missing_sentinel = 999999

    vis_str = str(vis_val).split(',')[0]  # e.g. '000550'
    if not vis_str.isdigit():
        return None
    try:
        meters = int(vis_str)
    except ValueError:
        # str.isdigit() accepts a few characters (e.g. superscripts) that
        # int() rejects; treat those as malformed too.
        return None
    if meters == missing_sentinel:
        return None
    return meters  # visibility in meters

# Convert the parsed visibility from metres to kilometres. Each value is
# parsed once; unparseable readings (None) become NaN, while a genuine 0 m
# reading is kept as 0.0 km instead of being treated as missing because it
# is falsy.
df_weather['visibility_km'] = (
    pd.to_numeric(df_weather['VIS'].apply(parse_visibility)) / 1000
)

df_weather.head()

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,GF1,MA1,MW1,MW2,OC1,REM,EQD,temperature_c,wind_speed_mps,visibility_km
0,6806599999,2024-01-01T00:20:00,4,46.966667,8.4,450.0,"BUOCHS AIRPORT STANS, CH",FM-15,99999,V020,...,09991091999000611999999,101301999999,,,,MET058METAR LSZC 010020Z AUTO VRB03KT 0550 FG ...,,2.0,1.5,0.55
1,6806599999,2024-01-01T00:50:00,4,46.966667,8.4,450.0,"BUOCHS AIRPORT STANS, CH",FM-15,99999,V020,...,99999021999036581999999,101301999999,,,,MET059METAR LSZC 010050Z AUTO VRB03KT 1800 BR ...,,1.0,1.5,1.8
2,6806599999,2024-01-01T01:20:00,4,46.966667,8.4,450.0,"BUOCHS AIRPORT STANS, CH",FM-15,99999,V020,...,99999021999003961999999,101301999999,,,,MET064METAR LSZC 010120Z AUTO 29003KT 250V330 ...,,2.0,1.5,7.0
3,6806599999,2024-01-01T01:50:00,4,46.966667,8.4,450.0,"BUOCHS AIRPORT STANS, CH",FM-15,99999,V020,...,99999041999001831999999,101301999999,,,,MET073METAR LSZC 010150Z AUTO VRB02KT 3100 BR ...,,1.0,1.0,3.1
4,6806599999,2024-01-01T02:20:00,4,46.966667,8.4,450.0,"BUOCHS AIRPORT STANS, CH",FM-15,99999,V020,...,99999071999002441999999,101301999999,,,,MET066METAR LSZC 010220Z AUTO VRB02KT 2400 BR ...,,1.0,1.0,2.4


In [11]:
# Derive the merge keys (calendar date and hour of day) from the timestamp.
observed_at = pd.to_datetime(df_weather['DATE'])
df_weather['DATE'] = observed_at
df_weather['date'] = observed_at.dt.date
df_weather['hour'] = observed_at.dt.hour

# Keep only the merge keys and the derived measurements, then collapse
# multiple observations within the same hour into their mean.
merge_keys = ['date', 'hour']
measurement_columns = ['temperature_c', 'wind_speed_mps', 'visibility_km']
weather_clean = (
    df_weather[merge_keys + measurement_columns]
    .groupby(merge_keys, as_index=False)
    .mean(numeric_only=True)
)

# Persist the hourly table as its own CSV.
weather_clean.to_csv('./data/cleaned_weather_hourly.csv', index=False)

# Preview the first rows of the result.
weather_clean.head()

Unnamed: 0,date,hour,temperature_c,wind_speed_mps,visibility_km
0,2024-01-01,0,1.5,1.5,1.175
1,2024-01-01,1,1.5,1.25,5.05
2,2024-01-01,2,1.5,1.0,3.05
3,2024-01-01,3,2.0,1.55,8.9995
4,2024-01-01,4,2.0,1.55,7.5
