In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
from zoneinfo import ZoneInfo
from statsmodels.tsa.seasonal import seasonal_decompose

# Load the raw observations, discard exact duplicate rows and order them by
# the epoch-seconds column `dt`.
df = (
    pd.read_csv('../data_csv_raw/raw_data.csv')
    .drop_duplicates()
    .sort_values(by='dt')
)

# `dt` is parsed as seconds since the epoch (UTC) and converted to
# Asia/Kuala_Lumpur local time before any calendar field is derived.
df['utc8'] = pd.to_datetime(df['dt'], unit='s', utc=True).dt.tz_convert('Asia/Kuala_Lumpur')
local = df['utc8'].dt

# Only month, hour, ISO week, day-of-year and quarter are derived; year, day,
# minute, second, date, time and weekday name are intentionally left out.
df['month'] = local.month
df['hour'] = local.hour

# Hour of day rescaled with the min/max hour observed in the data.
hour_lo, hour_hi = df['hour'].min(), df['hour'].max()
df['hour_stan'] = (df['hour'] - hour_lo) / (hour_hi - hour_lo)

# Calendar positions expressed as fractions of fixed divisors.
# NOTE(review): ISO week numbers run up to 53 and dayofyear up to 366, so
# `week` and `day_of_year` can slightly exceed 1 — confirm this is acceptable.
df['week'] = local.isocalendar().week / 52
df['day_of_year'] = local.dayofyear / 365
df['quarter'] = local.quarter / 4

def _min_max(series):
    """Rescale a Series linearly using its own minimum and maximum."""
    lo = series.min()
    return (series - lo) / (series.max() - lo)


# Position of each observation relative to that row's sunrise / sunset stamps.
before_sunrise = df['dt'] <= df['sunrise']
daylight = (df['dt'] > df['sunrise']) & (df['dt'] <= df['sunset'])
conditions = [before_sunrise, daylight]
choices = [0, 1]

# Wind direction in degrees expressed as a fraction of a full turn.
df['wind_deg'] = df['wind_deg'] / 360

# Three-way code: 0 = at/before sunrise, 1 = after sunrise through sunset,
# 2 = anything else; stored divided by 3.
# NOTE(review): dividing codes 0/1/2 by 3 caps the feature at 2/3 — confirm intended.
df['sunrise_sunset'] = np.select(conditions, choices, default=2) / 3

# Binary flag: 1 only for the after-sunrise-through-sunset case, otherwise 0.
df['sun_on'] = np.select(conditions, choices, default=0)

# Pressure is centred on its mean and divided by its standard deviation;
# every other signal below is min-max rescaled instead.
df['pressure_norm'] = (df['pressure'] - df['pressure'].mean()) / df['pressure'].std()
df['visibility'] = _min_max(df['visibility'])  # overwrites the raw column

for scaled_name, raw_name in (
    ('temp_stan', 'temp'),
    ('feelslike_stan', 'feels_like'),
    ('humidity_stan', 'humidity'),
    ('dewpoint_stan', 'dew_point'),
):
    df[scaled_name] = _min_max(df[raw_name])

# Lagged copies of each scaled signal, shifted by 1 through 6 rows.
# NOTE(review): `shift` moves by rows, so a lag equals "hours" only if the
# rows are hourly and gap-free — confirm against the raw data.
lag_sources = {
    'templag': 'temp_stan',
    'pressurelag': 'pressure_norm',
    'humiditylag': 'humidity_stan',
    'dewpointlag': 'dewpoint_stan',
    'feelslikelag': 'feelslike_stan',
}

for steps in range(1, 7):
    # Inner loop order fixes the column order within each lag step.
    for prefix, column in lag_sources.items():
        df[f'{prefix}_{steps}'] = df[column].shift(steps)

# Rolling means of each scaled signal over windows of 2, 4, 6, 8 and 10 rows.
# The column suffix is the window size minus one (1, 3, 5, 7, 9), i.e. a
# window of 2 rows is stored as `*MA_1`.
# NOTE(review): presumably the suffix counts the *previous* rows averaged with
# the current one — confirm the naming is intentional.
ma_sources = {
    'temp': 'temp_stan',
    'pressure': 'pressure_norm',
    'humidity': 'humidity_stan',
    'dewpoint': 'dewpoint_stan',
    'feelslike': 'feelslike_stan',
}

for window in range(2, 11, 2):
    suffix = window - 1
    for prefix, column in ma_sources.items():
        df[f'{prefix}MA_{suffix}'] = df[column].rolling(window=window).mean()

# Short feature prefix -> scaled source column, in the order the output
# columns are created.
decomp_sources = {
    'temp': 'temp_stan',
    'pressure': 'pressure_norm',
    'humidity': 'humidity_stan',
    'dewpoint': 'dewpoint_stan',
    'feelslike': 'feelslike_stan',
}

# One additive decomposition per signal with a period of 24 rows.
(
    decomp_temp,
    decomp_pressure,
    decomp_humidity,
    decomp_dewpoint,
    decomp_feelslike,
) = (
    seasonal_decompose(df[column], model='additive', period=24)
    for column in decomp_sources.values()
)

fitted = dict(zip(
    decomp_sources,
    (decomp_temp, decomp_pressure, decomp_humidity, decomp_dewpoint, decomp_feelslike),
))

# Trend and residual are forward- then back-filled so they carry no NaN;
# the seasonal component is stored exactly as returned.
for prefix, result in fitted.items():
    df[f'{prefix}_trend'] = result.trend.ffill().bfill()
    df[f'{prefix}_seasonal'] = result.seasonal
    df[f'{prefix}_residual'] = result.resid.ffill().bfill()

# Scaled signal multiplied by (1 + its seasonal component).
# NOTE(review): this applies an additive-model seasonal term multiplicatively —
# confirm that is the intended feature.
for prefix, column in decomp_sources.items():
    df[f'{prefix}_w_seas'] = df[column] * (1 + df[f'{prefix}_seasonal'])

# Raw / intermediate columns that are removed once the engineered features exist.
drop_cols = [
    'lat', 'lon', 'timezone', 'timezone_offset',
    'dt', 'sunrise', 'sunset',
    'temp', 'feels_like', 'pressure', 'humidity', 'dew_point',
    'uvi', 'utc8', 'hour',
    'weather_id', 'weather_icon',
]

# Remove those columns, skip the first 12 rows and renumber the index from 0.
# NOTE(review): the 12-row offset presumably discards the warm-up rows where
# lag / rolling features are still NaN — confirm the value.
df = df.drop(columns=drop_cols).iloc[12:].reset_index(drop=True)

# Discard rows labelled Smoke or Haze, then rows with no visibility reading.
# The index is not renumbered again, so it keeps gaps after these drops.
smoke_or_haze = df['weather_main'].isin(['Smoke', 'Haze'])
df.drop(index=df.index[smoke_or_haze], inplace=True)
df.dropna(subset=['visibility'], inplace=True)


df

Unnamed: 0,clouds,visibility,wind_speed,wind_deg,weather_main,weather_desc,month,hour_stan,week,day_of_year,...,dewpoint_seasonal,dewpoint_residual,feelslike_trend,feelslike_seasonal,feelslike_residual,temp_w_seas,pressure_w_seas,humidity_w_seas,dewpoint_w_seas,feelslike_w_seas
0,20,1.000000,1.03,0.000000,Clouds,few clouds,1,0.521739,0.019231,0.002740,...,-0.015395,-0.184974,0.353682,-0.024268,0.089758,0.483269,0.762677,0.297069,0.338854,0.408999
1,20,1.000000,1.54,0.694444,Clouds,few clouds,1,0.565217,0.019231,0.002740,...,-0.007037,-0.135705,0.352867,-0.018232,0.273420,0.644240,0.131792,0.250874,0.399538,0.596968
11,20,1.000000,1.03,0.000000,Clouds,few clouds,1,1.000000,0.019231,0.002740,...,0.000335,-0.025086,0.357440,0.036867,-0.086310,0.291972,0.604112,0.697650,0.516402,0.319353
14,20,1.000000,1.03,0.000000,Clouds,few clouds,1,0.086957,0.019231,0.005479,...,0.004630,0.020036,0.369978,-0.000487,-0.232225,0.260352,0.102981,0.821780,0.563648,0.137199
15,20,1.000000,0.51,0.000000,Rain,light rain,1,0.130435,0.019231,0.005479,...,0.000154,0.029491,0.370403,-0.018298,-0.218810,0.252195,0.106195,0.826264,0.557530,0.130857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3854,20,1.000000,3.09,0.805556,Clouds,few clouds,6,0.782609,0.442308,0.430137,...,0.004630,0.032520,0.311472,-0.000487,0.091350,0.418561,-0.870751,0.570215,0.508267,0.472827
3856,20,1.000000,2.57,0.777778,Clouds,few clouds,6,0.869565,0.442308,0.430137,...,-0.002369,0.032520,0.311472,-0.031074,0.091350,0.310631,-0.379771,0.662080,0.494960,0.333051
3857,20,0.882353,2.06,0.916667,Rain,light rain,6,0.913043,0.442308,0.430137,...,-0.006359,0.032520,0.311472,-0.043271,0.091350,0.308296,-0.390306,0.662751,0.492981,0.328859
3858,20,1.000000,1.03,0.000000,Rain,light rain,6,0.956522,0.442308,0.430137,...,-0.002293,0.032520,0.311472,-0.051876,0.091350,0.275495,0.626065,0.737010,0.516073,0.294709


In [28]:
# Count of missing values in every column of the feature frame.
# NOTE(review): the saved output still lists `weather_id`, which the
# preprocessing cell drops — re-run after Restart & Run All to refresh it.
missing_per_column = df.isna().sum()
missing_per_column


clouds              0
visibility          0
wind_speed          0
wind_deg            0
weather_id          0
                   ..
temp_w_seas         0
pressure_w_seas     0
humidity_w_seas     0
dewpoint_w_seas     0
feelslike_w_seas    0
Length: 95, dtype: int64

In [3]:
# Distinct weather categories present in the frame.
# NOTE(review): the saved output still shows 'Smoke' and 'Haze', so it appears
# to predate the row drops in the preprocessing cell — re-run to refresh it.
weather_labels = df['weather_main'].unique()
weather_labels

array(['Clouds', 'Rain', 'Thunderstorm', 'Smoke', 'Haze'], dtype=object)

In [4]:
# Remove rows whose weather category is Smoke or Haze (in place).
# The preprocessing cell already drops these rows, so when the notebook is run
# top to bottom this cell removes nothing.
smoke_or_haze = df['weather_main'].isin(['Smoke', 'Haze'])
df.drop(index=df.index[smoke_or_haze], inplace=True)
df

Unnamed: 0,clouds,visibility,wind_speed,wind_deg,weather_main,weather_desc,month,hour_stan,week,day_of_year,...,dewpoint_seasonal,dewpoint_residual,feelslike_trend,feelslike_seasonal,feelslike_residual,temp_w_seas,pressure_w_seas,humidity_w_seas,dewpoint_w_seas,feelslike_w_seas
0,20,1.000000,1.03,0.000000,Clouds,few clouds,1,0.521739,0.019231,0.002740,...,-0.015395,-0.184974,0.353682,-0.024268,0.089758,0.483269,0.762677,0.297069,0.338854,0.408999
1,20,1.000000,1.54,0.694444,Clouds,few clouds,1,0.565217,0.019231,0.002740,...,-0.007037,-0.135705,0.352867,-0.018232,0.273420,0.644240,0.131792,0.250874,0.399538,0.596968
11,20,1.000000,1.03,0.000000,Clouds,few clouds,1,1.000000,0.019231,0.002740,...,0.000335,-0.025086,0.357440,0.036867,-0.086310,0.291972,0.604112,0.697650,0.516402,0.319353
14,20,1.000000,1.03,0.000000,Clouds,few clouds,1,0.086957,0.019231,0.005479,...,0.004630,0.020036,0.369978,-0.000487,-0.232225,0.260352,0.102981,0.821780,0.563648,0.137199
15,20,1.000000,0.51,0.000000,Rain,light rain,1,0.130435,0.019231,0.005479,...,0.000154,0.029491,0.370403,-0.018298,-0.218810,0.252195,0.106195,0.826264,0.557530,0.130857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3854,20,1.000000,3.09,0.805556,Clouds,few clouds,6,0.782609,0.442308,0.430137,...,0.004630,0.032520,0.311472,-0.000487,0.091350,0.418561,-0.870751,0.570215,0.508267,0.472827
3856,20,1.000000,2.57,0.777778,Clouds,few clouds,6,0.869565,0.442308,0.430137,...,-0.002369,0.032520,0.311472,-0.031074,0.091350,0.310631,-0.379771,0.662080,0.494960,0.333051
3857,20,0.882353,2.06,0.916667,Rain,light rain,6,0.913043,0.442308,0.430137,...,-0.006359,0.032520,0.311472,-0.043271,0.091350,0.308296,-0.390306,0.662751,0.492981,0.328859
3858,20,1.000000,1.03,0.000000,Rain,light rain,6,0.956522,0.442308,0.430137,...,-0.002293,0.032520,0.311472,-0.051876,0.091350,0.275495,0.626065,0.737010,0.516073,0.294709
