In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import pyarrow

In [33]:
# Location of the raw weather export, relative to this notebook.
filepath = './../sample-data/weather_data_10_years.csv'

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1394064000,2014-03-06 00:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,9.79,10000.0,7.22,6.76,...,,,,,,75,803,Clouds,broken clouds,04d
1,1394067600,2014-03-06 01:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,9.75,10000.0,7.01,6.56,...,,,,,,100,804,Clouds,overcast clouds,04d
2,1394071200,2014-03-06 02:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,9.21,10000.0,6.65,6.36,...,,,,,,100,804,Clouds,overcast clouds,04d
3,1394074800,2014-03-06 03:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,8.67,10000.0,6.12,6.08,...,,0.21,,,,100,500,Rain,light rain,10n
4,1394078400,2014-03-06 04:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,8.0,10000.0,6.14,5.25,...,,0.3,,,,100,500,Rain,light rain,10n


In [34]:
# Columns retained for the rest of the notebook; every other column is discarded.
columns_to_keep = [
    'dt_iso',
    'temp',
    'visibility',
    'dew_point',
    'feels_like',
    'pressure',
    'humidity',
    'wind_speed',
    'clouds_all',
    'rain_1h',
    'snow_1h',
]

# Narrow the frame to that subset, in the order listed above.
df = df.loc[:, columns_to_keep]

# Preview the last five rows.
df.tail(n=5)

Unnamed: 0,dt_iso,temp,visibility,dew_point,feels_like,pressure,humidity,wind_speed,clouds_all,rain_1h,snow_1h
90993,2024-03-05 19:00:00 +0000 UTC,3.21,10000.0,-0.53,1.22,1017,76,2.06,75,,
90994,2024-03-05 20:00:00 +0000 UTC,4.18,10000.0,-1.41,0.78,1018,66,4.12,20,,
90995,2024-03-05 21:00:00 +0000 UTC,4.17,10000.0,-1.06,0.49,1018,68,4.63,20,,
90996,2024-03-05 22:00:00 +0000 UTC,4.57,10000.0,-1.45,1.26,1018,64,4.12,20,,
90997,2024-03-05 23:00:00 +0000 UTC,4.68,10000.0,-1.55,2.46,1017,63,2.57,40,,


In [35]:
# Keep the first 19 characters of each 'dt_iso' value ("YYYY-MM-DD HH:MM:SS"),
# which drops the trailing " +0000 UTC" suffix, and store them as 'Timestamp'.
# A new frame is built with assign/drop instead of assigning a column and
# dropping in place, so the frame produced by the earlier column selection is
# never mutated.
df = df.assign(Timestamp=df['dt_iso'].str[:19]).drop(columns=['dt_iso'])

# Reorganize the order of columns with 'Timestamp' at the beginning
df = df[['Timestamp'] + [col for col in df.columns if col != 'Timestamp']]

print(df)

                 Timestamp  temp  visibility  dew_point  feels_like  pressure  \
0      2014-03-06 00:00:00  9.79     10000.0       7.22        6.76      1001   
1      2014-03-06 01:00:00  9.75     10000.0       7.01        6.56      1000   
2      2014-03-06 02:00:00  9.21     10000.0       6.65        6.36       998   
3      2014-03-06 03:00:00  8.67     10000.0       6.12        6.08       998   
4      2014-03-06 04:00:00  8.00     10000.0       6.14        5.25       997   
...                    ...   ...         ...        ...         ...       ...   
90993  2024-03-05 19:00:00  3.21     10000.0      -0.53        1.22      1017   
90994  2024-03-05 20:00:00  4.18     10000.0      -1.41        0.78      1018   
90995  2024-03-05 21:00:00  4.17     10000.0      -1.06        0.49      1018   
90996  2024-03-05 22:00:00  4.57     10000.0      -1.45        1.26      1018   
90997  2024-03-05 23:00:00  4.68     10000.0      -1.55        2.46      1017   

       humidity  wind_speed

In [44]:
# Derive the calendar date of every row to use as the grouping key.
# df['Timestamp'] itself is deliberately left untouched: overwriting it with
# date-only values would change what the later cells display and write to CSV
# whenever the notebook is run top to bottom, and would make this cell
# non-idempotent. The key keeps the Series name 'Timestamp', so the grouped
# index (and the column produced by reset_index) is still called 'Timestamp'.
timestamp_date = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S').dt.date

# Every measurement gets the same three daily statistics.
# NOTE(review): rain_1h / snow_1h are empty for most rows in the displayed
# frames; pandas skips NaN in min/max/mean, so their statistics only cover
# hours with a recorded value — confirm whether missing should count as 0.
aggregations = {
    column: ['min', 'max', 'mean']
    for column in [
        'temp',
        'visibility',
        'dew_point',
        'feels_like',
        'pressure',
        'humidity',
        'wind_speed',
        'clouds_all',
        'rain_1h',
        'snow_1h',
    ]
}

# Group by calendar date and aggregate
daily_stats = df.groupby(timestamp_date).agg(aggregations)

# Flatten the (column, statistic) MultiIndex into '<column>_<stat>' names,
# spelling the mean as 'avg' (e.g. 'temp_min', 'temp_max', 'temp_avg').
daily_stats.columns = [
    column + '_' + ('avg' if stat == 'mean' else stat)
    for column, stat in daily_stats.columns
]

# Reset index to make the date a regular 'Timestamp' column again
daily_stats = daily_stats.reset_index()

# Display the resulting DataFrame
print(daily_stats)

       Timestamp  temp_min  temp_max  temp_avg  visibility_min  \
0     2014-03-06      7.38      9.79  8.595556          6437.0   
1     2014-03-07      6.79      9.52  8.121667         10000.0   
2     2014-03-08      7.14      9.21  8.134231          9656.0   
3     2014-03-09      6.48     12.00  8.251481          8047.0   
4     2014-03-10      4.91     11.65  7.250000         10000.0   
...          ...       ...       ...       ...             ...   
3648  2024-03-01      2.26      5.20  3.387586          4828.0   
3649  2024-03-02      0.39      6.81  2.820769          3219.0   
3650  2024-03-03      2.07      6.49  3.203333          4828.0   
3651  2024-03-04      0.63      4.59  2.117500          2816.0   
3652  2024-03-05     -2.40      4.68  0.887500           805.0   

      visibility_max  visibility_avg  dew_point_min  dew_point_max  \
0            10000.0     9685.111111           6.12           8.03   
1            10000.0    10000.000000           5.27           7.05 

In [37]:
# One-hot encoding of the weather category columns is intentionally disabled:
# per the original note, it had little/no effect on the model.
# The disabled code is kept inside a triple-quoted string, so this cell only
# evaluates a string literal and the notebook echoes its repr as the output.
# NOTE(review): df was already narrowed to `columns_to_keep`, which does not
# include 'weather_main' or 'weather_description'; those columns would have to
# be kept before this snippet could be re-enabled.
"""
# Combine 'weather_main' and 'weather_description' into a single feature
df['combined_weather'] = df['weather_main'] + "_" + df['weather_description']

# One-hot encoding for the combined feature
one_hot_encoded = pd.get_dummies(df['combined_weather'], prefix='weather')

# Concatenate the one-hot encoded feature with the original DataFrame
df_encoded = pd.concat([df, one_hot_encoded], axis=1)

# Drop the original 'weather_main' and 'weather_description' columns
df_encoded.drop(['weather_main', 'weather_description'], axis=1, inplace=True)

print(df_encoded)
"""

# Also disabled: dropping the raw weather text columns from df.
#df.drop(['weather_main', 'weather_description'], axis=1, inplace=True)

'\n# Combine \'weather_main\' and \'weather_description\' into a single feature\ndf[\'combined_weather\'] = df[\'weather_main\'] + "_" + df[\'weather_description\']\n\n# One-hot encoding for the combined feature\none_hot_encoded = pd.get_dummies(df[\'combined_weather\'], prefix=\'weather\')\n\n# Concatenate the one-hot encoded feature with the original DataFrame\ndf_encoded = pd.concat([df, one_hot_encoded], axis=1)\n\n# Drop the original \'weather_main\' and \'weather_description\' columns\ndf_encoded.drop([\'weather_main\', \'weather_description\', \'combined_weather\'], axis=1, inplace=True)\n\nprint(df_encoded)\n'

In [38]:
# Show the current contents of df (the bare last expression is rendered as the cell output).
df

Unnamed: 0,Timestamp,temp,visibility,dew_point,feels_like,pressure,humidity,wind_speed,clouds_all,rain_1h,snow_1h
0,2014-03-06 00:00:00,9.79,10000.0,7.22,6.76,1001,84,6.70,75,,
1,2014-03-06 01:00:00,9.75,10000.0,7.01,6.56,1000,83,7.20,100,,
2,2014-03-06 02:00:00,9.21,10000.0,6.65,6.36,998,84,5.66,100,,
3,2014-03-06 03:00:00,8.67,10000.0,6.12,6.08,998,84,4.63,100,0.21,
4,2014-03-06 04:00:00,8.00,10000.0,6.14,5.25,997,88,4.63,100,0.30,
...,...,...,...,...,...,...,...,...,...,...,...
90993,2024-03-05 19:00:00,3.21,10000.0,-0.53,1.22,1017,76,2.06,75,,
90994,2024-03-05 20:00:00,4.18,10000.0,-1.41,0.78,1018,66,4.12,20,,
90995,2024-03-05 21:00:00,4.17,10000.0,-1.06,0.49,1018,68,4.63,20,,
90996,2024-03-05 22:00:00,4.57,10000.0,-1.45,1.26,1018,64,4.12,20,,


In [39]:
# Write df to CSV next to the raw data, without the row index column.
df.to_csv(
    path_or_buf='./../sample-data/weather_data_10_years_preprocessed.csv',
    index=False,
)