In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import pyarrow

In [110]:
# Location of the raw weather export, relative to the notebook's working directory.
filepath = 'sample-data/weather_data_10_years.csv'

# Read the whole CSV into memory using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)


Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1394064000,2014-03-06 00:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,9.79,10000.0,7.22,6.76,...,,,,,,75,803,Clouds,broken clouds,04d
1,1394067600,2014-03-06 01:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,9.75,10000.0,7.01,6.56,...,,,,,,100,804,Clouds,overcast clouds,04d
2,1394071200,2014-03-06 02:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,9.21,10000.0,6.65,6.36,...,,,,,,100,804,Clouds,overcast clouds,04d
3,1394074800,2014-03-06 03:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,8.67,10000.0,6.12,6.08,...,,0.21,,,,100,500,Rain,light rain,10n
4,1394078400,2014-03-06 04:00:00 +0000 UTC,-28800,The University of British Columbia,49.260605,-123.245994,8.0,10000.0,6.14,5.25,...,,0.3,,,,100,500,Rain,light rain,10n


In [111]:
# Columns retained for the rest of the notebook; every other column of the
# loaded frame is discarded.
columns_to_keep = [
    'dt_iso',
    'temp',
    'visibility',
    'dew_point',
    'feels_like',
    'pressure',
    'humidity',
    'wind_speed',
    'clouds_all',
    'rain_1h',
    'snow_1h',
    'weather_main',
    'weather_description',
]

# Select the desired columns and take an explicit copy. Later cells add new
# columns to `df`; without the copy pandas still tracks the result as a
# selection of the original frame and may emit SettingWithCopyWarning on
# those assignments.
df = df[columns_to_keep].copy()

df.head()

Unnamed: 0,dt_iso,temp,visibility,dew_point,feels_like,pressure,humidity,wind_speed,clouds_all,rain_1h,snow_1h,weather_main,weather_description
0,2014-03-06 00:00:00 +0000 UTC,9.79,10000.0,7.22,6.76,1001,84,6.7,75,,,Clouds,broken clouds
1,2014-03-06 01:00:00 +0000 UTC,9.75,10000.0,7.01,6.56,1000,83,7.2,100,,,Clouds,overcast clouds
2,2014-03-06 02:00:00 +0000 UTC,9.21,10000.0,6.65,6.36,998,84,5.66,100,,,Clouds,overcast clouds
3,2014-03-06 03:00:00 +0000 UTC,8.67,10000.0,6.12,6.08,998,84,4.63,100,0.21,,Rain,light rain
4,2014-03-06 04:00:00 +0000 UTC,8.0,10000.0,6.14,5.25,997,88,4.63,100,0.3,,Rain,light rain


In [112]:
# Keep only the leading 'YYYY-MM-DD HH:MM:SS' part of 'dt_iso' (its first 19
# characters), which drops the trailing ' +0000 UTC' suffix. The values stay
# plain strings; they are not parsed into datetimes here.
#
# The guard makes the cell safe to re-run: once 'Timestamp' exists, 'dt_iso'
# has already been removed and slicing it again would raise a KeyError.
if 'Timestamp' not in df.columns:
    timestamp_column = df['dt_iso'].str[:19].rename('Timestamp')

    # Build a new frame with 'Timestamp' first, followed by every remaining
    # column in its existing order, instead of mutating `df` in place.
    df = pd.concat([timestamp_column, df.drop(columns=['dt_iso'])], axis=1)

print(df)

                 Timestamp  temp  visibility  dew_point  feels_like  pressure  \
0      2014-03-06 00:00:00  9.79     10000.0       7.22        6.76      1001   
1      2014-03-06 01:00:00  9.75     10000.0       7.01        6.56      1000   
2      2014-03-06 02:00:00  9.21     10000.0       6.65        6.36       998   
3      2014-03-06 03:00:00  8.67     10000.0       6.12        6.08       998   
4      2014-03-06 04:00:00  8.00     10000.0       6.14        5.25       997   
...                    ...   ...         ...        ...         ...       ...   
90993  2024-03-05 19:00:00  3.21     10000.0      -0.53        1.22      1017   
90994  2024-03-05 20:00:00  4.18     10000.0      -1.41        0.78      1018   
90995  2024-03-05 21:00:00  4.17     10000.0      -1.06        0.49      1018   
90996  2024-03-05 22:00:00  4.57     10000.0      -1.45        1.26      1018   
90997  2024-03-05 23:00:00  4.68     10000.0      -1.55        2.46      1017   

       humidity  wind_speed

In [113]:
# Join the two weather text columns into one label per row, in the form
# '<weather_main>_<weather_description>'.
# Note: this helper column is added to `df` itself and is only removed from
# `df_encoded` below, so `df` keeps 'combined_weather' after this cell.
df['combined_weather'] = df['weather_main'] + "_" + df['weather_description']

# Expand the joined label into indicator columns, one per distinct label,
# each named with the 'weather' prefix.
one_hot_encoded = pd.get_dummies(df['combined_weather'], prefix='weather')

# Append the indicator columns to the existing columns, then remove the three
# text columns they replace.
df_encoded = (
    pd.concat([df, one_hot_encoded], axis=1)
    .drop(columns=['weather_main', 'weather_description', 'combined_weather'])
)

print(df_encoded)

                 Timestamp  temp  visibility  dew_point  feels_like  pressure  \
0      2014-03-06 00:00:00  9.79     10000.0       7.22        6.76      1001   
1      2014-03-06 01:00:00  9.75     10000.0       7.01        6.56      1000   
2      2014-03-06 02:00:00  9.21     10000.0       6.65        6.36       998   
3      2014-03-06 03:00:00  8.67     10000.0       6.12        6.08       998   
4      2014-03-06 04:00:00  8.00     10000.0       6.14        5.25       997   
...                    ...   ...         ...        ...         ...       ...   
90993  2024-03-05 19:00:00  3.21     10000.0      -0.53        1.22      1017   
90994  2024-03-05 20:00:00  4.18     10000.0      -1.41        0.78      1018   
90995  2024-03-05 21:00:00  4.17     10000.0      -1.06        0.49      1018   
90996  2024-03-05 22:00:00  4.57     10000.0      -1.45        1.26      1018   
90997  2024-03-05 23:00:00  4.68     10000.0      -1.55        2.46      1017   

       humidity  wind_speed

In [114]:
# Write the encoded frame to a CSV in the current working directory; the row
# index is left out of the file.
df_encoded.to_csv(
    path_or_buf='weather_data_10_years_preprocessed.csv',
    index=False,
)