In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import pyarrow

In [22]:
import pandas as pd

# Load the data
filepath1 = './../sample-data/weather_data_10_years.csv'
filepath2 = './../sample-data/latest_weather.csv'
df1 = pd.read_csv(filepath1)
df2 = pd.read_csv(filepath2)

# Convert dt columns to datetime objects
df1['dt'] = pd.to_datetime(df1['dt'], unit='s', utc=True)
df2['dt'] = pd.to_datetime(df2['dt'], unit='s', utc=True)

# Define the cutoff datetime
cutoff_datetime = pd.Timestamp('2024-01-01 00:00:00', tz='UTC')

# Filter df1 to include only rows before the cutoff datetime
df1_filtered = df1[df1['dt'] < cutoff_datetime]

# Concatenate the filtered df1 with df2
df = pd.concat([df1_filtered, df2], ignore_index=True)


# Save the combined dataframe to a new CSV file if needed
df.to_csv('./../sample-data/combined_weather_data.csv', index=False)


In [23]:
# Specify the columns you want to keep
columns_to_keep = ['dt_iso', 'temp', 'visibility', 'dew_point', 'feels_like', 'pressure', 'humidity', 'wind_speed', 'clouds_all', 'rain_1h', 'snow_1h']

# Filter the DataFrame to include only the desired columns
df = df[columns_to_keep]

df.tail()

Unnamed: 0,dt_iso,temp,visibility,dew_point,feels_like,pressure,humidity,wind_speed,clouds_all,rain_1h,snow_1h
93934,2024-07-02 19:00:00 +0000 UTC,18.06,10000.0,12.94,17.8,1023,72,4.63,20,,
93935,2024-07-02 20:00:00 +0000 UTC,18.46,10000.0,13.32,18.24,1023,72,6.26,20,,
93936,2024-07-02 21:00:00 +0000 UTC,18.47,10000.0,13.12,18.23,1023,71,5.36,20,,
93937,2024-07-02 22:00:00 +0000 UTC,19.13,10000.0,12.63,18.82,1023,66,6.17,20,,
93938,2024-07-02 23:00:00 +0000 UTC,19.62,10000.0,11.65,19.2,1022,60,4.12,20,,


In [24]:
# Extract first 10 characters from each value in 'dt_iso' column
df['Timestamp'] = df['dt_iso'].str[:19]
df.drop(['dt_iso'], axis=1, inplace=True)

# Reorganize the order of columns with 'Timestamp' at the beginning
df = df[['Timestamp'] + [col for col in df.columns if col != 'Timestamp']]

print(df)

                 Timestamp   temp  visibility  dew_point  feels_like  \
0      2014-03-06 00:00:00   9.79     10000.0       7.22        6.76   
1      2014-03-06 01:00:00   9.75     10000.0       7.01        6.56   
2      2014-03-06 02:00:00   9.21     10000.0       6.65        6.36   
3      2014-03-06 03:00:00   8.67     10000.0       6.12        6.08   
4      2014-03-06 04:00:00   8.00     10000.0       6.14        5.25   
...                    ...    ...         ...        ...         ...   
93934  2024-07-02 19:00:00  18.06     10000.0      12.94       17.80   
93935  2024-07-02 20:00:00  18.46     10000.0      13.32       18.24   
93936  2024-07-02 21:00:00  18.47     10000.0      13.12       18.23   
93937  2024-07-02 22:00:00  19.13     10000.0      12.63       18.82   
93938  2024-07-02 23:00:00  19.62     10000.0      11.65       19.20   

       pressure  humidity  wind_speed  clouds_all  rain_1h  snow_1h  
0          1001        84        6.70          75      NaN      N

In [25]:
df.to_csv('./../sample-data/weather_data_10_years_preprocessed.csv', index=False)

In [26]:

df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')
# Extract date for grouping
df['Timestamp'] = df['Timestamp'].dt.date
# Define aggregation functions
aggregations = {
    'temp': ['min', 'max', 'mean'],
    'visibility': ['min', 'max', 'mean'],
    'dew_point': ['min', 'max', 'mean'],
    'feels_like': ['min', 'max', 'mean'],
    'pressure': ['min', 'max', 'mean'],
    'humidity': ['min', 'max', 'mean'],
    'wind_speed': ['min', 'max', 'mean'],
    'clouds_all': ['min', 'max', 'mean'],
    'rain_1h': ['min', 'max', 'mean'],
    'snow_1h': ['min', 'max', 'mean']
}

# Group by 'date' and aggregate
daily_stats = df.groupby('Timestamp').agg(aggregations)

# Flatten column names for easier access
daily_stats.columns = ['_'.join(col).strip() for col in daily_stats.columns.values]

# Simplify column names
daily_stats.columns = [col.replace('_min', '_min').replace('_max', '_max').replace('_mean', '_avg') for col in daily_stats.columns]

# Reset index to make 'date' a column again
daily_stats = daily_stats.reset_index()

# Display the resulting DataFrame
print(daily_stats)

       Timestamp  temp_min  temp_max   temp_avg  visibility_min  \
0     2014-03-06      7.38      9.79   8.595556          6437.0   
1     2014-03-07      6.79      9.52   8.121667         10000.0   
2     2014-03-08      7.14      9.21   8.134231          9656.0   
3     2014-03-09      6.48     12.00   8.251481          8047.0   
4     2014-03-10      4.91     11.65   7.250000         10000.0   
...          ...       ...       ...        ...             ...   
3767  2024-06-28     11.38     18.45  13.705172          3219.0   
3768  2024-06-29     14.71     19.82  16.976667         10000.0   
3769  2024-06-30     15.13     22.68  17.293750         10000.0   
3770  2024-07-01     14.55     21.93  17.772500         10000.0   
3771  2024-07-02     12.88     20.89  16.666667         10000.0   

      visibility_max  visibility_avg  dew_point_min  dew_point_max  \
0            10000.0     9685.111111           6.12           8.03   
1            10000.0    10000.000000           5.27    

In [27]:


# Create a DataFrame with hourly intervals for each day
start_date = pd.to_datetime(daily_stats['Timestamp'].min())
end_date = pd.to_datetime(daily_stats['Timestamp'].max()) + pd.Timedelta(days=1)
date_range = pd.date_range(start=start_date, end=end_date, freq='H')[:-1]

# Extract dates from the date range
dates = date_range.date
hours = date_range.hour

# Create a DataFrame with hourly intervals
hourly_intervals = pd.DataFrame({
    'Date': dates,
    'Hour': hours
})

# Merge the hourly intervals DataFrame with the daily_stats DataFrame
hourly_stats = pd.merge(hourly_intervals, daily_stats, how='left', left_on='Date', right_on='Timestamp')

# Forward fill the values to propagate daily values to each hourly interval
hourly_stats = hourly_stats.ffill()

# Drop the redundant 'Timestamp' column
hourly_stats = hourly_stats.drop(columns=['Timestamp'])

# Combine 'Date' and 'Hour' into a single 'Timestamp' column
hourly_stats['Timestamp'] = pd.to_datetime(hourly_stats['Date'].astype(str) + ' ' + hourly_stats['Hour'].astype(str) + ':00:00')

# Drop the 'Date' and 'Hour' columns
hourly_stats = hourly_stats.drop(columns=['Date', 'Hour'])

# Reorder columns to have 'Timestamp' as the first column
cols = hourly_stats.columns.tolist()
cols.insert(0, cols.pop(cols.index('Timestamp')))
hourly_stats = hourly_stats[cols]

# Display the resulting DataFrame
print(hourly_stats)


  date_range = pd.date_range(start=start_date, end=end_date, freq='H')[:-1]


                Timestamp  temp_min  temp_max   temp_avg  visibility_min  \
0     2014-03-06 00:00:00      7.38      9.79   8.595556          6437.0   
1     2014-03-06 01:00:00      7.38      9.79   8.595556          6437.0   
2     2014-03-06 02:00:00      7.38      9.79   8.595556          6437.0   
3     2014-03-06 03:00:00      7.38      9.79   8.595556          6437.0   
4     2014-03-06 04:00:00      7.38      9.79   8.595556          6437.0   
...                   ...       ...       ...        ...             ...   
90523 2024-07-02 19:00:00     12.88     20.89  16.666667         10000.0   
90524 2024-07-02 20:00:00     12.88     20.89  16.666667         10000.0   
90525 2024-07-02 21:00:00     12.88     20.89  16.666667         10000.0   
90526 2024-07-02 22:00:00     12.88     20.89  16.666667         10000.0   
90527 2024-07-02 23:00:00     12.88     20.89  16.666667         10000.0   

       visibility_max  visibility_avg  dew_point_min  dew_point_max  \
0             10

In [28]:
# ignoring one hot encoding since they have little/no effect on model
"""
# Combine 'weather_main' and 'weather_description' into a single feature
df['combined_weather'] = df['weather_main'] + "_" + df['weather_description']

# One-hot encoding for the combined feature
one_hot_encoded = pd.get_dummies(df['combined_weather'], prefix='weather')

# Concatenate the one-hot encoded feature with the original DataFrame
df_encoded = pd.concat([df, one_hot_encoded], axis=1)

# Drop the original 'weather_main' and 'weather_description' columns
df_encoded.drop(['weather_main', 'weather_description', 'combined_weather'], axis=1, inplace=True)

print(df_encoded)
"""

#df.drop(['weather_main', 'weather_description'], axis=1, inplace=True)

'\n# Combine \'weather_main\' and \'weather_description\' into a single feature\ndf[\'combined_weather\'] = df[\'weather_main\'] + "_" + df[\'weather_description\']\n\n# One-hot encoding for the combined feature\none_hot_encoded = pd.get_dummies(df[\'combined_weather\'], prefix=\'weather\')\n\n# Concatenate the one-hot encoded feature with the original DataFrame\ndf_encoded = pd.concat([df, one_hot_encoded], axis=1)\n\n# Drop the original \'weather_main\' and \'weather_description\' columns\ndf_encoded.drop([\'weather_main\', \'weather_description\', \'combined_weather\'], axis=1, inplace=True)\n\nprint(df_encoded)\n'

In [29]:
# hourly_stats

In [30]:
#hourly_stats.to_csv('./../sample-data/weather_data_10_years_preprocessed.csv', index=False)