In [2]:
import pandas as pd

# Define the condition for filtering (stations you're interested in)
column_name = 'station_number'  # Replace with your column name
# value_conditions = [66062, 66037, 67105]  # List of station numbers to include (Observatory Hill, Sydney Airport, and Richmond)
value_conditions = [66037]  # List of station numbers to include (Sydney Airport)

# Define the columns to read (make sure to include 'station_number' in col_to_include)
col_to_include = ['station_number', 'datetime', 'air_temperature_in_degrees_c', 'wind_speed_in_km_h', 'relative_humidity_in_percentage']

# Specify the chunk size (e.g., 10000 rows per chunk)
chunk_size = 10000

# Initialize an empty list to store filtered chunks
filtered_chunks = []

# Iterate over the file in chunks
for chunk in pd.read_csv(r'c:\Users\z5404477\OneDrive - UNSW\04_Workspace\2. WIP\data\1. raw\BOM Weather Data\bom_data_nsw55167608.csv', 
                         usecols=col_to_include, chunksize=chunk_size):
    # Filter rows where 'station_number' is in the list of value_conditions
    filtered_chunk = chunk[chunk[column_name].isin(value_conditions)]
    filtered_chunks.append(filtered_chunk)

# Concatenate the filtered chunks into a single DataFrame
filtered_df = pd.concat(filtered_chunks)

# Display the first 5 filtered rows
print(filtered_df.head(5))


         station_number             datetime  air_temperature_in_degrees_c  \
1747282           66037  2022-12-01 00:00:00                          18.4   
1747283           66037  2022-12-01 00:30:00                          18.2   
1747284           66037  2022-12-01 01:00:00                          18.3   
1747285           66037  2022-12-01 01:30:00                          18.2   
1747286           66037  2022-12-01 02:00:00                          18.1   

         relative_humidity_in_percentage  wind_speed_in_km_h  
1747282                             63.0                13.0  
1747283                             65.0                13.0  
1747284                             66.0                13.0  
1747285                             68.0                13.0  
1747286                             69.0                13.0  


In [3]:
filtered_df['datetime'] = pd.to_datetime(filtered_df['datetime'], format='%Y-%m-%d %H:%M:%S')
#filter the datetime to include only from 1 July 2021 00:00:00 am to 30 June 2024 23:30:00, which is AEDP data
start_date = '2021-07-01 00:00:00'
end_date = '2024-06-30 23:30:00'
filtered_df = filtered_df[(filtered_df['datetime'] >= start_date) & (filtered_df['datetime'] <= end_date)]

filtered_df_2 = filtered_df[filtered_df['station_number'] == 66037] #use Sydney Airport station only

# make datetime the index
filtered_df_2.set_index('datetime', inplace=True)
#sort the index
filtered_df_2.sort_index(inplace=True)





In [4]:
expected_date_range = pd.date_range(start=start_date, end=end_date, freq='30min')

# make weather data df with the expected date range
weather_data_df = pd.DataFrame(index=expected_date_range)

# set  index name to 'datetime'
weather_data_df.index.name = 'datetime'

# merge the weather data with the filtered data, keeping both datetime from the weather data and the filtered data
weather_data_df = weather_data_df.merge(filtered_df_2, left_index=True, right_index=True, how='outer')

In [5]:
weather_data_df

Unnamed: 0_level_0,station_number,air_temperature_in_degrees_c,relative_humidity_in_percentage,wind_speed_in_km_h
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-01 00:00:00,66037.0,13.7,96.0,13.0
2021-07-01 00:30:00,66037.0,13.2,96.0,9.4
2021-07-01 00:53:00,66037.0,12.8,96.0,13.0
2021-07-01 01:00:00,66037.0,13.3,97.0,11.2
2021-07-01 01:05:00,66037.0,13.3,97.0,11.2
...,...,...,...,...
2024-06-30 21:30:00,66037.0,9.7,85.0,22.3
2024-06-30 22:00:00,66037.0,9.7,82.0,20.5
2024-06-30 22:30:00,66037.0,9.5,82.0,16.6
2024-06-30 23:00:00,66037.0,9.0,82.0,16.6


In [6]:
# fill NaN values the nearest previous value
weather_data_df.fillna(method='ffill', inplace=True)

# include only the row with expected date range
weather_data_df = weather_data_df.loc[expected_date_range]

#delete station_number column
weather_data_df.drop(columns=['station_number'], inplace=True)

  weather_data_df.fillna(method='ffill', inplace=True)


In [11]:
#delete rows with duplicate datetime index
weather_data_df = weather_data_df[~weather_data_df.index.duplicated(keep='first')]

In [12]:
weather_data_df

Unnamed: 0_level_0,air_temperature_in_degrees_c,relative_humidity_in_percentage,wind_speed_in_km_h
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-07-01 00:00:00,13.7,96.0,13.0
2021-07-01 00:30:00,13.2,96.0,9.4
2021-07-01 01:00:00,13.3,97.0,11.2
2021-07-01 01:30:00,13.2,97.0,7.6
2021-07-01 02:00:00,12.9,98.0,7.6
...,...,...,...
2024-06-30 21:30:00,9.7,85.0,22.3
2024-06-30 22:00:00,9.7,82.0,20.5
2024-06-30 22:30:00,9.5,82.0,16.6
2024-06-30 23:00:00,9.0,82.0,16.6


In [13]:
weather_data_df.to_csv(r'c:\Users\z5404477\OneDrive - UNSW\04_Workspace\2. WIP\data\2. processed\aedp_weather_data.csv')


In [8]:
ds3 = pd.read_csv('../../data/ds3_aedp_30min.csv')

In [10]:
ds3


Unnamed: 0,datetime,netload_kW
0,2021-07-01 00:00:00,116.455626
1,2021-07-01 00:30:00,88.273554
2,2021-07-01 01:00:00,65.625582
3,2021-07-01 01:30:00,37.142219
4,2021-07-01 02:00:00,22.757476
...,...,...
52603,2024-06-30 21:30:00,218.549158
52604,2024-06-30 22:00:00,205.936298
52605,2024-06-30 22:30:00,170.634693
52606,2024-06-30 23:00:00,147.510773


In [15]:
# make datetime column the index on ds1 dataframe
ds3.index = ds3['datetime']

ds3.drop(columns=['datetime'], inplace=True)

#convert the index to datetime
ds3.index = pd.to_datetime(ds3.index, format='%Y-%m-%d %H:%M:%S')

#merge the ds3 dataframe with the weather_data_df dataframe
ds5 = ds3.merge(weather_data_df, left_index=True, right_index=True, how='outer')



KeyError: 'datetime'

In [16]:
ds5.to_csv('../../data/ds5_aedp_30min_with_weather.csv')