### NOTE: this notebook needs pandas==2.2.3. After running it, switch back to pandas==1.3.5, which is required for torch_geometric.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import networkx as nx
import pandas as pd
import geodatasets

import geopandas as gpd
from shapely.geometry import Point

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric_temporal.nn.recurrent import A3TGCN2
from torch_geometric_temporal.signal import temporal_signal_split, StaticGraphTemporalSignal

from haversine import haversine
import pickle

# GPU support: use the GPU with index 3 when the machine actually exposes it,
# otherwise fall back to the CPU so the notebook still runs.
_GPU_INDEX = 3
if torch.cuda.is_available() and torch.cuda.device_count() > _GPU_INDEX:
    DEVICE = torch.device(f'cuda:{_GPU_INDEX}')
else:
    DEVICE = torch.device('cpu')

# Loader settings; not consumed by any cell visible in this part of the
# notebook -- presumably read by later training cells (TODO confirm).
shuffle = True
batch_size = 32

In [2]:
beijing_aqi = pd.read_csv('../../../AirData/AQI/Stations/merged_full.csv')

In [3]:
beijing_aqi.columns

Index(['No', 'year', 'month', 'day', 'hour', 'PM2.5', 'PM10', 'SO2', 'NO2',
       'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station',
       'locationLatitude', 'locationLongitude'],
      dtype='object')

In [4]:
from tqdm import tqdm, trange

class BeijingAirQualityDataset():
    """Turn a per-station hourly CSV into a static-graph temporal signal.

    The CSV must contain ``year``, ``month``, ``day``, ``hour``,
    ``locationLatitude`` and ``locationLongitude`` columns plus every column
    named in ``features``.  Rows are aggregated per station and per hour, and
    each distinct (latitude, longitude) pair becomes one graph node.

    Attributes set by ``__init__``:
        dataset: aggregated frame whose columns are latitude, longitude,
            ``time``, the feature columns and ``hour``.
        locations: one row per distinct (latitude, longitude) pair; the row
            order defines the node numbering.
    """

    _LAT = 'locationLatitude'
    _LON = 'locationLongitude'

    def __init__(self, path, features=None, lat_long_vals=None, t_range=None, time_zone=None,  interval='hour', eq_weights=True):
        """Load, filter and aggregate the CSV.

        Args:
            path: anything ``pandas.read_csv`` accepts.
            features: non-empty ``{column: aggregation}`` mapping passed to
                ``DataFrameGroupBy.agg``.
            lat_long_vals: optional frame with latitude/longitude columns;
                only stations present in it are kept (inner merge).
            t_range: optional ``(start, end)`` pair; both bounds are exclusive.
            time_zone: optional time zone the assembled timestamps are
                localized to.
            interval: stored on the instance; not otherwise used here.
            eq_weights: if true every edge gets weight 1, otherwise
                ``1 / (1 + haversine distance)``.

        Raises:
            ValueError: if ``features`` is missing or empty.
        """
        if not features:
            raise ValueError(
                "features must be a non-empty {column: aggregation} mapping, "
                "e.g. {'PM2.5': 'mean'}"
            )
        self.eq_weights = eq_weights
        self.interval = interval

        df = pd.read_csv(path)
        df['time'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
        df[self._LAT] = df[self._LAT].round(6)
        df[self._LON] = df[self._LON].round(6)
        df = df[['time', 'hour', self._LAT, self._LON] + list(features.keys())]

        if time_zone:
            # Timestamps assembled from year/month/day/hour are tz-naive, so
            # they have to be localized; tz_convert would raise TypeError.
            df = df.assign(time=df['time'].dt.tz_localize(time_zone))
        if t_range:
            df = df[(df['time'] > t_range[0]) & (df['time'] < t_range[1])]
        if lat_long_vals is not None:
            df = df.merge(lat_long_vals, on=[self._LAT, self._LON])

        fin_df = (
            df.groupby([self._LAT, self._LON, pd.Grouper(key='time', freq='h')])
            .agg(features)
            .reset_index()
        )
        # Derive the hour from the aggregated timestamps.  Copying the raw
        # frame's 'hour' column would align on index labels, which no longer
        # correspond to the sorted, aggregated rows.
        fin_df['hour'] = fin_df['time'].dt.hour
        self.locations = fin_df[[self._LAT, self._LON]].drop_duplicates()
        self.dataset = fin_df

    def _get_edge_weights(self):
        """Build a fully connected, symmetric edge list over the stations.

        Sets ``self.mapper`` (node index -> coordinates), ``self.rev_map``
        (coordinates -> node index), ``self.edges`` (shape ``(2, n_edges)``)
        and ``self.edge_weights``.  Station pairs at distance 0 get no edge.
        """
        coords = [tuple(self.locations.iloc[i]) for i in range(len(self.locations))]
        self.mapper = dict(enumerate(coords))
        self.rev_map = {loc: i for i, loc in enumerate(coords)}

        edges = []
        weights = []
        for i in range(len(coords)):
            for j in range(i + 1, len(coords)):
                dist = haversine(coords[i], coords[j])
                if dist == 0:
                    continue
                weight = 1 if self.eq_weights else 1 / (1 + dist)
                # Each undirected edge is stored in both directions.
                edges.extend([[i, j], [j, i]])
                weights.extend([weight, weight])

        # reshape keeps the (2, n_edges) layout even when there are no edges.
        self.edges = np.array(edges, dtype=np.int64).reshape(-1, 2).T
        # Stored as float so equal and distance-based weights share a dtype.
        self.edge_weights = np.array(weights, dtype=float)

    def _get_task(self, start_pred):
        """Split every calendar day into input hours and target hours.

        For each day found in ``self.dataset`` an array of shape
        ``(n_stations, n_features, 24)`` is filled; station/hour slots with no
        row stay 0.  Hours ``[0, start_pred)`` go to ``self.features`` and
        hours ``[start_pred, 24)`` to ``self.targets`` (one array per day, in
        order of first appearance in ``self.dataset``).

        Raises:
            ValueError: unless ``1 < start_pred < 24``.
        """
        if not 1 < start_pred < 24:
            raise ValueError(f'start_pred must satisfy 1 < start_pred < 24, got {start_pred}')

        fin_df = self.dataset
        key_cols = {self._LAT, self._LON, 'time', 'hour'}
        feature_cols = [col for col in fin_df.columns if col not in key_cols]

        node_of = {
            loc: idx
            for idx, loc in enumerate(self.locations.itertuples(index=False, name=None))
        }
        n_nodes = len(node_of)
        # -1 marks rows whose station is not listed in self.locations.
        node_ids = np.array(
            [node_of.get(loc, -1) for loc in zip(fin_df[self._LAT], fin_df[self._LON])],
            dtype=np.int64,
        )
        # factorize numbers the days in order of first appearance.
        day_ids, days = pd.factorize(fin_df['time'].dt.normalize())
        hours = fin_df['hour'].to_numpy(dtype=np.int64)
        values = fin_df[feature_cols].to_numpy(dtype='float64')

        # If several rows map to one (day, station, hour) slot, keep the first.
        known = np.flatnonzero(node_ids >= 0)
        slot = (day_ids[known] * n_nodes + node_ids[known]) * 24 + hours[known]
        _, first = np.unique(slot, return_index=True)
        rows = known[first]

        day_arr = np.zeros((len(days), n_nodes, len(feature_cols), 24))
        day_arr[day_ids[rows], node_ids[rows], :, hours[rows]] = values[rows]

        self.features = list(day_arr[:, :, :, :start_pred])
        self.targets = list(day_arr[:, :, :, start_pred:])

    def get_dataset(self, start_pred=12, set_dict=None):
        """Return a ``StaticGraphTemporalSignal`` for this dataset.

        Args:
            start_pred: first hour of the day that belongs to the targets;
                ignored when ``set_dict`` is given.
            set_dict: optional mapping with precomputed ``'edges'``,
                ``'edge_weights'``, ``'features'`` and ``'targets'``; when
                given, nothing is recomputed.
        """
        if set_dict:
            self.edges = set_dict['edges']
            self.edge_weights = set_dict['edge_weights']
            self.features = set_dict['features']
            self.targets = set_dict['targets']
        else:
            self._get_edge_weights()
            self._get_task(start_pred)

        dataset = StaticGraphTemporalSignal(
            self.edges, self.edge_weights, self.features, self.targets
        )

        return dataset

In [5]:
feat = '1,2,3,4'
feat.split(',')

['1', '2', '3', '4']

In [6]:
beijing_aqi


Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station,locationLatitude,locationLongitude
0,1,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin,39.982,116.397
1,2,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin,39.982,116.397
2,3,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin,39.982,116.397
3,4,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin,39.982,116.397
4,5,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin,39.982,116.397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420763,35060,2017,2,28,19,11.0,32.0,3.0,24.0,400.0,72.0,12.5,1013.5,-16.2,0.0,NW,2.4,Wanshouxigong,39.878,116.352
420764,35061,2017,2,28,20,13.0,32.0,3.0,41.0,500.0,50.0,11.6,1013.6,-15.1,0.0,WNW,0.9,Wanshouxigong,39.878,116.352
420765,35062,2017,2,28,21,14.0,28.0,4.0,38.0,500.0,54.0,10.8,1014.2,-13.3,0.0,NW,1.1,Wanshouxigong,39.878,116.352
420766,35063,2017,2,28,22,12.0,23.0,4.0,30.0,400.0,59.0,10.5,1014.4,-12.9,0.0,NNW,1.2,Wanshouxigong,39.878,116.352


In [7]:
# Columns to aggregate per station and hour, mapped to the aggregation that is
# applied to each of them.
features = {
    'PM2.5': 'mean',
    'PM10': 'mean',
    'WSPM': 'mean'
}

# eq_weights=False selects distance-based edge weights, 1 / (1 + haversine).
beijing_dataset = BeijingAirQualityDataset('../../../AirData/AQI/Stations/merged_full.csv',
                                        features=features, interval='hour', eq_weights=False)
# Reload the cached arrays written by the pickle.dump cell further down.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were produced locally by this notebook.
with open('beijingdata.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [9]:
b['targets']

[array([[[  3. ,   3. ,   3. ,  12. ,  11. ,  13. ,  10. ,  10. ,  11. ,
           10. ,  10. ,  15. ],
         [  6. ,   9. ,  14. ,  14. ,  17. ,  18. ,  18. ,  16. ,  12. ,
           12. ,  13. ,  15. ],
         [  1.6,   3.1,   2.1,   2.4,   3.3,   2. ,   2.2,   2.1,   1.9,
            1.1,   2. ,   1.8]],
 
        [[  9. ,  10. ,  10. ,   3. ,   8. ,  10. ,  11. ,  12. ,  12. ,
           12. ,  12. ,  15. ],
         [ 19. ,  34. ,  19. ,   6. ,  22. ,  28. ,  21. ,   4. ,  16. ,
           17. ,  16. ,  17. ],
         [  4.6,   4.5,   4. ,   2.1,   1.8,   3.5,   0.5,   1.1,   1.2,
            1. ,   1.8,   2.1]],
 
        [[  9. ,  10. ,  13. ,  13. ,  11. ,  10. ,  11. ,  11. ,  13. ,
           15. ,  16. ,  16. ],
         [ 10. ,  19. ,  20. ,  16. ,  22. ,  20. ,  20. ,  31. ,  25. ,
           23. ,  28. ,  28. ],
         [  3.2,   2.7,   2.1,   2.7,   1.8,   1. ,   1.6,   1.7,   1.9,
            0.7,   1. ,   1.1]],
 
        [[  7. ,  12. ,  14. ,  12. ,  13. ,  

In [8]:
b['edge_weights']

array([0.17311309, 0.17311309, 0.06296988, 0.06296988, 0.14752851,
       0.14752851, 0.08078282, 0.08078282, 0.07106714, 0.07106714,
       0.07584815, 0.07584815, 0.06979687, 0.06979687, 0.02573946,
       0.02573946, 0.02493801, 0.02493801, 0.02066771, 0.02066771,
       0.01777135, 0.01777135, 0.04931912, 0.04931912, 0.11741444,
       0.11741444, 0.12040039, 0.12040039, 0.10245296, 0.10245296,
       0.0854064 , 0.0854064 , 0.06175788, 0.06175788, 0.0284723 ,
       0.0284723 , 0.0245281 , 0.0245281 , 0.02046415, 0.02046415,
       0.01864995, 0.01864995, 0.06981807, 0.06981807, 0.0403898 ,
       0.0403898 , 0.0375271 , 0.0375271 , 0.04838236, 0.04838236,
       0.07717662, 0.07717662, 0.02101773, 0.02101773, 0.02863791,
       0.02863791, 0.02317867, 0.02317867, 0.01651926, 0.01651926,
       0.08741136, 0.08741136, 0.07502962, 0.07502962, 0.11504441,
       0.11504441, 0.11330906, 0.11330906, 0.02795969, 0.02795969,
       0.0291204 , 0.0291204 , 0.02346695, 0.02346695, 0.01933

In [None]:
'pickle' in 'beijingdata.pickle'

In [None]:
len(pd.unique(beijing_dataset.dataset['time'].dt.date))

In [None]:
beijing_loader = beijing_dataset.get_dataset(12, b)

In [None]:
len(list(beijing_loader))

In [None]:
import sys

# sys.getsizeof is shallow: for a list it counts only the list object itself,
# not the arrays it refers to, so the arrays' data buffers are added on top.
sys.getsizeof(beijing_dataset.targets) + sum(arr.nbytes for arr in beijing_dataset.targets)

In [None]:
import pickle

# Bundle the four arrays that get_dataset() accepts back through `set_dict`
# and write them to disk.
a = {
    name: getattr(beijing_dataset, name)
    for name in ('features', 'targets', 'edges', 'edge_weights')
}

with open('beijingdata.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Path of the pickled dataset cache.  Bound to a descriptive name rather than
# `str`, which would shadow the built-in for every cell run afterwards.
pickle_path = 'beijingdata.pickle'

In [None]:
with open('beijingdata.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [None]:
aqi_dataset_quality = pd.read_hdf('../../../AirData/AQI/full437.h5', 'pm25')
aqi_dataset_stations = pd.read_hdf('../../../AirData/AQI/full437.h5', 'stations')

In [None]:
aqi_dataset_stations

In [None]:
aqi_dataset_quality

In [None]:
plt.plot(aqi_dataset_quality[1001])

In [None]:
class AirQualityDataset():
    """Turn a timestamped per-station CSV into a static-graph temporal signal.

    The CSV must contain ``time``, ``locationLatitude`` and
    ``locationLongitude`` columns plus every column named in ``features``.
    Rows are aggregated per station and per hour, and each distinct
    (latitude, longitude) pair becomes one graph node.

    Attributes set by ``__init__``:
        dataset: aggregated frame whose columns are latitude, longitude,
            ``time``, the feature columns and ``hour``.
        locations: one row per distinct (latitude, longitude) pair; the row
            order defines the node numbering.
    """

    _LAT = 'locationLatitude'
    _LON = 'locationLongitude'

    def __init__(self, path, features=None, lat_long_vals=None, t_range=None, time_zone=None,  interval='hour', eq_weights=True):
        """Load, filter and aggregate the CSV.

        Args:
            path: anything ``pandas.read_csv`` accepts.
            features: non-empty ``{column: aggregation}`` mapping passed to
                ``DataFrameGroupBy.agg``.
            lat_long_vals: optional frame with latitude/longitude columns;
                only stations present in it are kept (inner merge).  ``None``
                or an empty frame disables the filter.
            t_range: optional ``(start, end)`` pair; both bounds are exclusive.
            time_zone: optional time zone the UTC timestamps are converted to.
            interval: stored on the instance; not otherwise used here.
            eq_weights: if true every edge gets weight 1, otherwise
                ``1 / (1 + haversine distance)``.

        Raises:
            ValueError: if ``features`` is missing or empty.
        """
        if not features:
            raise ValueError(
                "features must be a non-empty {column: aggregation} mapping"
            )
        self.eq_weights = eq_weights
        self.interval = interval

        df = pd.read_csv(path)
        df['time'] = pd.to_datetime(df['time'], utc=True)
        df[self._LAT] = df[self._LAT].round(6)
        df[self._LON] = df[self._LON].round(6)
        df = df[['time', self._LAT, self._LON] + list(features.keys())]

        if time_zone:
            df = df.assign(time=df['time'].dt.tz_convert(time_zone))
        if t_range:
            df = df[(df['time'] > t_range[0]) & (df['time'] < t_range[1])]
        # The default is None, so it must be checked before touching `.empty`.
        if lat_long_vals is not None and not lat_long_vals.empty:
            df = df.merge(lat_long_vals, on=[self._LAT, self._LON])

        fin_df = (
            df.groupby([self._LAT, self._LON, pd.Grouper(key='time', freq='h')])
            .agg(features)
            .reset_index()
        )
        fin_df['hour'] = fin_df['time'].dt.hour
        self.locations = fin_df[[self._LAT, self._LON]].drop_duplicates()
        self.dataset = fin_df

    def _get_edge_weights(self):
        """Build a fully connected, symmetric edge list over the stations.

        Sets ``self.mapper`` (node index -> coordinates), ``self.rev_map``
        (coordinates -> node index), ``self.edges`` (shape ``(2, n_edges)``)
        and ``self.edge_weights``.  Station pairs at distance 0 get no edge.
        """
        coords = [tuple(self.locations.iloc[i]) for i in range(len(self.locations))]
        self.mapper = dict(enumerate(coords))
        self.rev_map = {loc: i for i, loc in enumerate(coords)}

        edges = []
        weights = []
        for i in range(len(coords)):
            for j in range(i + 1, len(coords)):
                dist = haversine(coords[i], coords[j])
                if dist == 0:
                    continue
                weight = 1 if self.eq_weights else 1 / (1 + dist)
                # Each undirected edge is stored in both directions.
                edges.extend([[i, j], [j, i]])
                weights.extend([weight, weight])

        # reshape keeps the (2, n_edges) layout even when there are no edges.
        self.edges = np.array(edges, dtype=np.int64).reshape(-1, 2).T
        # Stored as float so equal and distance-based weights share a dtype.
        self.edge_weights = np.array(weights, dtype=float)

    def _get_task(self, start_pred):
        """Split every calendar day into input hours and target hours.

        For each day found in ``self.dataset`` an array of shape
        ``(n_stations, n_features, 24)`` is filled; station/hour slots with no
        row stay 0.  Hours ``[0, start_pred)`` go to ``self.features`` and
        hours ``[start_pred, 24)`` to ``self.targets`` (one array per day, in
        order of first appearance in ``self.dataset``).

        Raises:
            ValueError: unless ``1 < start_pred < 24``.
        """
        if not 1 < start_pred < 24:
            raise ValueError(f'start_pred must satisfy 1 < start_pred < 24, got {start_pred}')

        fin_df = self.dataset
        key_cols = {self._LAT, self._LON, 'time', 'hour'}
        feature_cols = [col for col in fin_df.columns if col not in key_cols]

        node_of = {
            loc: idx
            for idx, loc in enumerate(self.locations.itertuples(index=False, name=None))
        }
        n_nodes = len(node_of)
        # -1 marks rows whose station is not listed in self.locations.
        node_ids = np.array(
            [node_of.get(loc, -1) for loc in zip(fin_df[self._LAT], fin_df[self._LON])],
            dtype=np.int64,
        )
        # factorize numbers the days in order of first appearance.
        day_ids, days = pd.factorize(fin_df['time'].dt.normalize())
        hours = fin_df['hour'].to_numpy(dtype=np.int64)
        values = fin_df[feature_cols].to_numpy(dtype='float64')

        # Two hourly bins can share one local hour (e.g. when clocks are set
        # back); keep the first row per (day, station, hour) slot.
        known = np.flatnonzero(node_ids >= 0)
        slot = (day_ids[known] * n_nodes + node_ids[known]) * 24 + hours[known]
        _, first = np.unique(slot, return_index=True)
        rows = known[first]

        day_arr = np.zeros((len(days), n_nodes, len(feature_cols), 24))
        day_arr[day_ids[rows], node_ids[rows], :, hours[rows]] = values[rows]

        self.features = list(day_arr[:, :, :, :start_pred])
        self.targets = list(day_arr[:, :, :, start_pred:])

    def get_dataset(self, start_pred=12):
        """Compute edges and daily feature/target arrays and wrap them.

        Args:
            start_pred: first hour of the day that belongs to the targets.

        Returns:
            A ``StaticGraphTemporalSignal`` built from the computed arrays.
        """
        self._get_edge_weights()
        self._get_task(start_pred)
        dataset = StaticGraphTemporalSignal(
            self.edges, self.edge_weights, self.features, self.targets
        )

        return dataset

In [None]:
niwa_dataset = pd.read_csv('../../../AirData/Niwa/allNIWA_clarity.csv')
print(niwa_dataset.columns)
niwa_dataset

In [None]:
# Columns to aggregate per station and hour, mapped to the aggregation that is
# applied to each of them (every column uses the mean).
features = {
    'pm10ConcNumIndividual.value': 'mean',     # mean, like the other columns
    'pm1ConcNumIndividual.value': 'mean',
    'pm2_5ConcNumIndividual.value': 'mean',
    'relHumidInternalIndividual.value': 'mean'
}
# Station coordinates to keep, given to 6 decimals to match the rounding that
# is applied to the data before merging.
lat_long_vals = pd.DataFrame({
    'locationLatitude': [-36.844079, -36.844113, -36.711932, -36.898491, -36.906652, -36.876728],
    'locationLongitude': [174.762123, 174.761371, 174.740808, 174.591428, 174.633079, 174.703081]
})
# Time window; both bounds are used with strict comparisons.
t_range = ['2022-04-01', '2022-12-01']
# Time zone the UTC timestamps are converted to.
time_zone = 'Pacific/Auckland'

In [None]:
df = niwa_dataset
df['time'] = pd.to_datetime(df['time'], utc=True)
df['locationLatitude'] = df['locationLatitude'].round(6)
df['locationLongitude'] = df['locationLongitude'].round(6)

In [None]:
# Step-by-step version of AirQualityDataset.__init__'s filtering.
cols_to_keep = ['time', 'locationLatitude', 'locationLongitude'] + list(features.keys())

if features:
    df = df[cols_to_keep]
if time_zone:
    # assign() returns a new frame instead of writing into a slice.
    df = df.assign(time=df['time'].dt.tz_convert(time_zone))
if t_range:
    df = df[(df['time'] > t_range[0]) & (df['time'] < t_range[1])]
# A DataFrame has no single truth value (`if lat_long_vals:` raises
# ValueError), so test for None / emptiness explicitly.
if lat_long_vals is not None and not lat_long_vals.empty:
    df = df.merge(lat_long_vals, on=['locationLatitude', 'locationLongitude'])

In [None]:
fin_df = df.groupby([pd.Grouper(key='time', freq='h'), 'locationLatitude', 'locationLongitude']).agg(features).reset_index()

In [None]:
fin_df['hour'] = fin_df['time'].dt.hour

In [None]:
locations = fin_df[['locationLatitude', 'locationLongitude']].drop_duplicates()

In [None]:
fin_df

In [None]:
locations

In [None]:
from haversine import haversine

# Fully connected, symmetric graph over the stations in `locations`, weighted
# by 1 / (1 + haversine distance).
num_nodes = len(locations)
coords = [tuple(locations.iloc[i]) for i in range(num_nodes)]

mapper = dict(enumerate(coords))                     # node index -> (lat, lon)
rev_map = {loc: i for i, loc in enumerate(coords)}   # (lat, lon) -> node index

edges = []
weights = []

for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        dist = haversine(coords[i], coords[j])
        if dist != 0:
            attr = 1 / (1 + dist)
            # Each undirected edge is stored in both directions.
            edges.extend([[i, j], [j, i]])
            weights.extend([attr, attr])

# reshape keeps the (2, n_edges) layout even when there are no edges.
edges = np.array(edges, dtype=np.int64).reshape(-1, 2).T
weights = np.array(weights, dtype=float)

In [None]:
fin_df

In [None]:
for lat, long in locations.itertuples(index=False):
    print(lat, long)

In [None]:
fin_df

In [None]:
len(list(pd.unique(fin_df['time'].dt.date)))

In [None]:
len(fin_df.columns[3:-1])

In [None]:
np.unique(fin_df['time'].dt.date, return_counts=True)

In [None]:
# Work on an explicit copy of the first four columns so that adding 'date'
# does not write into a slice of fin_df (SettingWithCopyWarning).
temp_df = fin_df.iloc[:, [0, 1, 2, 3]].copy()
temp_df['date'] = temp_df['time'].dt.date

In [None]:
# Get the set of all unique (lat, long) pairs in the dataset
unique_lat_long_pairs = temp_df[['locationLatitude', 'locationLongitude']].drop_duplicates()

# Count distinct latitudes and distinct longitudes per day
daily_counts = temp_df.groupby('date')[['locationLatitude', 'locationLongitude']].nunique()

# Select the days on which every latitude AND every longitude occurs.  Each
# count is compared with the total for its own column (the previous version
# compared the latitude count with the longitude total twice).
# NOTE(review): despite its name, `days_to_exclude` holds the complete days,
# and `filtered_df` below keeps exactly those days -- confirm this is the
# intended direction of the filter.
days_to_exclude = daily_counts[
    (daily_counts['locationLatitude'] == unique_lat_long_pairs['locationLatitude'].nunique()) &
    (daily_counts['locationLongitude'] == unique_lat_long_pairs['locationLongitude'].nunique())
].index

# Keep only the rows that fall on one of the selected days
filtered_df = temp_df[temp_df['date'].isin(days_to_exclude)]

print(np.unique(filtered_df['date']))


In [None]:
unique_lat_long_pairs

In [None]:
# CODE FOR RESTRICTIVE FEATURE MATRIX
# Only days on which every station has at least one row are kept.
# NOTE: `features` rebinds the name of the aggregation dict defined earlier;
# re-run that config cell before re-running the groupby cell.

features = []
targets = []  # initialised for symmetry; this cell does not fill it

n_feature_cols = len(fin_df.columns[3:-1])

# features is per day
for day in pd.unique(fin_df['time'].dt.date):
    day_df = fin_df[fin_df['time'].dt.date == day]
    loc_list = []

    # numpy array per location per hour
    # array shape: len(locs) x features x 24
    for lat, long in locations.itertuples(index=False):
        loc_df = day_df[(day_df['locationLatitude'] == lat) & (day_df['locationLongitude'] == long)]
        if loc_df.empty:
            # One station is missing for this day: drop the whole day.
            break

        features_list = []
        for hr in range(24):
            vals = loc_df[loc_df['hour'] == hr]
            if vals.empty:
                features_list.append(np.zeros(n_feature_cols))
            else:
                features_list.append(np.array(vals.iloc[0, 3:-1]))
        loc_list.append(np.stack(features_list).T)
    else:
        # Reached only when no station was missing.
        if loc_list:
            print(day)
            # The rows come from a mixed-dtype frame, so the stacked array may
            # be object-typed; cast to float64 as the dataset classes do.
            features.append(np.stack(loc_list).astype('float64'))

In [None]:
# Per-day feature/target arrays; stations or hours without data are filled
# with zeros.  Hours [0, start_pred) are inputs, [start_pred, 24) targets.
# NOTE: `features` rebinds the name of the aggregation dict defined earlier.
start_pred = 12
features = []
targets = []

n_feature_cols = len(fin_df.columns[3:-1])

# features is per day
for day in pd.unique(fin_df['time'].dt.date):
    day_df = fin_df[fin_df['time'].dt.date == day]
    loc_list = []

    # numpy array per location per hour
    # array shape: len(locs) x features x 24
    for lat, long in locations.itertuples(index=False):
        loc_df = day_df[(day_df['locationLatitude'] == lat) & (day_df['locationLongitude'] == long)]
        if loc_df.empty:
            loc_list.append(np.zeros((n_feature_cols, 24)))
            continue

        features_list = []
        for hr in range(24):
            vals = loc_df[loc_df['hour'] == hr]
            if vals.empty:
                features_list.append(np.zeros(n_feature_cols))
            else:
                features_list.append(np.array(vals.iloc[0, 3:-1]))
        loc_list.append(np.stack(features_list).T)

    # The rows come from a mixed-dtype frame, so the stacked array may be
    # object-typed; cast to float64 as the dataset classes do.
    fin_arr = np.stack(loc_list).astype('float64')
    features.append(fin_arr[:, :, :start_pred])
    targets.append(fin_arr[:, :, start_pred:])

In [None]:
targets[0].shape

______________________________________

In [None]:
t_range = ['2022-01-01', '2022-12-01']
group_by = 'pm2_5ConcNumIndividual.value'

# Parse as UTC, convert to local time, then keep the open interval t_range.
niwa_dataset['time'] = pd.to_datetime(niwa_dataset['time'], utc=True)
niwa_dataset['time'] = niwa_dataset['time'].dt.tz_convert('Pacific/Auckland')
in_range = (niwa_dataset['time'] > t_range[0]) & (niwa_dataset['time'] < t_range[1])
# Copy so that later column assignments on niwa_dataset do not write into a
# slice of the unfiltered frame (SettingWithCopyWarning).
niwa_dataset = niwa_dataset[in_range].copy()

# NOTE: despite the name, each tuple is ordered (longitude, latitude).
lat_long = list(
    niwa_dataset[['locationLongitude', 'locationLatitude']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)

In [None]:
len(lat_long)

In [None]:
niwa_dataset['locationLatitude'] = niwa_dataset['locationLatitude'].round(6)
niwa_dataset['locationLongitude'] = niwa_dataset['locationLongitude'].round(6)

In [None]:
niwa_dataset

In [None]:
filter_coords = pd.DataFrame({
    'locationLatitude': [-36.844079, -36.844113, -36.711932, -36.898491, -36.906652, -36.876728],
    'locationLongitude': [174.762123, 174.761371, 174.740808, 174.591428, 174.633079, 174.703081]
})

pd.unique(niwa_dataset.merge(filter_coords, on=['locationLatitude', 'locationLongitude'])['locationLatitude'])

In [None]:
lat_long

In [None]:
geodatasets.data

In [None]:
# Create a GeoDataFrame; lat_long holds (longitude, latitude) tuples, which is
# the (x, y) order Point expects.
geometry = [Point(lon, lat) for lon, lat in lat_long]
gdf = gpd.GeoDataFrame(geometry=geometry)

# Load detailed map
gdf_world = gpd.read_file(geodatasets.get_path('naturalearth.land'))

# Plot the map
fig, ax = plt.subplots(figsize=(10, 5))
gdf_world.plot(ax=ax, color='lightgray')  # Plot world map
gdf.plot(ax=ax, color='red', markersize=1)  # Plot points with smaller size

# Zoom into Auckland.  A wider pair of limits (x 166..179, y -48..-34) used to
# be set first but was overridden immediately, so only the effective limits
# are kept.
ax.set_xlim([174, 175.6])
ax.set_ylim([-37.1, -36.5])

# Show the plot
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Map Plot with Coordinates")
plt.show()

In [None]:
niwa_dataset[-36.7 < niwa_dataset['locationLatitude']]

In [None]:
# Rows with -37.0 < latitude < -36.7.  Copied because later cells add columns
# to auckland_dataset, which would otherwise write into a slice of
# niwa_dataset (SettingWithCopyWarning).
auckland_dataset = niwa_dataset[(niwa_dataset['locationLatitude'] < -36.7) & (-37.0 < niwa_dataset['locationLatitude'])].copy()

In [None]:
auck_lat_long = list(auckland_dataset[['locationLatitude', 'locationLongitude']].drop_duplicates()\
                .itertuples(index=False, name=None))

In [None]:
grouped = auckland_dataset.groupby(['locationLatitude', 'locationLongitude']).size().reset_index(name='count')
grouped.sort_values(by=['count'])

In [None]:
min(auckland_dataset.groupby(['locationLatitude', 'locationLongitude']).size())

In [None]:
grouped2 = auckland_dataset.groupby(['locationLatitude', 'locationLongitude']).agg(
    start_date=('time', 'min'),
    end_date=('time', 'max')
).reset_index()

grouped2

In [None]:
auckland_dataset[auckland_dataset['locationLatitude'] == -36.711932]

In [None]:
import seaborn as sns

auckland_dataset['location'] = auckland_dataset.apply(lambda row: f"({row['locationLatitude']}, {row['locationLongitude']})", axis=1)

In [None]:
locations

In [None]:
for i in locations.itertuples(index=False):
    print(i)

In [None]:
# One subplot per station, showing the raw `group_by` column over time.
# Candidate columns: pm2_5ConcNumIndividual.value, temperatureInternalIndividual.value
locations = auckland_dataset[['locationLatitude', 'locationLongitude']].drop_duplicates()
print(locations)

n_panels = len(locations)
fig, axes = plt.subplots(n_panels, 1, figsize=(8, n_panels * 3), sharex=True)

# plt.subplots returns a bare Axes (not a sequence) for a single panel.
if n_panels == 1:
    axes = [axes]

for ax, (lat, lon) in zip(axes, locations.itertuples(index=False)):
    at_station = (
        (auckland_dataset['locationLatitude'] == lat)
        & (auckland_dataset['locationLongitude'] == lon)
    )
    subset = auckland_dataset[at_station]

    ax.plot(subset['time'], subset[group_by], marker='o', linestyle='-')
    ax.set_title(f'{group_by} at ({lat}, {lon})')
    ax.set_ylabel(f'{group_by}')
    ax.grid(True)

# Shared x-axis formatting, then render.
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
auckland_dataset.dtypes

In [None]:
auckland_dataset

In [None]:
# Store the day of the year, as the column name says (it was previously
# filled with the hour of the day); a later cell groups by this column.
auckland_dataset['day_of_year'] = auckland_dataset['time'].dt.day_of_year


In [None]:
niwa_dataset.head()

In [None]:
# Aggregation per column (every column uses the mean).
# NOTE(review): agg_funcs is not passed to .agg() below, which aggregates only
# the pm2.5 column -- confirm whether agg_funcs was meant to be used.
agg_funcs = {
    'pm10ConcNumIndividual.value': 'mean',     # mean, like the other columns
    'pm1ConcNumIndividual.value': 'mean',
    'pm2_5ConcNumIndividual.value': 'mean',
    'relHumidInternalIndividual.value': 'mean'
}

# Hourly mean pm2.5 per (time bin, latitude, longitude).
test = auckland_dataset.groupby([pd.Grouper(key='time', freq='h'), 'locationLatitude', 'locationLongitude']).agg({'pm2_5ConcNumIndividual.value': 'mean'}).reset_index()

In [None]:
max(test['day_of_year'])

In [None]:
for i in range(min(test['day_of_year']), max(test['day_of_year'])+1):
    pass

print(i)

In [None]:
min_len = min(test.groupby(['locationLatitude', 'locationLongitude']).size())
min_len

In [None]:
test['day_of_year'] = test['time'].dt.day_of_year

In [None]:
np.unique(test['day_of_year'], return_counts = True)

In [None]:
# Mean of `group_by` per station and 'day_of_year' value.
# NOTE(review): df_avg is computed but not used by the plot below, which draws
# the hourly frame `test` instead -- confirm which one was intended.
station_day_keys = ['locationLatitude', 'locationLongitude', 'day_of_year']
df_avg = auckland_dataset.groupby(station_day_keys).agg({group_by: 'mean'}).reset_index()

# `locations` is reused from the earlier cell rather than rebuilt from df_avg.
n_panels = len(locations)
fig, axes = plt.subplots(n_panels, 1, figsize=(8, n_panels * 3), sharex=True)

# plt.subplots returns a bare Axes (not a sequence) for a single panel.
if n_panels == 1:
    axes = [axes]

for ax, (lat, lon) in zip(axes, locations.itertuples(index=False)):
    at_station = (test['locationLatitude'] == lat) & (test['locationLongitude'] == lon)
    subset = test[at_station]
    ax.plot(subset['time'], subset[group_by], marker='o', linestyle='-')

    ax.set_title(f'{group_by} at ({lat}, {lon})')
    ax.set_ylabel(group_by)
    ax.grid(True)

# Shared x-axis formatting, then render.
# NOTE(review): the x values are full timestamps although the label says 'Hour'.
plt.xlabel('Hour')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()