<a href="https://colab.research.google.com/github/olga-terekhova/indoor-co2-forecast/blob/main/notebooks/PrepareTimeSeries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install third party libraries

!pip install -q hvplot

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/175.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.5/175.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas

In [3]:
# set up data paths

drive_folder = '/MyDrive/Data/flow-co2/data/'
mount_folder = '/content/drive'
data_folder = mount_folder + drive_folder
print(data_folder)

import_csv = 'values.csv'
import_path = data_folder + import_csv
print(import_path)

/content/drive/MyDrive/Data/flow-co2/data/
/content/drive/MyDrive/Data/flow-co2/data/values.csv


In [4]:
# mount data source

from google.colab import drive
drive.mount(mount_folder)

Mounted at /content/drive


In [5]:
# read the file with the timeseries

df = pd.read_csv(import_path)
df

Unnamed: 0,value,timestamp
0,537,2025-11-12 21:34:07.273367+00
1,538,2025-11-12 21:33:07.244591+00
2,534,2025-11-12 21:32:07.25786+00
3,535,2025-11-12 21:31:07.253168+00
4,536,2025-11-12 21:29:07.20434+00
...,...,...
49115,576,2025-09-30 03:09:07.955874+00
49116,577,2025-09-30 03:08:08.063467+00
49117,579,2025-09-30 03:07:07.956761+00
49118,578,2025-09-30 03:06:08.091092+00


In [6]:
# convert value to numeric

df["value"] = pd.to_numeric(df["value"], errors="coerce")
df

Unnamed: 0,value,timestamp
0,537.0,2025-11-12 21:34:07.273367+00
1,538.0,2025-11-12 21:33:07.244591+00
2,534.0,2025-11-12 21:32:07.25786+00
3,535.0,2025-11-12 21:31:07.253168+00
4,536.0,2025-11-12 21:29:07.20434+00
...,...,...
49115,576.0,2025-09-30 03:09:07.955874+00
49116,577.0,2025-09-30 03:08:08.063467+00
49117,579.0,2025-09-30 03:07:07.956761+00
49118,578.0,2025-09-30 03:06:08.091092+00


In [7]:
# convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601')
df

Unnamed: 0,value,timestamp
0,537.0,2025-11-12 21:34:07.273367+00:00
1,538.0,2025-11-12 21:33:07.244591+00:00
2,534.0,2025-11-12 21:32:07.257860+00:00
3,535.0,2025-11-12 21:31:07.253168+00:00
4,536.0,2025-11-12 21:29:07.204340+00:00
...,...,...
49115,576.0,2025-09-30 03:09:07.955874+00:00
49116,577.0,2025-09-30 03:08:08.063467+00:00
49117,579.0,2025-09-30 03:07:07.956761+00:00
49118,578.0,2025-09-30 03:06:08.091092+00:00


In [8]:
# remove entries with null values

df = df[df["timestamp"].notnull() & df["value"].notnull()]
df

Unnamed: 0,value,timestamp
0,537.0,2025-11-12 21:34:07.273367+00:00
1,538.0,2025-11-12 21:33:07.244591+00:00
2,534.0,2025-11-12 21:32:07.257860+00:00
3,535.0,2025-11-12 21:31:07.253168+00:00
4,536.0,2025-11-12 21:29:07.204340+00:00
...,...,...
49115,576.0,2025-09-30 03:09:07.955874+00:00
49116,577.0,2025-09-30 03:08:08.063467+00:00
49117,579.0,2025-09-30 03:07:07.956761+00:00
49118,578.0,2025-09-30 03:06:08.091092+00:00


In [9]:
# calculate local time as additional column

df["timestamp_local"] = df["timestamp"].dt.tz_convert('America/Toronto')
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["timestamp_local"] = df["timestamp"].dt.tz_convert('America/Toronto')


Unnamed: 0,value,timestamp,timestamp_local
0,537.0,2025-11-12 21:34:07.273367+00:00,2025-11-12 16:34:07.273367-05:00
1,538.0,2025-11-12 21:33:07.244591+00:00,2025-11-12 16:33:07.244591-05:00
2,534.0,2025-11-12 21:32:07.257860+00:00,2025-11-12 16:32:07.257860-05:00
3,535.0,2025-11-12 21:31:07.253168+00:00,2025-11-12 16:31:07.253168-05:00
4,536.0,2025-11-12 21:29:07.204340+00:00,2025-11-12 16:29:07.204340-05:00
...,...,...,...
49115,576.0,2025-09-30 03:09:07.955874+00:00,2025-09-29 23:09:07.955874-04:00
49116,577.0,2025-09-30 03:08:08.063467+00:00,2025-09-29 23:08:08.063467-04:00
49117,579.0,2025-09-30 03:07:07.956761+00:00,2025-09-29 23:07:07.956761-04:00
49118,578.0,2025-09-30 03:06:08.091092+00:00,2025-09-29 23:06:08.091092-04:00


In [10]:
# move timestamp into an index assumed to have frequency of 1 minute

df["timestamp"] = df["timestamp"].dt.floor(freq="min")
df = df.set_index("timestamp")
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["timestamp"] = df["timestamp"].dt.floor(freq="min")


Unnamed: 0_level_0,value,timestamp_local
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-11-12 21:34:00+00:00,537.0,2025-11-12 16:34:07.273367-05:00
2025-11-12 21:33:00+00:00,538.0,2025-11-12 16:33:07.244591-05:00
2025-11-12 21:32:00+00:00,534.0,2025-11-12 16:32:07.257860-05:00
2025-11-12 21:31:00+00:00,535.0,2025-11-12 16:31:07.253168-05:00
2025-11-12 21:29:00+00:00,536.0,2025-11-12 16:29:07.204340-05:00
...,...,...
2025-09-30 03:09:00+00:00,576.0,2025-09-29 23:09:07.955874-04:00
2025-09-30 03:08:00+00:00,577.0,2025-09-29 23:08:08.063467-04:00
2025-09-30 03:07:00+00:00,579.0,2025-09-29 23:07:07.956761-04:00
2025-09-30 03:06:00+00:00,578.0,2025-09-29 23:06:08.091092-04:00


In [11]:
# remove duplicates

dupl = df.index.duplicated(keep='first')
count = (dupl == True).sum()
print(count)

df = df[~dupl]
df

143


Unnamed: 0_level_0,value,timestamp_local
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-11-12 21:34:00+00:00,537.0,2025-11-12 16:34:07.273367-05:00
2025-11-12 21:33:00+00:00,538.0,2025-11-12 16:33:07.244591-05:00
2025-11-12 21:32:00+00:00,534.0,2025-11-12 16:32:07.257860-05:00
2025-11-12 21:31:00+00:00,535.0,2025-11-12 16:31:07.253168-05:00
2025-11-12 21:29:00+00:00,536.0,2025-11-12 16:29:07.204340-05:00
...,...,...
2025-09-30 03:09:00+00:00,576.0,2025-09-29 23:09:07.955874-04:00
2025-09-30 03:08:00+00:00,577.0,2025-09-29 23:08:08.063467-04:00
2025-09-30 03:07:00+00:00,579.0,2025-09-29 23:07:07.956761-04:00
2025-09-30 03:06:00+00:00,578.0,2025-09-29 23:06:08.091092-04:00


In [12]:
# ensuring the index has the frequency of 1 minute, pad if necessary

df_non_padded = df.copy()
df = df.asfreq("min", method='pad')  # converts to specified frequency, pads missing values
df_padded = df[~df.index.isin(df_non_padded.index)]
df

Unnamed: 0_level_0,value,timestamp_local
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-09-30 03:05:00+00:00,582.0,2025-09-29 23:05:08.056180-04:00
2025-09-30 03:06:00+00:00,578.0,2025-09-29 23:06:08.091092-04:00
2025-09-30 03:07:00+00:00,579.0,2025-09-29 23:07:07.956761-04:00
2025-09-30 03:08:00+00:00,577.0,2025-09-29 23:08:08.063467-04:00
2025-09-30 03:09:00+00:00,576.0,2025-09-29 23:09:07.955874-04:00
...,...,...
2025-11-12 21:30:00+00:00,535.0,2025-11-12 16:31:07.253168-05:00
2025-11-12 21:31:00+00:00,535.0,2025-11-12 16:31:07.253168-05:00
2025-11-12 21:32:00+00:00,534.0,2025-11-12 16:32:07.257860-05:00
2025-11-12 21:33:00+00:00,538.0,2025-11-12 16:33:07.244591-05:00


In [13]:
# serialize data for further processing
pickle_path = data_folder + 'co2_ts.pkl'
df.to_pickle(pickle_path)

In [14]:
print(pickle_path)

/content/drive/MyDrive/Data/flow-co2/data/co2_ts.pkl
