Prepare time series


In [None]:
# install third party libraries

!pip install -q hvplot

In [None]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas

In [None]:
# Build the Google Drive locations used throughout the notebook.

mount_folder = '/content/drive'
drive_folder = '/MyDrive/Data/flow-co2/data/'
# drive_folder ends with a slash, so file names can be appended directly.
data_folder = f"{mount_folder}{drive_folder}"
print(data_folder)

import_csv = 'values.csv'
import_path = f"{data_folder}{import_csv}"
print(import_path)

In [None]:
# Attach Google Drive to the Colab runtime at the configured mount point.

from google.colab import drive
drive.mount(mountpoint=mount_folder)

In [None]:
# Load the raw time series from the CSV file into a DataFrame.

df = pd.read_csv(filepath_or_buffer=import_path)
df

In [None]:
# Coerce the "value" column to numbers; entries that cannot be parsed
# become NaN (errors="coerce").

df = df.assign(value=pd.to_numeric(df["value"], errors="coerce"))
df

In [None]:
# convert timestamp to timezone-aware datetime
#
# utc=True normalises every parsed value to UTC, so the column always ends up
# with a single tz-aware datetime dtype: rows carrying different UTC offsets
# (e.g. either side of a DST change) no longer break the `.dt` accessor, and
# the later tz_convert() to local time cannot fail on tz-naive input.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601', utc=True)
df

In [None]:
# remove entries with null values
#
# A row is dropped when either its timestamp or its value is missing.
# The explicit copy() makes the filtered frame independent of the frame it
# was selected from, so the column assignments performed on it afterwards
# do not raise SettingWithCopyWarning.

df = df.dropna(subset=["timestamp", "value"]).copy()
df

In [None]:
# Add the timestamp expressed in the America/Toronto time zone as an
# extra column next to the original one.

toronto_time = df["timestamp"].dt.tz_convert(tz='America/Toronto')
df["timestamp_local"] = toronto_time
df


In [None]:
# Truncate every timestamp to the whole minute and promote it to the index
# (the series is assumed to have a frequency of 1 minute).

df = (
    df.assign(timestamp=df["timestamp"].dt.floor("min"))
      .set_index(keys="timestamp")
)
df

In [None]:
# Drop rows whose index value repeats, keeping the first occurrence.

dupl = df.index.duplicated(keep='first')
count = dupl.sum()  # True counts as 1, so this is the number of duplicates
print(count)

df = df.loc[~dupl]
df

In [None]:
# ensuring the index has the frequency of 1 minute, pad if necessary

# Forward-filling during asfreq() requires a monotonic index, so put the rows
# in chronological order first (a no-op when the input is already sorted).
df = df.sort_index()
df_non_padded = df.copy()
df = df.asfreq("min", method='pad')  # one row per minute; gaps repeat the previous row

# Rows that exist only because of the padding step.
is_padded = ~df.index.isin(df_non_padded.index)

# Padding copies every column of the previous row, including timestamp_local,
# which would leave the inserted rows labelled with the local time of an
# earlier reading. Recompute it for those rows from their own index value.
df.loc[is_padded, "timestamp_local"] = df.index[is_padded].tz_convert('America/Toronto')

df_padded = df[is_padded]
df

In [None]:
# Persist the prepared frame as a pickle file for further processing.
pickle_path = f"{data_folder}co2_ts.pkl"
df.to_pickle(path=pickle_path)

In [None]:
# show the path the pickled time series was written to
print(pickle_path)