### Importing necessary libraries

In [1]:
import os

import influxdb_client
import pandas as pd
from influxdb_client.client.write_api import SYNCHRONOUS
from sqlalchemy import create_engine

### Postgres connection (should be extracted from somewhere but since the repo is private, I don't think it matters)

In [2]:
# Connection settings are read from the environment so that no secret is
# stored in the notebook. The non-secret settings fall back to the values that
# were previously written here; the password deliberately has no fallback.
pg_user = os.environ.get('PG_USER', 'ruser')
pg_password = os.environ['PG_PASSWORD']  # KeyError if unset: export it before running
pg_hostname = os.environ.get('PG_HOSTNAME', 'primeds.protal.biz:31234')
pg_db_name = os.environ.get('PG_DB_NAME', 'anomaly_detection')

# pg_engine builds connections from the URL; pg_conn is the connection the
# query cells below read through.
pg_engine = create_engine(f'postgresql+psycopg2://{pg_user}:{pg_password}@{pg_hostname}/{pg_db_name}')
pg_conn = pg_engine.connect()

### Counting the observations per series in the dataset

In [4]:
# Join every observation to its series and relabel the series: rows whose
# parent_id is 5 become 'ecommerce', rows whose parent_id is 19 become
# 'retail', and all other rows keep their own series_name.
series_query = '''SELECT CASE
           WHEN d.parent_id = 5 THEN 'ecommerce'
           WHEN d.parent_id = 19 THEN 'retail'
           ELSE d.series_name
           END AS series_name, f.observation_time AS time, f.actual_value AS value
           FROM mvp.dimseries d INNER JOIN mvp.factobservation f ON f.seriesid = d.seriesid'''

# pd.read_sql executes the query itself, so no separate pg_conn.execute() call
# is made: that would run the whole join a second time and leave an unread
# result behind.
mapped_df = pd.read_sql(series_query, pg_conn)

# Number of observations under each (relabelled) series name.
mapped_df["series_name"].value_counts()

ecommerce       624385
g               142540
cpu4             17568
rver_res_eth     17568
serv_res_eth     17568
retail            1573
Name: series_name, dtype: int64

### Filtering on the series with the most observations ('ecommerce')

In [5]:
# Same series relabelling as the counting query, wrapped in a CTE so the
# result can be filtered down to the rows labelled 'ecommerce'.
query = '''WITH mapped_table AS (SELECT CASE
           WHEN d.parent_id = 5 THEN 'ecommerce'
           WHEN d.parent_id = 19 THEN 'retail'
           ELSE d.series_name
           END AS series_name, f.observation_time AS time, f.actual_value AS value
           FROM mvp.dimseries d INNER JOIN mvp.factobservation f ON f.seriesid = d.seriesid)
           SELECT * FROM mapped_table
           WHERE mapped_table.series_name = 'ecommerce' '''

# Reuse the pg_conn opened in the connection cell: rebinding it to a fresh
# pg_engine.connect() would leave the previous connection open and unreachable.
# pd.read_sql runs the query, so it is executed exactly once.
df = pd.read_sql(query, pg_conn)
df

Unnamed: 0,series_name,time,value
0,ecommerce,2019-10-01 00:00:00+00,71.0
1,ecommerce,2019-10-01 00:01:00+00,73.0
2,ecommerce,2019-10-01 00:02:00+00,73.0
3,ecommerce,2019-10-01 00:03:00+00,59.0
4,ecommerce,2019-10-01 00:04:00+00,44.0
...,...,...,...
624380,ecommerce,2019-10-31 22:26:00+00,0.0
624381,ecommerce,2019-10-31 22:27:00+00,0.0
624382,ecommerce,2019-10-31 22:28:00+00,0.0
624383,ecommerce,2019-10-31 22:29:00+00,0.0


### InfluxDB connection

In [6]:
# Target bucket and organisation for the writes below.
bucket = "nikola_test2"
org = "primary"
# The API token is a secret, so it is read from the environment instead of
# being stored in the notebook. KeyError if unset: export INFLUXDB_TOKEN first.
token = os.environ["INFLUXDB_TOKEN"]
url = "http://primeds.protal.biz:30254/"

client = influxdb_client.InfluxDBClient(
    url=url,
    token=token,
    org=org
)

# SYNCHRONOUS write options are passed so the write cell below can use
# write_api.write(...) directly.
write_api = client.write_api(write_options=SYNCHRONOUS)

### Loading the DataFrame one row at a time allows for additional per-point manipulation but slows down the process

In [7]:
# Each point is still built from a single row (so it can be adjusted before it
# is sent), but the points are written in batches: one request per row is what
# makes the load take hours.
BATCH_SIZE = 5_000  # points sent per write request

try:
    for start in range(0, len(df), BATCH_SIZE):
        chunk = df.iloc[start:start + BATCH_SIZE]
        # Same point layout as before: measurement "factobservation", a fixed
        # "observation" tag, a float "value" field and the row's timestamp.
        points = [
            influxdb_client.Point("factobservation")
            .tag("observation", "my_obs")
            .field("value", float(value))
            .time(timestamp)
            for timestamp, value in zip(chunk["time"], chunk["value"])
        ]
        write_api.write(bucket=bucket, org=org, record=points)
finally:
    # Close the client even when a write fails part-way through.
    client.close()

NOTE: Writing one point per request, loading this DataFrame takes about 4 hours to complete: I loaded 1/6 of the data into Influx that way and it took 44 minutes. You can check it out in my bucket; it looks nice :D.