In [2]:
import pytimber
# Shared logging-database handle; get_result() issues all of its queries through it.
ldb = pytimber.LoggingDB()

Specify the output file and the times you want to download. Timber and pyTimber disagree about the times, probably because of winter and summer time (daylight saving). If you want the data that Timber stores from 00:00 to 01:00, you might have to request either 01:00 to 02:00 or even 02:00 to 03:00. We have to account for this shift later on.

In [3]:
# %load ../ionsrcopt/source_features.py
class SourceFeatures:
    """Central list of the column / variable names used by this notebook.

    ``TIMESTAMP`` is the label of the time column (and index) of the stored
    tables. Every other attribute is a variable-name string that can be put
    into the request lists handed to the logging database.
    """

    # Label of the time column in the CSV files.
    TIMESTAMP = "UTC_TIME"

    # Variable names, one constant per logged signal.
    BIASDISCAQNV = "IP.NSRCGEN:BIASDISCAQNV"
    GASAQN = "IP.NSRCGEN:GASAQN"
    GASSASAQN = "IP.NSRCGEN:GASSASAQN"
    SOLINJ_CURRENT = "IP.SOLINJ.ACQUISITION:CURRENT"
    SOLCEN_CURRENT = "IP.SOLCEN.ACQUISITION:CURRENT"
    SOLEXT_CURRENT = "IP.SOLEXT.ACQUISITION:CURRENT"
    OVEN1AQNP = "IP.NSRCGEN:OVEN1AQNP"
    OVEN2AQNP = "IP.NSRCGEN:OVEN2AQNP"
    SOURCEHTAQNI = "IP.NSRCGEN:SOURCEHTAQNI"
    SOURCEHTAQNV = "IP.NSRCGEN:SOURCEHTAQNV"
    SAIREM2_FORWARDPOWER = "IP.SAIREM2:FORWARDPOWER"
    THOMSON_FORWARDPOWER = "IP.NSRCGEN:RFTHOMSONAQNFWD"
    SPARK_COUNTER = "IP.NSRCGEN:SPARKS"
    BCT05_CURRENT = "ITL.BCT05:CURRENT"
    BCT25_CURRENT = "ITF.BCT25:CURRENT"
    BCT41_CURRENT = "ITH.BCT41:CURRENT"

Now select all parameters you are interested in.

In [4]:
def get_result(parameters_raw, parameters_scaled, t1, t2):
    """Query the logging database for the interval ``t1`` .. ``t2``.

    Parameters
    ----------
    parameters_raw
        Variable names passed in one call to ``ldb.get``; the call is
        skipped entirely when this is empty.
    parameters_scaled : dict
        Maps a variable name to a dict with the keys ``'scale'``,
        ``'interval'`` and ``'size'``, forwarded to ``ldb.getScaled`` as
        ``scaleAlgorithm``, ``scaleInterval`` and ``scaleSize``.
    t1, t2
        Interval bounds, handed to the database unchanged.

    Returns
    -------
    dict
        Everything the database returned, keyed as the database keyed it.
        Scaled results are merged in last, so they win on a key clash.
    """
    print("Loading Data in interval {} to {}".format(t1, t2))

    fetched = ldb.get(parameters_raw, t1, t2, unixtime=True) if parameters_raw else {}

    for variable, scaling in parameters_scaled.items():
        fetched.update(
            ldb.getScaled(
                variable,
                t1,
                t2,
                scaleAlgorithm=scaling['scale'],
                scaleInterval=scaling['interval'],
                scaleSize=scaling['size'],
                unixtime=True,
            )
        )

    return fetched

In [5]:
import pandas as pd
from os import path

def load_existing_data(filename, replace_column):
    """Read a previously saved CSV, or start an empty table if none exists.

    ``replace_column`` does not change what is loaded; it only selects which
    informational message is printed for an existing file.

    Returns a ``pandas.DataFrame``: the parsed CSV when ``filename`` exists,
    otherwise an empty frame whose only column is the timestamp column.
    """
    if path.exists(filename):
        print("Loading data from {}.".format(filename))
        mode_message = (
            "We will replace columns that already exist"
            if replace_column
            else "We will only append new columns"
        )
        print(mode_message)
        return pd.read_csv(filename)

    print("The file {} does not yet exist, we will create a new one".format(filename))
    return pd.DataFrame(columns=[SourceFeatures.TIMESTAMP])

def create_base_df(filename, replace_file, replace_column):
    """Build the table that newly downloaded columns get joined onto.

    With ``replace_file`` set, the table starts out empty; otherwise it is
    whatever ``load_existing_data(filename, replace_column)`` returns. In both
    cases the timestamp column becomes the index, parsed as datetimes and
    localized to UTC.
    """
    base = (
        pd.DataFrame(columns=[SourceFeatures.TIMESTAMP])
        if replace_file
        else load_existing_data(filename, replace_column)
    )

    base = base.set_index(SourceFeatures.TIMESTAMP)
    # The stored timestamps carry no offset, so they are declared to be UTC here.
    base.index = pd.to_datetime(base.index).tz_localize('UTC')
    return base

In [6]:
def check_duplicate_times(time_series):
    """Print a warning if any value of ``time_series`` occurs more than once.

    The series is only inspected, never modified; nothing is returned.
    """
    if time_series.duplicated().any():
        print("Time duplicates exist!")

def join_result(df, result, replace_column):
    """Merge freshly downloaded columns into ``df`` and format the time index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table collected so far, indexed by UTC timestamps (a datetime index,
        since the index is formatted with ``strftime`` at the end).
    result : dict
        Maps a column name to a pair: element 0 holds unix timestamps in
        seconds, element 1 the values recorded at those times.
    replace_column : bool
        Decides what happens to a column that ``df`` already contains:
        ``True`` drops the stored column and uses the new data, ``False``
        keeps the stored column and ignores the new data.

    Returns
    -------
    pandas.DataFrame
        Outer join of ``df`` with all accepted new columns. Columns are sorted
        by name and the index holds strings ``YYYY-MM-DD HH:MM:SS.mmm`` under
        the name ``SourceFeatures.TIMESTAMP``. Duplicated timestamps are
        reported but not removed here.
    """
    # Imported locally so this cell does not rely on imports made in another cell.
    from datetime import datetime, timezone

    print("Joining together result")
    for parameter, values in result.items():
        print("For column {} {} datapoints exist.".format(parameter, len(values[1])))

        if parameter in df.columns:
            print("Parameter {} is already in the data frame. There it has {} values. In the newly retrieved dataset it has {} values.".format(parameter, df[parameter].count(), len(values[1])))
            if not replace_column:
                print("Skipping.")
                continue

            print("Removing old column.")
            df = df.drop(parameter, axis=1)
            # Rows that only existed because of the removed column are now empty.
            df = df.dropna(axis=0, how='all')

        # to_datetime(..., utc=True) returns a tz-aware DatetimeIndex even when
        # no samples came back; an untyped empty index could otherwise turn the
        # joined index into a plain object index and break strftime below.
        timestamps = pd.to_datetime(
            [datetime.fromtimestamp(timestamp, tz=timezone.utc) for timestamp in values[0]],
            utc=True,
        ).rename(SourceFeatures.TIMESTAMP)
        check_duplicate_times(pd.Series(timestamps))

        df_column = pd.DataFrame({parameter: values[1]}, index=timestamps)
        df_column = df_column.dropna()

        df = df.join(df_column, how='outer')

    df = df.reindex(sorted(df.columns), axis=1)
    # %f yields microseconds; cutting three digits leaves millisecond precision.
    df.index = df.index.strftime('%Y-%m-%d %H:%M:%S.%f').str[:-3]
    df.index.name = SourceFeatures.TIMESTAMP
    return df

If the time index is duplicated, we will only keep the first occurrence.

And save the output to the file.

In [7]:
def save_df(df, filename):
    """Announce the target path, then write ``df`` to it as CSV."""
    announcement = "Saving result to {}".format(filename)
    print(announcement)
    df.to_csv(filename)

In [8]:
def get_data(filename, t1, t2, parameters_raw, parameters_scaled, replace_file, replace_column):
    """Download one interval and store it, merged with existing data, in ``filename``.

    Steps, in order: fetch the requested parameters for ``t1`` .. ``t2``,
    build the base table (empty or loaded from ``filename``), join the new
    columns onto it, keep only the first row for every repeated index value,
    and write the result back to ``filename``.
    """
    downloaded = get_result(parameters_raw, parameters_scaled, t1, t2)
    base = create_base_df(filename, replace_file, replace_column)
    merged = join_result(base, downloaded, replace_column)

    is_repeat = merged.index.duplicated(keep='first')
    deduplicated = merged[~is_repeat].copy()

    save_df(deduplicated, filename)

In [9]:
# Variables requested without scaling (get_result passes this list to ldb.get).
# Uncomment an entry to include it in the download.
parameters_raw = [
    #SourceFeatures.BIASDISCAQNV,
    #SourceFeatures.GASAQN,
    #SourceFeatures.OVEN1AQNP,
    #SourceFeatures.OVEN2AQNP,
    #SourceFeatures.SOLINJ_CURRENT,
    #SourceFeatures.SOLCEN_CURRENT,
    #SourceFeatures.SOLEXT_CURRENT,
    #SourceFeatures.SOURCEHTAQNI,
    #SourceFeatures.BCT25_CURRENT,
    #SourceFeatures.BCT41_CURRENT,
    #SourceFeatures.SOURCEHTAQNV,
    #SourceFeatures.BCT05_CURRENT,
    SourceFeatures.SPARK_COUNTER,
]

# Variables requested through ldb.getScaled. Each entry maps a variable name to
# its 'scale' (algorithm), 'interval' and 'size' settings.
parameters_scaled = {
    #SourceFeatures.THOMSON_FORWARDPOWER : {'scale' : 'AVG', 'interval' : 'SECOND', 'size' : '10'},
    #SourceFeatures.BCT05_CURRENT : {'scale' : 'AVG', 'interval' : 'MINUTE', 'size' : '2'}
}

In [10]:
import pandas as pd
from datetime import datetime
import pytz

def load_data(filename, year, month, replace_file, replace_column):
    """Download one calendar month of data and merge it into ``filename``.

    The requested interval starts on the first day of ``month`` at 00:00 UTC
    and ends on the first day of the following month at 00:00 UTC (January of
    ``year + 1`` when ``month`` is 12). Both bounds are converted to the local
    timezone of the machine before they are handed to ``get_data``, together
    with the module-level ``parameters_raw`` and ``parameters_scaled``.

    Parameters
    ----------
    filename : str
        CSV file to create or update.
    year, month : int
        Calendar month to download; ``month`` is 1-based.
    replace_file, replace_column : bool
        Forwarded unchanged to ``get_data``.

    Raises
    ------
    ValueError
        If ``year`` / ``month`` do not form a valid date.
    """
    # Imported locally so the function does not depend on names from other cells.
    from datetime import datetime, timezone

    # The end of the interval lives in its own variables: `month` and `year`
    # stay untouched so the closing message names the month that was requested,
    # also for December.
    if month == 12:
        next_year, next_month = year + 1, 1
    else:
        next_year, next_month = year, month + 1

    # Define the bounds in UTC, then express them in the local timezone.
    t1 = datetime(year, month, 1, tzinfo=timezone.utc).astimezone(tz=None)
    t2 = datetime(next_year, next_month, 1, tzinfo=timezone.utc).astimezone(tz=None)

    get_data(filename, t1, t2, parameters_raw, parameters_scaled, replace_file, replace_column)
    print("Finished download of data {}/{}\n".format(month, year))

In [11]:
# Where the monthly CSV files are written (one file per month: <Mon><year>.csv).
output_folder = '../Data_Raw/'

# Range of months to download, both ends inclusive.
year = 2018
start_month = 'Jan'
end_month = 'Nov'

# replace_file: start every file from scratch instead of extending it.
# replace_column: overwrite columns that an existing file already contains.
replace_file = False
replace_column = True

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
first_index = months.index(start_month)
last_index = months.index(end_month)

# The position in `months` plus one is the calendar month number load_data expects.
for month_number, m in enumerate(months[first_index:last_index + 1], start=first_index + 1):
    filename = '{}{}{}.csv'.format(output_folder, m, year)
    load_data(filename, year, month_number, replace_file, replace_column)

Loading Data in interval 2018-01-01 01:00:00+01:00 to 2018-02-01 01:00:00+01:00
Loading data from ../Data_Raw/Jan2018.csv.
We will replace columns that already exist
Joining together result
For column IP.NSRCGEN:SPARKS 742 datapoints exist.
Saving result to ../Data_Raw/Jan2018.csv
Finished download of data 1/2018

Loading Data in interval 2018-02-01 01:00:00+01:00 to 2018-03-01 01:00:00+01:00
Loading data from ../Data_Raw/Feb2018.csv.
We will replace columns that already exist
Joining together result
For column IP.NSRCGEN:SPARKS 618 datapoints exist.
Saving result to ../Data_Raw/Feb2018.csv
Finished download of data 2/2018

Loading Data in interval 2018-03-01 01:00:00+01:00 to 2018-04-01 02:00:00+02:00
Loading data from ../Data_Raw/Mar2018.csv.
We will replace columns that already exist
Joining together result
For column IP.NSRCGEN:SPARKS 1474 datapoints exist.
Saving result to ../Data_Raw/Mar2018.csv
Finished download of data 3/2018

Loading Data in interval 2018-04-01 02:00:00+02:00 