In [1]:
import pandas as pd # Load, save and manipulate databases
import matplotlib.pyplot as plt
import numpy as np

from scipy.stats import linregress
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from importlib import reload

import src.globals as g

from src.anomalies import NoiseMachine
from src.spectrum import SpectrumDecomposition

In [2]:
df = pd.read_csv(g.path_tracks_raw)

PrePreProcess

In [3]:
df = df.drop(labels="Unnamed: 0", axis=1)
manipulators = df['manipulator'].apply(lambda x : "1" if "1" in x else ("2" if "2" in x else "?"))
df.insert(0, 'robotid', df['id'] + "-" + manipulators)
df.insert(0, 'seqid', df['robotid'] + "|" + df['date'] + "|" + df['time'])
df = df.drop(labels=["id", "manipulator"], axis=1)

Small fixes, sort it based on seqid&timeindex. Remove known faulty sequences

In [4]:
df = df.sort_values(by=['seqid', 'timeindex'])
df = df[df['seqid'] != '6640-103454-1|2021-06-03|06:16:54']

Processing the input database such that all series start at a timeindex of 0.

In [5]:
df_tmp = df.groupby('seqid').agg(timeindex_min=('timeindex', 'min')).reset_index()
df_tmp = df.merge(df_tmp, on='seqid')
df_tmp['timeindex'] = df_tmp['timeindex'] - df_tmp['timeindex_min']
df_tmp = df_tmp.drop(labels=["timeindex_min"], axis=1)
df = df_tmp.reset_index(drop=True)

Normalizing Direction

Processing the tracks input database such that it always have a negative slope, i.e. from positive mean torqueactual to negative.

In [6]:
# Mark every series that has this problem.
multdf = df.groupby(['seqid']).apply(lambda v: -1 if linregress(v['timeindex'], v['torqueactual'])[0] > 0 else 1).reset_index(name="multiplier")
multdf2 = df.groupby(['seqid']).apply(lambda v: max(v['motorposition']) + min(v['motorposition']) if linregress(v['timeindex'], v['torqueactual'])[0] > 0 else 0).reset_index(name="posfix")

# Merge this database with the original one.
df = df.merge(multdf, on=['seqid'])
df = df.merge(multdf2, on=['seqid'])


In [7]:
# Now multiply it, such that we always start positive and end negative.
df['torqueactual'] = df['torqueactual'] * df['multiplier']
df['speedsetpoint'] = df['speedsetpoint'] * df['multiplier']
df['motorposition'] = abs(df['posfix'] - df['motorposition'])


#drop multiplier column, since it has done its job
df = df.drop(labels=["multiplier", "posfix"], axis=1)

Adding timebins & Interpolate on the timebins

In [8]:
# Using bins instead of more timeindices 
df = df.sort_values(by=['seqid', 'timeindex'])
timeindex_dt = np.median(df['timeindex'].diff().fillna(0))
df['timeindex_bin'] = np.round(df['timeindex'] / timeindex_dt).astype(int)


In [9]:
# Before interpolation, we need to remove motorposition outliers
df['motorposition_diff'] = df['motorposition'].diff()
df.loc[df['motorposition_diff'].abs() > 100, 'motorposition'] = np.nan
df = df.drop(labels=["motorposition_diff"], axis=1)

#Interpolation
def fill_and_interpolate(group):
    columns_intrp = ['timeindex', 'motorposition', 'speedsetpoint', 'torquefeedforward', 'torqueactual']
    columns_fill = ['seqid', 'robotid', 'date', 'time']

    # Add rows that do not have values
    full_range = pd.DataFrame({'timeindex_bin': range(0, group['timeindex_bin'].max() + 1)})
    group = pd.merge(full_range, group, how='left', on=['timeindex_bin'])
    group.sort_values('timeindex_bin')

    # Fill & Interpolate
    group[columns_intrp] = group[columns_intrp].interpolate(method='linear')
    group[columns_fill] = group[columns_fill].ffill().bfill()

    return group

df = df.groupby('seqid').apply(fill_and_interpolate).reset_index(drop=True)
    

In [10]:
# Remove the sequences that have timeindices that are unrealistic.
df_tmp = df.groupby('seqid').agg(robotid=('robotid', 'first'), date=('date', 'first'), time=('time', 'first'), timeindex=('timeindex', 'max')).reset_index()
df_tmp2 = df_tmp.groupby('robotid').agg(timeindex_mean=('timeindex', 'mean'), timeindex_std=('timeindex', 'std')).reset_index()
df_tmp = df_tmp.merge(df_tmp2, on='robotid')
df_tmp['zscore'] = (df_tmp['timeindex'] - df_tmp['timeindex_mean']) / df_tmp['timeindex_std']
df_tmp = df_tmp[(abs(df_tmp['zscore']) >= 1.5) & (df_tmp['timeindex_std'] >= 1)].reset_index()

#Now remove these sequences.
df = df[~df['seqid'].isin(df_tmp['seqid'])].reset_index(drop=True)

In [11]:
# Saving Processed
df.to_parquet(g.path_tracks)

# Upsample

In [12]:
df = pd.read_parquet(g.path_tracks)

In [12]:
def mirror(group):
    mindex = group.index.min()
    midpoint = (len(df) // 2) + mindex

    # Mirror the column around the midpoint
    group.loc[midpoint:, 'torqueactual'] = group.loc[midpoint:, 'torqueactual'].iloc[::-1].values
    group['torqueactual'] = group['torqueactual'] * -1

    return group

df_mirror           = df.copy().groupby('seqid').apply(mirror).reset_index(drop=True)
df_mirror['seqid']  = df_mirror['seqid'] + '|mirrored'
df['seqid']         = df['seqid'] + "|normal"

df_up = pd.concat([df, df_mirror]).reset_index(drop=True)

In [13]:
# Saving up         
df_up.to_parquet(g.path_tracks_up)

# Create Synthethic Database

In [2]:
df_up = pd.read_parquet(g.path_tracks_up)

In [3]:
chunk_size = 100000  # Adjust this size based on your memory limits
seqid_groups = df_up.groupby('seqid')

results = []
for _, group in seqid_groups:
    anomalies = NoiseMachine.generate_anomalies(group)
    results.append(anomalies)
    if len(results) % chunk_size == 0:
        df_syn = pd.concat(results, ignore_index=True)
        # Optionally save or process each chunk if you can't keep all in memory
        results = []  # Clear processed results to free memory

# Concatenate any remaining results
if results:
    df_syn = pd.concat(results, ignore_index=True)



In [6]:
df_syn['seqid'].unique()

array(['6640-101753-1|2021-03-01|10:37:33|mirrored|anom_gaussian',
       '6640-101753-1|2021-03-01|10:37:33|mirrored|anom_point',
       '6640-101753-1|2021-03-01|10:37:33|mirrored|anom_sinus', ...,
       '7600-100498-1|2022-03-02|04:58:34|normal|anom_gaussian',
       '7600-100498-1|2022-03-02|04:58:34|normal|anom_point',
       '7600-100498-1|2022-03-02|04:58:34|normal|anom_sinus'],
      dtype=object)

In [7]:
len(df_syn['seqid'].unique())

16254

In [9]:
chunk_size = 10000  # Adjust based on your system memory
for i in range(0, len(df_syn), chunk_size):
    df_chunk = df_syn.iloc[i:i + chunk_size]
    df_chunk.to_parquet(g.path_tracks_syn, engine='pyarrow', append=True)

TypeError: __cinit__() got an unexpected keyword argument 'append'

# Applying Spectrum Decomposition

In [None]:
sd = SpectrumDecomposition(col_name="torqueactual", n_freq=25)
df_decomposed = sd.transform(df)

# Saving the databases

In [None]:
df_results = df[g.tracks_result_columns]