# Title Thesis defense

In [42]:
import locale
import os
import re
import time
from typing import List, Tuple

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal
import seaborn as sns

locale.setlocale(locale.LC_ALL, '');

## Loading M4 Data

In [25]:
def read_m4_data(file_paths: List[str], path_prefix: str) -> pd.DataFrame:
    """ reads the  list given file paths and
        and combines them into a singular dataframe"""
    df_all = pd.DataFrame()
    
    for fpath in file_paths:
        start = time.time()
        df_tmp = pd.read_csv(path_prefix+fpath)
        end = time.time()
        execution_time = end - start
        print(f"file: {fpath} read in {execution_time:.2f} seconds\n\
        with {df_tmp.shape[0]:,d} rows and {df_tmp.shape[1]:,d} columns.\n")
        df_all = pd.concat([df_all, df_tmp])
        
    return df_all

In [26]:
# setting file paths
path_prefix_m4 = "/Users/philipp/workspace/UNIC/comp-593/m4_data/"
m4_paths = [
    "Hourly-train.csv",
#     "Daily-train.csv",
    "Weekly-train.csv",
    # "Monthly-train.csv",
    # "Quarterly-train.csv",
    # "Yearly-train.csv"
]

In [27]:
%%time
df_all_m4 = read_m4_data(m4_paths, path_prefix_m4)
print("###################\n")

file: Hourly-train.csv read in 0.11 seconds
        with 414 rows and 961 columns.

file: Weekly-train.csv read in 0.30 seconds
        with 359 rows and 2,598 columns.

###################

CPU times: user 356 ms, sys: 47.9 ms, total: 404 ms
Wall time: 425 ms


## Load UCR archive

In [5]:
def get_ucr_file_paths(path_prefix: str) -> Tuple[List[str],List[str]]:
  """creates a list file paths based on naming
  conventions of UCR archive"""
  ts_train_infos = []
  ts_test_infos = []
  for root, dirs, files in os.walk(path_prefix):
      for name in files:
          if(name.endswith("_TRAIN.tsv")):
              path_tmp = os.path.join(root,name)
              ts_name = re.split("/", root)[-1]
              ts_train_infos.append((ts_name, os.path.join(root,name)))
          elif(name.endswith("_TEST.tsv")):
              path_tmp = os.path.join(root,name)
              ts_name = re.split("/", root)[-1]
              ts_test_infos.append((ts_name, os.path.join(root,name)))
  return ts_train_infos, ts_test_infos

In [6]:
path_prefix_ucr = "/Users/philipp/workspace/UNIC/comp-593/data/ucr_data/UCRArchive_2018"
ts_train_paths, ts_test_paths = get_ucr_file_paths(path_prefix_ucr)

In [8]:
def load_ucr_files(train_paths: List[str], test_paths: List[str]) -> Tuple[pd.DataFrame,pd.DataFrame]:
    """load UCR archive files"""
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()

    for ts_info in tqdm(train_paths):
        ts_name = ts_info[0]
        fp = ts_info[1]

        df_tmp = pd.read_csv(fp, sep='\t', header=None)
        df_tmp['name'] = ts_name
        df_tmp['no'] = df_tmp.index
        cols = df_tmp.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        df_tmp = df_tmp[cols]
        df_train = df_train.append(df_tmp)

    for ts_info in tqdm(test_paths):
        ts_name = ts_info[0]
        fp = ts_info[1]

        df_tmp = pd.read_csv(fp, sep='\t', header=None)
        df_tmp['name'] = ts_name
        df_tmp['no'] = df_tmp.index
        cols = df_tmp.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        df_tmp = df_tmp[cols]
        df_test = df_test.append(df_tmp)
        
    return df_train, df_test
    

In [9]:
df_train, df_test = load_ucr_files(ts_train_paths, ts_test_paths)

  0%|          | 0/143 [00:00<?, ?it/s]

  0%|          | 0/143 [00:00<?, ?it/s]

## Visualize Time Series

In [94]:
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})

## Load FFT conversion results with statistics

In [18]:
# load m4 stats
df_m4_stats = pd.read_csv("../data/df_stats.csv")

# load UCR stats
#df_ucr_stats_train = pd.read_csv("../data/df_ucr_stats_train.csv")
df_ucr_stats_test = pd.read_csv("../data/df_ucr_stats_test.csv")

### Sample Output M4 statsfile
please note identified frequency bands and computed statistics


In [15]:
df_m4_stats[df_m4_stats['ts_name']=='M1']

Unnamed: 0,ts_name,freq_ids,type,m,b,count,mean,std,min,q25,q50,q75,max
104641,M1,"[35, 0, 12, 25, 0]",Welch,0.763289,6125.215235,469.0,6306.247335,1790.611195,2690.0,5000.0,6040.0,7360.0,13490.0
144641,M1,"[409, 198, 136, 409, 0]",fft,0.763289,6125.215235,469.0,6306.247335,1790.611195,2690.0,5000.0,6040.0,7360.0,13490.0
244641,M1,"[409, 198, 409, 136, 0]",Hamming,0.763289,6125.215235,469.0,6306.247335,1790.611195,2690.0,5000.0,6040.0,7360.0,13490.0


### Sample Ouput UCR statsfile

In [19]:
df_ucr_stats_test.head()

Unnamed: 0,ts_name,freq_ids,type,no,class,m,b,count,mean,std,min,q25,q50,q75,max
0,ACSF1,"[340, 388, 370, 340, 388]",Hamming,0.0,9.0,1460,-2.927752e-06,0.002134,-1.541096e-09,1.0,-0.577967,-0.577967,-0.577967,0.012759,1.742434
1,ACSF1,"[340, 388, 370, 340, 388]",Hamming,1.0,9.0,1460,3.756977e-06,-0.002682,5.684931e-10,1.0,-0.59824,-0.588575,-0.588332,0.02748,1.756899
2,ACSF1,"[388, 340, 370, 388, 340]",Hamming,2.0,9.0,1460,-2.508553e-06,0.001803,-2.739726e-10,1.0,-0.58696,-0.582897,-0.582691,0.013297,1.7577
3,ACSF1,"[340, 388, 370, 388, 340]",Hamming,3.0,9.0,1460,-9.97009e-06,0.007221,9.041096e-10,1.0,-0.591978,-0.590736,-0.583757,0.032882,1.746551
4,ACSF1,"[340, 388, 370, 388, 340]",Hamming,4.0,9.0,1460,3.325016e-07,-0.000261,4.315069e-10,1.0,-0.577828,-0.577828,-0.577828,0.008326,1.743008


## Transform Frequencies M4

In [24]:
def get_top_k_freq(PSD: np.array, k: int)->List[int]:
    """ return top k indexes with largest PSD val"""
    PSD = [np.real(val) for val in PSD]
    return sorted(range(len(PSD)), key= lambda x: PSD[x])[-k:]

In [20]:
def get_freq_m4(s: pd.Series, k:int=5) -> List[float]:
    """ compute frequencies for M4 pandas series"""
    df = pd.DataFrame()
    f=np.array(s.iloc[1:].dropna())
    n = f.size
    wdw = np.hamming(n)
    freq = np.arange(n)/n

    # FFT
    fhat = np.fft.fft(f)
    PSD = np.real(fhat * np.conj(fhat) / n)
    top_fft_idx = get_top_k_freq(PSD,k)
    fft_freq = freq[top_fft_idx]
    df_tmp = pd.DataFrame(fft_freq, columns=['val'])
    df_tmp['type']='FFT'
    df = df.append(df_tmp)
    # Hamming
    fhat = np.fft.fft(f*wdw)
    PSD = np.real(fhat * np.conj(fhat) / n)
    freq = np.arange(n)/f.size
    top_ham_idx = get_top_k_freq(PSD,k)
    ham_freq = freq[top_ham_idx]
    df_tmp = pd.DataFrame(ham_freq, columns=['val'])
    df_tmp['type']='Hamming'
    df = df.append(df_tmp)
    # Welch
    seg_length = np.floor(1/20*n)
    if seg_length == 0:
        seg_length=10
    welch_freqs, PSD_welch = signal.welch(f, nperseg=seg_length,
                                      window='hamming')
    top_ham_idx = get_top_k_freq(PSD_welch,k)
    welch_freq = freq[top_ham_idx]
    df_tmp = pd.DataFrame(welch_freq, columns=['val'])
    df_tmp['type']='Welch'
    df = df.append(df_tmp)
    
    return df

## Transform to frequencies UCR

In [21]:
def get_freq_ucr(s: pd.Series, k:int=5) -> List[float]:
    """ compute frequencies for M4 pandas series"""
    df = pd.DataFrame()
    f=np.array(s.iloc[3:].dropna())
    n = f.size
    wdw = np.hamming(n)
    freq = np.arange(n)/n

    # FFT
    fhat = np.fft.fft(f)
    PSD = np.real(fhat * np.conj(fhat) / n)
    top_fft_idx = get_top_k_freq(PSD,k)
    fft_freq = freq[top_fft_idx]
    df_tmp = pd.DataFrame(fft_freq, columns=['val'])
    df_tmp['type']='FFT'
    df = df.append(df_tmp)
    # Hamming
    fhat = np.fft.fft(f*wdw)
    PSD = np.real(fhat * np.conj(fhat) / n)
    freq = np.arange(n)/f.size
    top_ham_idx = get_top_k_freq(PSD,k)
    ham_freq = freq[top_ham_idx]
    df_tmp = pd.DataFrame(ham_freq, columns=['val'])
    df_tmp['type']='Hamming'
    df = df.append(df_tmp)
    # Welch
    seg_length = np.floor(1/20*n)
    if seg_length == 0:
        seg_length=10
    welch_freqs, PSD_welch = signal.welch(f, nperseg=seg_length,
                                      window='hamming')
    top_ham_idx = get_top_k_freq(PSD_welch,k)
    welch_freq = freq[top_ham_idx]
    df_tmp = pd.DataFrame(welch_freq, columns=['val'])
    df_tmp['type']='Welch'
    df = df.append(df_tmp)
    return df

### Test on Single Series

In [28]:
df_all_m4.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V2589,V2590,V2591,V2592,V2593,V2594,V2595,V2596,V2597,V2598
0,H1,605.0,586.0,586.0,559.0,511.0,443.0,422.0,395.0,382.0,...,,,,,,,,,,
1,H2,3124.0,2990.0,2862.0,2809.0,2544.0,2201.0,1996.0,1861.0,1735.0,...,,,,,,,,,,
2,H3,1828.0,1806.0,1897.0,1750.0,1679.0,1620.0,1463.0,1342.0,1192.0,...,,,,,,,,,,
3,H4,6454.0,6324.0,6075.0,5949.0,5858.0,5579.0,5163.0,4790.0,4478.0,...,,,,,,,,,,
4,H5,4263.0,4297.0,4236.0,4080.0,3883.0,3672.0,3248.0,2841.0,2513.0,...,,,,,,,,,,


### set frequency ranges
build a set of frequency ranges $ [10^{-4}, ..., 10^1] $ with a 0.01 step-size

In [205]:
start = -4
stop = 1
steps = 501
freq_ranges = 10**np.linspace(start,stop,steps)

### Convert frequencies to frequency ranges and add statistics

In [219]:
def top_freqs_2_idx(s: pd.DataFrame) -> pd.Series:
    """convert the list of frequencies into order"""
    ar_top_f = np.asarray(s.transpose().iloc[0,:].tolist())
    ar_f_idx = np.digitize(ar_top_f, freq_ranges)
    #print(f"type: {str(s.iloc[0,1])}\nfreqs: {top_f}\n\n")
    df_res = pd.DataFrame.from_dict({"fft_type": [str(s.iloc[0,1])],
                 "top_freq": [ar_f_idx]})
    return df_res

In [None]:
def get_trend(ar:np.ndarray, period:int=12)->Tuple[float,float]
    """ time series decomposition """
    ### cont here

In [228]:
def get_statistics(s:pd.Series)->pd.DataFrame:
    """compute simple statistics on time series"""
    ar = s[1:].to_numpy()
    print(ar)
    count = ar.shape[0]
    mean = np.mean(ar)
    std = np.std(ar)
    min_val = np.min(ar)
    q25 = np.quantile(ar, .25)
    q50 = np.median(ar)
    q75 = np.quantile(ar, .75)
    max_val = np.max(ar)
    
    period = 12
    trend_ar = get_trend(ar, period)
    m, b = fit_trend(trend_ar)
    
    idx = ['ts_name', 'no', 'class', 'm', 'b', 'count', 'mean', 'std', 'min', 'q25', 'q50', 'q75', 'max']
    res = pd.Series([ts_name, no, type_cls, count, m, b, mean, std, min_val, q25, q50, q75, max_val], index=idx)
    return res

In [229]:
df_test = df_all_m4.sample(1).squeeze()
df_test_freq = get_freq_m4(df_test)
print(get_statistics(df_test))
df_test_freq = df_test_freq.groupby(['type']).apply(top_freqs_2_idx)
df_test_freq.head()

[202.0 155.0 103.0 ... nan nan nan]


NameError: name 'get_trend' is not defined