In [1]:
import math
import os
import re
from typing import List

import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import signal
from tqdm.notebook import tqdm


from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
rc('text', usetex=True)

In [2]:
df_hourly = pd.read_csv("../m4_data/Hourly-train.csv")
print("hourly read: {}".format(df_hourly.shape))
df_daily = pd.read_csv("../m4_data/Daily-train.csv")
print("daily read: {}".format(df_daily.shape))
df_weekly = pd.read_csv("../m4_data/Weekly-train.csv")
print("weekly: {}".format(df_weekly.shape))
df_monthly = pd.read_csv("../m4_data/Monthly-train.csv")
print("monthly: {}".format(df_monthly.shape))
df_quarterly = pd.read_csv("../m4_data/Quarterly-train.csv")
print("quarterly: {}".format(df_quarterly.shape))
df_yearly = pd.read_csv("../m4_data/Yearly-train.csv")
print("yearly: {}".format(df_yearly.shape))

df_m4_raw = pd.concat([df_hourly,
                   df_daily,
                   df_weekly,
                    df_monthly,
                   df_quarterly,
                   df_yearly])

df_hourly = None
df_daily = None
df_weekly = None
df_monthly = None
df_quarterly = None
df_yearly = None

hourly read: (414, 961)
daily read: (4227, 9920)
weekly: (359, 2598)
monthly: (48000, 2795)
quarterly: (24000, 867)
yearly: (23000, 836)


In [3]:
ts_train_infos = []
ts_test_infos = []
for root, dirs, files in os.walk("../data/ucr_data/UCRArchive_2018/"):
    for name in files:
        if(name.endswith("_TRAIN.tsv")):
            path_tmp = os.path.join(root,name)
            ts_name = re.split("/", root)[-1]
            ts_train_infos.append((ts_name, os.path.join(root,name)))
        elif(name.endswith("_TEST.tsv")):
            path_tmp = os.path.join(root,name)
            ts_name = re.split("/", root)[-1]
            ts_test_infos.append((ts_name, os.path.join(root,name)))

In [4]:
df_ucr_train = pd.DataFrame()
#df_ucr_test = pd.DataFrame()

for ts_info in tqdm(ts_train_infos):
    ts_name = ts_info[0]
    fp = ts_info[1]
    
    df_tmp = pd.read_csv(fp, sep='\t', header=None)
    df_tmp['name'] = ts_name
    df_tmp['no'] = df_tmp.index
    cols = df_tmp.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    df_tmp = df_tmp[cols]
    df_ucr_train = df_ucr_train.append(df_tmp)
    
# for ts_info in tqdm(ts_test_infos):
#     ts_name = ts_info[0]
#     fp = ts_info[1]
    
#     df_tmp = pd.read_csv(fp, sep='\t', header=None)
#     df_tmp['name'] = ts_name
#     df_tmp['no'] = df_tmp.index
#     cols = df_tmp.columns.tolist()
#     cols = cols[-2:] + cols[:-2]
#     df_tmp = df_tmp[cols]
#     df_ucr_test = df_ucr_test.append(df_tmp)
    

print("df_train shape: {}".format(df_ucr_train.shape))
# print("df_test shape: {}".format(df_ucr_test.shape))

  0%|          | 0/143 [00:00<?, ?it/s]

df_train shape: (64306, 2847)


In [5]:
df_ucr_train.head()

Unnamed: 0,name,no,0,1,2,3,4,5,6,7,...,2835,2836,2837,2838,2839,2840,2841,2842,2843,2844
0,Haptics,0,5,-1.047477,0.548336,-0.259562,-1.581216,-1.915218,-1.278188,-0.691525,...,,,,,,,,,,
1,Haptics,1,2,-1.018153,0.514133,-0.09737,-1.374493,-1.957364,-1.567188,-0.906353,...,,,,,,,,,,
2,Haptics,2,4,-1.183382,0.339636,0.045998,-1.080209,-1.934357,-1.972025,-1.412721,...,,,,,,,,,,
3,Haptics,3,3,-0.896969,0.186145,0.57829,0.392361,-0.15724,-0.818047,-1.374046,...,,,,,,,,,,
4,Haptics,4,5,-1.009424,0.614673,-0.397552,-1.766144,-1.906942,-1.145026,-0.7172,...,,,,,,,,,,


In [6]:
df_m4_raw.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V9911,V9912,V9913,V9914,V9915,V9916,V9917,V9918,V9919,V9920
0,H1,605.0,586.0,586.0,559.0,511.0,443.0,422.0,395.0,382.0,...,,,,,,,,,,
1,H2,3124.0,2990.0,2862.0,2809.0,2544.0,2201.0,1996.0,1861.0,1735.0,...,,,,,,,,,,
2,H3,1828.0,1806.0,1897.0,1750.0,1679.0,1620.0,1463.0,1342.0,1192.0,...,,,,,,,,,,
3,H4,6454.0,6324.0,6075.0,5949.0,5858.0,5579.0,5163.0,4790.0,4478.0,...,,,,,,,,,,
4,H5,4263.0,4297.0,4236.0,4080.0,3883.0,3672.0,3248.0,2841.0,2513.0,...,,,,,,,,,,


In [7]:
ar_m4 = df_m4_raw.iloc[:,3:].to_numpy()
ar_m4 = ar_m4.ravel()
print(ar_m4.shape)
ar_m4 = ar_m4[~np.isnan(ar_m4)]
print(ar_m4.shape)

(991700000,)
(23802047,)


In [8]:
ar_ucr = df_ucr_train.iloc[:,3:].to_numpy()
ar_ucr = ar_ucr.ravel()
print(ar_ucr.shape)
ar_ucr = ar_ucr[~np.isnan(ar_ucr)]
print(ar_ucr.shape)

(182886264,)
(25808184,)


In [55]:
ar_m4.mean()

4841.503654280829

In [9]:
ar_ucr.mean()

7.960112750455711

In [14]:
print(f"m4 val mean: {ar_m4.mean()}")
print(f"m4 val median: {np.median(ar_m4)}")
print(f"m4 val std: {ar_m4.std()}")
print(f"m4 val min: {ar_m4.min()}")
print(f"m4 val max: {ar_m4.max()}")
print("################")




print(f"ucr val mean: {ar_ucr.mean()}")
print(f"ucr val median: {np.median(ar_ucr)}")
print(f"ucr val std: {ar_ucr.std()}")
print(f"ucr val min: {ar_ucr.min()}")
print(f"ucr val std: {ar_ucr.max()}")

m4 val mean: 4841.503654280829
m4 val median: 3689.0
m4 val std: 5724.96078933796
m4 val min: 10.0
m4 val max: 703008.0
################
ucr val mean: 7.960112750455711
ucr val median: 0.0013769963
ucr val std: 99.66594793318704
ucr val min: -1110.8
ucr val std: 24929.0


In [15]:
ar_m4.shape

(23802047,)

In [16]:
ar_ucr.shape

(25808184,)