In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt

file_path = '/home/hwkang/SeqSNN/FORECAST_META.csv'
df = pd.read_csv(file_path)

In [3]:
pd.set_option('display.width', 1000)

# 최대 출력 행 수 설정 (None이면 모든 행 출력)
pd.set_option('display.max_rows', None)

# 최대 출력 열 수 설정 (None이면 모든 열 출력)
pd.set_option('display.max_columns', None)

# 열 내용의 최대 너비 설정
pd.set_option('display.max_colwidth', None)

In [4]:
def get_filtered_df(df, column='seasonal', gteq=None, lteq=None, file_names: list=None, freq: str=None):
    # if_univariate가 FALSE인 것들로 1차 필터링
    df = df[df['if_univariate'] == False]
    if column == 'seasonal':
        return df[df['seasonal'] == True]
    elif column == 'trend':
        return df[df['trend'] == True]
    elif column == 'stationary':
        if gteq is None or lteq is None:
            raise ValueError("gteq and lteq must be provided when column is 'stationary'")
        # gteq보다 크거나 같은 값을 갖는 행만 필터링
        if gteq is not None:
            filtered_df = df[(df['stationary'] >= gteq)]
        else:
            filtered_df = df
        # lteq보다 작은 값을 갖는 행만 필터링
        if lteq is not None:
            filtered_df = filtered_df[filtered_df['stationary'] <= lteq]
        return filtered_df
    elif column == 'transition':
        if gteq is None and lteq is None:
            raise ValueError("gteq and lteq must be provided when column is 'transition'")
        # gteq보다 크거나 같은 값을 갖는 행만 필터링
        if gteq is not None:
            filtered_df = df[(df['transition'] >= gteq)]
        else:
            filtered_df = df
        # lteq보다 작은 값을 갖는 행만 필터링
        if lteq is not None:
            filtered_df = filtered_df[filtered_df['transition'] <= lteq]
        return filtered_df
    elif column == 'shifting':
        if gteq is None and lteq is None:
            raise ValueError("gteq and lteq must be provided when column is 'shifting'")
        # gteq보다 크거나 같은 값을 갖는 행만 필터링
        if gteq is not None:
            filtered_df = df[(df['shifting'] >= gteq)]
        else:
            filtered_df = df
        if lteq is not None:
            filtered_df = filtered_df[filtered_df['shifting'] <= lteq]
        return filtered_df
    elif column == 'correlation':
        if gteq is None and lteq is None:
            raise ValueError("gteq and lteq must be provided when column is 'correlation'")
        # gteq보다 크거나 같은 값을 갖는 행만 필터링
        if gteq is not None:
            filtered_df = df[(df['correlation'] >= gteq)]
        else:
            filtered_df = df
        # lteq보다 작은 값을 갖는 행만 필터링
        if lteq is not None:
            filtered_df = filtered_df[filtered_df['correlation'] <= lteq]
        return filtered_df
    elif column == 'file_name':
        if file_names is None:
            raise ValueError("file_names must be provided when column is 'file_name'")
        return df[df['file_name'].isin(file_names)]
    elif column == 'freq':
        if freq is None:
            raise ValueError("freq must be provided when column is 'freq'")
        return df[df['freq'] == freq]

In [5]:
seasonal_df = get_filtered_df(df, column='seasonal')
print(f'{len(seasonal_df)}/{len(df)} seasonal data found')
seasonal_ts_names = seasonal_df['file_name'].tolist()

2/8093 seasonal data found


In [6]:
trend_df = get_filtered_df(df, column='trend')
print(f'{len(trend_df)}/{len(df)} trend data found')
trend_ts_names = trend_df['file_name'].tolist()

8/8093 trend data found


In [7]:
correlation_df = get_filtered_df(df, column='correlation', gteq=0.7, lteq=1.0)
print(f'{len(correlation_df)}/{len(df)} correlation data found')
correlation_ts_names = correlation_df['file_name'].tolist()
print(correlation_ts_names)

9/8093 correlation data found
['Electricity.csv', 'METR-LA.csv', 'NN5.csv', 'PEMS04.csv', 'PEMS08.csv', 'PEMS-BAY.csv', 'Solar.csv', 'Traffic.csv', 'Wike2000.csv']


In [8]:
freqs = ['hourly', 'daily']
for freq in freqs:
    freq_df = get_filtered_df(df, column='freq', freq=freq)
    print(f'{len(freq_df)}/{len(df)} {freq} data found')
    freq_ts_names = freq_df['file_name'].tolist()
    print(freq_ts_names)

6/8093 hourly data found
['Electricity.csv', 'ETTh1.csv', 'ETTh2.csv', 'AQShunyi.csv', 'AQWan.csv', 'Traffic.csv']
6/8093 daily data found
['Covid-19.csv', 'Exchange.csv', 'NASDAQ.csv', 'NN5.csv', 'NYSE.csv', 'Wike2000.csv']


In [11]:
# 'if_univariate'가 False인 행들을 'length' 오름차순 정렬
length_sorted_df = df[df['if_univariate'] == False].sort_values(by='length')
print(f'{len(length_sorted_df)}/{len(df)} univariate data found')

# file_name과 length 컬럼만 선택
length_df = length_sorted_df[['file_name', 'length']]
print(length_df)

25/8093 univariate data found
          file_name  length
8       FRED-MD.csv     728
12          NN5.csv     791
22     Wike2000.csv     792
11          ILI.csv     966
13         NYSE.csv    1243
10       NASDAQ.csv    1244
0      Covid-19.csv    1392
7      Exchange.csv    7588
3         ETTh1.csv   14400
4         ETTh2.csv   14400
14       PEMS04.csv   16992
20      Traffic.csv   17544
15       PEMS08.csv   17856
24       ZafNoo.csv   19225
1        CzeLan.csv   19934
2   Electricity.csv   26304
9       METR-LA.csv   34272
17     AQShunyi.csv   35064
18        AQWan.csv   35064
23         Wind.csv   48673
16     PEMS-BAY.csv   52116
19        Solar.csv   52560
21      Weather.csv   52696
6         ETTm2.csv   57600
5         ETTm1.csv   57600


In [13]:
# 파일 이름이 'Weater.csv'와 'Traffic.csv'인 행을 선택해 출력
weather_traffic_df = df[df['file_name'].isin(['Weather.csv', 'Traffic.csv'])]
print(weather_traffic_df)

      file_name    freq  if_univariate   size  length  trend  seasonal stationary  transition     shifting  correlation
20  Traffic.csv  hourly          False  large   17544  False     False   3.71E-08  0.01087732  0.066992351     0.813524
21  Weather.csv    mins          False  large   52696  False     False   1.04E-08  0.03678061  0.213569048     0.694155


In [20]:
# pandas로 .h5 파일을 읽어서 구조 출력
metr_la_path = '/home/hwkang/SeqSNN/data/metr-la.h5'

try:
    # 기본키로 읽기 시도
    metr_la_df = pd.read_hdf(metr_la_path, key='df')
except:
    try:
        # 다른 일반적인 키들 시도
        metr_la_df = pd.read_hdf(metr_la_path, key='data')
    except:
        try:
            metr_la_df = pd.read_hdf(metr_la_path, key='table')
        except:
            # h5py로 키 목록 확인
            import h5py
            with h5py.File(metr_la_path, 'r') as f:
                print("Available keys:", list(f.keys()))
                # 첫 번째 키 사용
                if len(f.keys()) > 0:
                    first_key = list(f.keys())[0]
                    print(f"Using key: {first_key}")
                    metr_la_df = pd.read_hdf(metr_la_path, key=first_key)

#metr_la_df.describe()
metr_la_df.head()

Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,765604,767471,716339,773906,765273,716331,771667,716337,769953,769402,769403,769819,769405,716941,717578,716960,717804,767572,767573,773012,773013,764424,769388,716328,717819,769941,760987,718204,718045,769418,768066,772140,773927,760024,774012,774011,767609,769359,760650,716956,769831,761604,717495,716554,773953,767470,716955,764949,773954,767366,769444,773939,774067,769443,767750,767751,767610,773880,764766,717497,717490,717491,717492,717493,765176,717498,717499,765171,718064,718066,765164,769431,769430,717610,767053,767621,772596,772597,767350,767351,716571,773023,767585,773024,717483,718379,717481,717480,717486,764120,772151,718371,717489,717488,717818,718076,718072,767455,767454,761599,717099,773916,716968,769467,717576,717573,717572,717571,717570,764760,718089,769847,717608,767523,716942,718090,769867,717472,717473,759591,764781,765099,762329,716953,716951,767509,765182,769358,772513,716958,718496,769346,773904,718499,764853,761003,717502,759602,717504,763995,717508,765265,773996,773995,717469,717468,764106,717465,764794,717466,717461,717460,717463,717462,769345,716943,772669,717582,717583,717580,716949,717587,772178,717585,716939,768469,764101,767554,773975,773974,717510,717513,717825,767495,767494,717821,717823,717458,717459,769926,764858,717450,717452,717453,759772,717456,771673,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,55.5,66.5,64.25,68.5,60.375,67.5,37.75,63.125,59.75,62.125,67.25,41.25,54.625,58.125,65.125,64.25,61.25,62.75,66.875,47.375,57.0,67.75,65.125,66.875,64.0,62.0,64.375,60.75,60.75,62.375,64.5,66.0,62.0,64.875,55.875,67.75,63.0,60.125,60.25,62.25,42.25,48.25,65.125,63.5,55.75,60.5,51.571429,60.25,63.875,59.875,64.625,64.0,52.5,66.25,68.25,68.875,66.375,64.5,67.875,67.25,63.125,66.25,64.625,59.125,68.25,52.25,58.125,61.25,66.125,69.375,69.75,63.5,64.25,68.0,56.875,64.375,65.25,65.75,61.25,63.625,67.0,55.25,66.875,65.375,65.125,57.625,69.125,66.5,62.0,67.375,60.125,65.5,63.75,66.25,65.25,0.0,0.0,66.875,63.625,55.75,67.375,65.75,63.125,66.0,68.625,61.5,67.125,69.25,65.375,68.375,61.375,53.75,65.25,69.375,62.125,50.125,61.75,66.875,67.125,58.5,66.125,66.375,65.75,61.75,67.25,65.0,66.375,63.875,67.875,63.625,57.625,64.625,59.75,63.5,58.125,66.75,66.25,60.625,63.75,69.5,61.25,53.125,61.375,63.125,63.0,63.375,54.75,65.125,64.0,67.375,67.125,68.25,67.625,69.375,65.5,55.0,70.0,68.25,62.0,64.5,58.375,64.5,63.625,63.75,53.625,69.875,64.75,62.375,64.75,52.125,61.625,63.0,67.142857,67.625,63.875,63.125,63.375,64.125,66.875,69.875,67.75,62.0,66.75,57.625,52.625,69.0,43.5,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,58.777778,58.444444,66.444444,65.444444,56.111111,65.444444,39.0,63.777778,63.111111,66.111111,65.777778,56.777778,50.777778,56.222222,66.888889,54.444444,67.888889,61.555556,67.555556,46.777778,67.777778,67.888889,68.0,67.444444,66.444444,67.222222,63.0,64.333333,64.0,61.888889,66.777778,64.555556,64.444444,64.333333,66.444444,64.111111,58.888889,65.222222,60.333333,63.111111,52.333333,52.111111,68.0,56.0,60.888889,60.222222,44.444444,61.222222,62.333333,57.777778,62.111111,69.444444,34.666667,65.777778,67.111111,67.555556,57.888889,66.777778,65.555556,68.222222,63.333333,62.333333,67.777778,61.888889,67.777778,54.666667,46.111111,61.333333,68.111111,64.888889,66.777778,64.777778,66.0,64.0,59.777778,65.333333,66.888889,65.222222,61.777778,66.444444,65.888889,56.777778,69.444444,67.0,65.777778,57.444444,69.555556,69.111111,59.555556,67.333333,63.111111,62.777778,63.444444,64.222222,65.333333,0.0,0.0,67.333333,57.888889,60.555556,63.555556,68.111111,57.777778,67.777778,65.0,62.777778,68.444444,69.555556,62.555556,66.333333,58.111111,54.666667,67.666667,68.444444,62.0,55.222222,67.222222,67.111111,68.777778,63.0,61.333333,64.666667,55.555556,63.333333,62.333333,68.888889,67.222222,63.555556,68.222222,61.555556,69.0,67.444444,61.333333,62.888889,61.111111,64.222222,68.111111,64.555556,67.777778,67.75,58.444444,49.0,58.666667,58.555556,65.125,64.5,52.555556,68.888889,63.444444,65.222222,65.777778,69.222222,69.222222,69.333333,68.0,50.0,69.111111,66.777778,66.555556,67.333333,55.333333,66.555556,66.555556,65.777778,59.888889,66.888889,67.444444,62.222222,62.222222,52.333333,47.666667,67.111111,64.5,69.111111,63.888889,60.444444,65.777778,63.888889,69.333333,63.0,67.666667,67.333333,66.666667,65.777778,51.666667,69.111111,39.888889,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,61.375,62.375,64.75,65.25,61.875,67.875,35.25,63.875,35.625,65.0,69.125,49.125,55.625,59.125,67.875,63.125,63.0,64.5,59.0,47.75,61.571429,66.5,68.5,65.0,66.25,51.25,63.875,64.875,65.625,64.625,65.857143,65.875,65.25,64.125,65.75,67.125,61.25,59.25,58.142857,59.625,43.625,50.625,66.125,59.125,62.375,63.625,51.25,62.625,61.5,64.375,60.375,58.75,50.0,67.375,63.375,63.0,65.625,66.625,67.5,69.0,60.625,59.875,66.0,60.5,69.25,54.375,58.875,59.375,60.5,61.0,56.875,66.0,64.5,59.375,60.625,61.125,66.75,63.0,63.625,66.75,61.375,56.875,69.25,64.75,65.75,58.375,69.75,67.25,63.125,65.25,59.5,61.5,65.125,63.875,67.375,0.0,0.0,69.625,69.75,54.875,65.25,67.5,62.625,66.375,65.75,61.75,64.0,66.0,64.75,68.5,61.5,50.875,64.75,68.0,63.125,56.5,52.125,66.0,67.0,60.25,60.75,63.0,62.125,61.625,65.375,64.125,65.625,64.125,68.375,65.25,57.125,68.125,60.875,58.125,59.75,68.0,67.25,61.875,64.375,69.0,59.625,57.75,59.5,64.125,62.5,64.25,54.125,65.625,55.833333,65.5,66.375,64.875,67.0,69.5,67.625,54.875,70.0,64.0,67.375,65.25,58.125,64.625,67.375,66.75,56.0,68.75,65.625,61.0,63.5,56.125,61.375,64.125,66.0,68.625,67.25,64.0,66.875,65.0,65.5,54.875,68.75,67.375,65.75,56.25,49.375,67.25,38.25,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
import h5py
import numpy as np
import pandas as pd

def read_h5_safe(file_path):
    """안전하게 h5 파일을 읽는 함수"""
    with h5py.File(file_path, 'r') as f:
        print("Available datasets:", list(f.keys()))
        
        # 데이터셋 탐색
        for key in f.keys():
            dataset = f[key]
            print(f"\n{key}:")
            print(f"  Shape: {dataset.shape}")
            print(f"  Dtype: {dataset.dtype}")
            
            # 데이터 읽기
            data = dataset[...]
            
            # bytes 문자열을 일반 문자열로 변환
            if data.dtype.char == 'S':  # bytes string
                try:
                    data = np.array([item.decode('utf-8') if isinstance(item, bytes) else item for item in data.flatten()])
                    data = data.reshape(dataset.shape)
                except:
                    pass
            
            # DataFrame으로 변환
            if len(data.shape) == 2:
                df = pd.DataFrame(data)
                return df
            elif len(data.shape) == 1:
                df = pd.DataFrame(data, columns=[key])
                return df
    
    return None

# 사용
metr_la_df = read_h5_safe(metr_la_path)

Available datasets: ['df']

df:


AttributeError: 'Group' object has no attribute 'shape'