In [1]:
import os
import pandas as pd

def inspect_parquet(filename):
    path = os.path.join("../feature_datasets", filename)

    # Parquet 파일 읽기
    df = pd.read_parquet(path)

    # 열 이름 출력
    print(f"[{filename}] Columns:")
    for col in df.columns:
        print(col)

    # 첫 두 행 출력
    print("\nFirst 2 rows:")
    print(df.head(5))

    return df


# 사용 예시
#df = inspect_parquet("dataset_B_headlines_orig.parquet")

# Merge

In [1]:
import os
import pandas as pd
from pathlib import Path

# 현재 디렉토리 경로
current_dir = Path('.')

# results_로 시작하는 모든 폴더 찾기
results_folders = [f for f in current_dir.iterdir() if f.is_dir() and f.name.startswith('results_')]

print(f"Found {len(results_folders)} results_ folders:")
for folder in results_folders:
    print(f"  - {folder.name}")
print()

# 각 results_ 폴더를 순회하여 dataframe 로드
for folder in results_folders:
    # 폴더 내의 모든 CSV 파일 찾기
    csv_files = list(folder.glob('*_evaluation_metrics.csv'))
    
    for csv_file in csv_files:
        # model 이름 추출 (예: 'linear_evaluation_metrics.csv' -> 'linear')
        model_name = csv_file.stem.replace('_evaluation_metrics', '')
        
        # CSV 파일 읽기
        df = pd.read_csv(csv_file)
        
        # model column 추가 (어떤 model인지 구분하기 위해)
        df['model'] = model_name
        
        # dataframe을 {model}_df 형태로 저장
        globals()[f"{model_name}_df"] = df
        
        print(f"Loaded: {csv_file.name} from {folder.name}")
        print(f"  -> Saved as: {model_name}_df")
        print(f"  -> Shape: {df.shape}")
        print()

# 모든 {model}_df 형태의 dataframe들을 찾아서 merge
model_dfs = [v for k, v in globals().items() if k.endswith('_df') and isinstance(v, pd.DataFrame)]

if model_dfs:
    # 모든 dataframe을 세로로 연결 (row-wise concatenation)
    merged_df = pd.concat(model_dfs, ignore_index=True)
    
    # 결과를 evaluation_metrics.csv로 저장
    output_file = 'evaluation_metrics.csv'
    merged_df.to_csv(output_file, index=False)
    
    print(f"{'='*60}")
    print(f"Merged dataframe saved to: {output_file}")
    print(f"Total shape: {merged_df.shape}")
    print(f"Models included: {sorted(merged_df['model'].unique().tolist())}")
    print(f"{'='*60}")
    
    # 미리보기
    display(merged_df.head(10))
else:
    print("No CSV files found in results_ folders!")

Found 4 results_ folders:
  - results_lightgbm
  - results_lr
  - results_gru
  - results_sarimax

Loaded: lightgbm_evaluation_metrics.csv from results_lightgbm
  -> Saved as: lightgbm_df
  -> Shape: (25, 6)

Loaded: linear_evaluation_metrics.csv from results_lr
  -> Saved as: linear_df
  -> Shape: (25, 6)

Loaded: gru_evaluation_metrics.csv from results_gru
  -> Saved as: gru_df
  -> Shape: (25, 6)

Loaded: sarimax_evaluation_metrics.csv from results_sarimax
  -> Saved as: sarimax_df
  -> Shape: (13, 6)

Merged dataframe saved to: evaluation_metrics.csv
Total shape: (88, 8)
Models included: ['gru', 'lightgbm', 'linear', 'sarimax']


Unnamed: 0,Dataset,Method,Type,Model,MSE,model,Best_Model,Test_MSE
0,A,none,none,LightGBM,21608.745003,lightgbm,,
1,B,bodyText,orig,LightGBM,20816.544952,lightgbm,,
2,B,bodyText,pca,LightGBM,20840.110172,lightgbm,,
3,B,chunking,orig,LightGBM,20816.544952,lightgbm,,
4,B,chunking,pca,LightGBM,20822.397383,lightgbm,,
5,B,headlines,orig,LightGBM,20788.074689,lightgbm,,
6,B,headlines,pca,LightGBM,21132.148561,lightgbm,,
7,B,paragraphs,orig,LightGBM,20816.544952,lightgbm,,
8,B,paragraphs,pca,LightGBM,20807.381678,lightgbm,,
9,C,bodyText,orig,LightGBM,20932.902106,lightgbm,,
