### **Linear Regression**

**0. Import**

In [1]:
import gc
import json
import os
import traceback
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

**1. 경로 설정**

In [2]:
# Google Colab locations, kept for reference:
# DATA_DIR = Path("/content/drive/MyDrive/COSE362/data/feature_engineering")
# OUTPUT_DIR = Path("/content/drive/MyDrive/COSE362/data/prediction_output")

# Local locations used by this notebook.
DATA_DIR = Path("../feature_datasets")        # input Parquet feature datasets
OUTPUT_DIR = Path("results_lr")               # summary CSV is written here
RESULTS_DIR = OUTPUT_DIR.joinpath("results")  # per-dataset prediction JSON files

# Make sure both output folders exist before anything is written to them.
for _folder in (OUTPUT_DIR, RESULTS_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

print(f"Data Source: {DATA_DIR}")
print(f"Output Path: {OUTPUT_DIR}")

Data Source: ../feature_datasets
Output Path: results_lr


**2. 데이터 로드**

In [3]:
def load_data(file_path):
    """
    Read a Parquet dataset and attach a ``Date`` column.

    Args:
        file_path: ``pathlib.Path`` of the Parquet file (its ``.name`` is logged).

    Returns:
        DataFrame sorted by ``date_index`` (when that column exists, with a
        fresh 0..n-1 index) and with a datetime ``Date`` column parsed from
        ``pub_date``. When ``pub_date`` is missing, ``Date`` is a synthetic
        daily range starting at 2017-01-01, one day per row.
    """
    print(f"Loading {file_path.name}...")
    frame = pd.read_parquet(file_path)

    # Chronological order, if the dataset carries a date index.
    if 'date_index' in frame.columns:
        frame = frame.sort_values('date_index').reset_index(drop=True)

    if 'pub_date' not in frame.columns:
        # Fallback: no publication date available, so fabricate one day per row.
        print("   [Warning] No 'pub_date' found. Using default date range.")
        frame['Date'] = pd.date_range(start='2017-01-01', periods=len(frame), freq='D')
    else:
        # pub_date is underscore-separated, e.g. '2019_12_30'.
        frame['Date'] = pd.to_datetime(frame['pub_date'], format='%Y_%m_%d')
        print(f"   Using 'pub_date' for Date column")

    n_rows = len(frame)
    n_cols = len(frame.columns)
    print(f"   Loaded {n_rows} rows, {n_cols} columns")

    first_day = frame['Date'].min().strftime('%Y-%m-%d')
    last_day = frame['Date'].max().strftime('%Y-%m-%d')
    print(f"   ✅ Date range: {first_day} to {last_day}")

    return frame

**3. 전처리 & 가중치 생성 함수**

In [4]:
def preprocess_and_split(df, target_col='value', train_end='2018-12-31', valid_end='2019-06-30'):
    """
    Build the next-date target and sample weights, then split chronologically.

    Steps:
      1. Take one ``target_col`` value per ``date_index`` and use the value of
         the *next* ``date_index`` as ``target``. Rows of the last date (no
         next value) are dropped.
      2. Give each row the weight ``1 / (number of rows sharing its
         date_index)``, so every date contributes a total weight of 1.
      3. Remove metadata, date, target and weight columns to obtain ``X``.
      4. Split on the ``Date`` column into train / valid / test.

    Args:
        df: DataFrame containing at least ``date_index``, ``target_col`` and
            ``Date`` columns.
        target_col: Name of the column the target is derived from. It is always
            excluded from the features.
        train_end: Last date (inclusive) of the training split.
        valid_end: Last date (inclusive) of the validation split; everything
            later is the test split.

    Returns:
        ``((X_train, y_train, w_train), (X_valid, y_valid, w_valid),
        (X_test, y_test, w_test, dates_test))``.

    Raises:
        ValueError: if ``df`` has no ``Date`` column.
    """

    # ========================================
    # 1. Target creation
    # ========================================
    # Exactly one value per date: guarantees that shift(-1) means "next date"
    # and that the merge below cannot multiply article rows.
    next_value_by_date = (
        df.groupby('date_index', sort=True)[target_col]
        .first()
        .shift(-1)
        .rename('target')
        .reset_index()
    )

    df = df.drop(columns=['target'], errors='ignore')
    df = df.merge(next_value_by_date, on='date_index', how='left')
    df = df.dropna(subset=['target'])

    print(f"   After target creation: {len(df)} rows")

    # ========================================
    # 2. Sample weights
    # ========================================
    # Vectorized 1 / (rows on the same date).
    rows_per_date = df.groupby('date_index')['date_index'].transform('count')
    df = df.assign(sample_weight=1.0 / rows_per_date)

    print(f"   Sample weights: min={df['sample_weight'].min():.4f}, "
          f"max={df['sample_weight'].max():.4f}, mean={df['sample_weight'].mean():.4f}")

    # ========================================
    # 3. Pull out Date first (only used for splitting)
    # ========================================
    if 'Date' not in df.columns:
        raise ValueError("'Date' column not found in dataframe")

    dates = df['Date'].copy()

    # Date must never reach the feature matrix.
    df = df.drop(columns=['Date'])

    # ========================================
    # 4. Columns excluded from the features
    # ========================================
    cols_to_drop = [
        # Metadata
        'person',
        'person_id',
        'article_id',

        # Raw date string (Date itself was removed above)
        'pub_date',

        # Target-related: the source column (whatever its name) and the target
        target_col,
        'value',
        'target',

        # Fear-Greed
        'fg_value',

        # Weight
        'sample_weight',
    ]
    # De-duplicate while keeping order (target_col defaults to 'value').
    cols_to_drop = list(dict.fromkeys(cols_to_drop))

    # Only drop columns that are actually present.
    actual_drop = [c for c in cols_to_drop if c in df.columns]
    print(f"   Dropping columns: {actual_drop}")

    # ========================================
    # 5. Extract X, y, weights
    # ========================================
    X = df.drop(columns=actual_drop, errors='ignore')
    y = df['target'].copy()
    weights = df['sample_weight'].copy()

    # Safety net: no datetime column (naive or tz-aware) may remain in X.
    datetime_cols = X.select_dtypes(include=['datetime', 'datetimetz']).columns.tolist()
    if datetime_cols:
        print(f"   ⚠️ WARNING: Found datetime columns in X: {datetime_cols}")
        X = X.drop(columns=datetime_cols)
        print(f"   Removed datetime columns from X")

    print(f"   Feature columns ({len(X.columns)}): {list(X.columns[:10])}...")

    # ========================================
    # 6. Train/Valid/Test split
    # ========================================
    # Contiguous, non-overlapping ranges: every row lands in exactly one split.
    train_mask = (dates <= train_end)
    valid_mask = (dates > train_end) & (dates <= valid_end)
    test_mask = (dates > valid_end)

    print(f"   Train: {train_mask.sum()} rows")
    print(f"   Valid: {valid_mask.sum()} rows")
    print(f"   Test:  {test_mask.sum()} rows")

    return (
        (X[train_mask], y[train_mask], weights[train_mask]),
        (X[valid_mask], y[valid_mask], weights[valid_mask]),
        (X[test_mask], y[test_mask], weights[test_mask], dates[test_mask])
    )

**4. 모델 튜닝 및 학습 함수**

In [5]:
def train_linear_models(dataset_name, file_path, alphas=None):
    """
    Fit Linear / Ridge / Lasso candidates on one dataset and save test predictions.

    Every candidate is a ``StandardScaler`` -> regressor pipeline fitted with
    the per-row sample weights. The candidate with the lowest validation MSE
    is evaluated on the test split and its predictions are written to
    ``RESULTS_DIR / f"pred_linear_{dataset_name}.json"``. Validation and test
    MSE are computed without sample weights.

    If that JSON file already exists, the experiment is skipped and the test
    MSE is recomputed from the stored predictions. If the stored file cannot
    be read, the experiment is re-run.

    Args:
        dataset_name: Short dataset identifier used in logs and the file name.
        file_path: ``pathlib.Path`` of the Parquet dataset.
        alphas: Iterable of regularization strengths tried for both Ridge and
            Lasso. Defaults to ``[0.1]``. The grid has ``1 + 2 * len(alphas)``
            candidates (one plain LinearRegression plus Ridge and Lasso per
            alpha).

    Returns:
        ``(best_info, test_mse)``: a description of the selected model
        (``"Cached"`` when an existing result file was reused) and its test MSE.

    Raises:
        ValueError: if the train, validation or test split is empty.
        RuntimeError: if no candidate produced a finite validation MSE.
    """
    alphas = [0.1] if alphas is None else list(alphas)

    # ========================================
    # 0. Skip if a result file already exists
    # ========================================
    output_path = RESULTS_DIR / f"pred_linear_{dataset_name}.json"

    if output_path.exists():
        print(f"\n{'='*60}")
        print(f"⏭️  SKIPPING: {dataset_name}")
        print(f"{'='*60}")
        print(f"   Result already exists: {output_path.name}")

        # Best effort: recompute the MSE from the stored predictions.
        try:
            with open(output_path, 'r') as f:
                result_data = json.load(f)

            actuals = [item['actual'] for item in result_data]
            preds = [item['predicted'] for item in result_data]
            mse = mean_squared_error(actuals, preds)

            # The file does not record which model produced it.
            print(f"   ✅ Cached Test MSE: {mse:.4f}")

            return "Cached", mse

        except Exception as e:
            # Unreadable/corrupt cache: fall through and recompute.
            print(f"   ⚠️ Warning: Could not read cached MSE: {e}")
            print(f"   Re-running experiment...")

    # ========================================
    # 1. Load and preprocess
    # ========================================
    print(f"\n{'='*60}")
    print(f"Processing: {dataset_name}")
    print(f"{'='*60}")

    df = load_data(file_path)

    (X_train, y_train, w_train), \
    (X_valid, y_valid, w_valid), \
    (X_test, y_test, w_test, dates_test) = preprocess_and_split(df)

    del df
    gc.collect()

    # Fail with a clear message instead of an opaque estimator error.
    for split_name, split_X in (('train', X_train), ('valid', X_valid), ('test', X_test)):
        if len(split_X) == 0:
            raise ValueError(f"{dataset_name}: '{split_name}' split is empty")

    # ========================================
    # 2. Grid search configuration
    # ========================================
    # (label, estimator class, constructor kwargs); estimators are built
    # lazily so only the best fitted pipeline is kept alive.
    candidates = [('Linear', LinearRegression, {})]
    candidates += [(f"Ridge(alpha={a})", Ridge, {'alpha': a}) for a in alphas]
    candidates += [(f"Lasso(alpha={a})", Lasso, {'alpha': a, 'max_iter': 2000}) for a in alphas]

    print(f"   Grid Search Configuration:")
    print(f"   - Linear: 1 combination")
    print(f"   - Ridge: {len(alphas)} combinations (alphas={alphas})")
    print(f"   - Lasso: {len(alphas)} combinations (alphas={alphas})")
    print(f"   - Total: {1 + len(alphas)*2} combinations\n")

    # ========================================
    # 3. Grid search
    # ========================================
    best_mse = float('inf')
    best_model = None
    best_info = ""

    for label, estimator_cls, estimator_kwargs in candidates:
        print(f"   Training {label}...")

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', estimator_cls(**estimator_kwargs))
        ])

        # `model__` routes sample_weight to the regressor step only.
        pipeline.fit(X_train, y_train, model__sample_weight=w_train)
        val_pred = pipeline.predict(X_valid)
        val_mse = mean_squared_error(y_valid, val_pred)

        print(f"      Valid MSE: {val_mse:.4f}")

        # Strict '<' keeps the earliest candidate on ties.
        if val_mse < best_mse:
            best_mse = val_mse
            best_model = pipeline
            best_info = label

    if best_model is None:
        raise RuntimeError(f"{dataset_name}: no candidate produced a finite validation MSE")

    # ========================================
    # 4. Evaluate the best model
    # ========================================
    print(f"\n   ✅ Best Model: {best_info}")
    print(f"   ✅ Validation MSE: {best_mse:.4f}")

    test_pred = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_pred)
    print(f"   ✅ Test MSE: {test_mse:.4f}")

    # ========================================
    # 5. Save predictions (RESULTS_DIR)
    # ========================================
    result_data = [
        {
            "date": date.strftime('%Y-%m-%d'),
            "actual": float(actual),
            "predicted": float(pred)
        }
        for date, actual, pred in zip(dates_test, y_test, test_pred)
    ]

    # Write to a temp file and rename, so an interrupted run never leaves a
    # truncated JSON that the existence check above would pick up.
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    with open(tmp_path, "w") as f:
        json.dump(result_data, f, indent=4)
    tmp_path.replace(output_path)

    print(f"   Saved predictions to: results/{output_path.name}")

    # Release the large arrays before the next dataset is loaded.
    del X_train, y_train, w_train, X_valid, y_valid, w_valid, X_test, y_test, w_test
    gc.collect()

    return best_info, test_mse

**5. Main Execution Loop**

In [6]:
# ========================================
# Main Execution Loop
# ========================================
# Runs train_linear_models on the baseline dataset A and on every
# level x method x type combination, collecting one metrics row per dataset.

levels = ['B', 'C', 'D']
methods = ['headlines', 'chunking', 'bodyText', 'paragraphs']
types = ['pca', 'orig']

metrics_list = []


def _run_and_record(dname, fpath, level, method, dtype, error_prefix):
    """Train on one dataset and append its metrics row; log (not raise) failures."""
    try:
        info, mse = train_linear_models(dname, fpath)
        metrics_list.append({
            "Dataset": level,
            "Method": method,
            "Type": dtype,
            "Best_Model": info,
            "Test_MSE": mse
        })
    except Exception as e:
        # One failing dataset must not abort the whole sweep.
        print(f"{error_prefix}: {e}")
        traceback.print_exc()
    finally:
        gc.collect()


# Dataset A (Baseline)
path_A = DATA_DIR / "dataset_A.parquet"
if path_A.exists():
    print("\n" + "="*60)
    print("BASELINE: Dataset A")
    print("="*60)

    _run_and_record("A", path_A, "A", "-", "-", "❌ Error on Dataset A")
else:
    print(f"Warning: {path_A} not found. Skipping Dataset A.")

# Dataset B, C, D
total_datasets = len(levels) * len(methods) * len(types)
current = 0

for level in levels:
    for method in methods:
        for dtype in types:
            current += 1
            fname = f"dataset_{level}_{method}_{dtype}.parquet"
            fpath = DATA_DIR / fname

            if not fpath.exists():
                print(f"\n[{current}/{total_datasets}] Skipping {fname}: File not found.")
                continue

            dname = f"{level}_{method}_{dtype}"
            _run_and_record(dname, fpath, level, method, dtype, f"\n❌ Error on {dname}")

# ========================================
# Collect and save the final results
# ========================================
# Explicit columns keep sort_values working even when no dataset produced
# a metrics row (all files missing or every run failed).
metrics_columns = ["Dataset", "Method", "Type", "Best_Model", "Test_MSE"]
final_df = pd.DataFrame(metrics_list, columns=metrics_columns).sort_values("Test_MSE")

# Summary CSV goes to OUTPUT_DIR (results_lr/).
csv_path = OUTPUT_DIR / "linear_evaluation_metrics.csv"
final_df.to_csv(csv_path, index=False)

print("\n" + "="*60)
print("ALL TASKS COMPLETED")
print("="*60)
print(f"\nResults saved to: {csv_path}")
print(f"\nTop 10 Models by Test MSE:")
print(final_df.head(10))

# Timing note from the author: 10 datasets took about 60 minutes.


BASELINE: Dataset A

⏭️  SKIPPING: A
   Result already exists: pred_linear_A.json
   ✅ Cached Test MSE: 1015.6601

⏭️  SKIPPING: B_headlines_pca
   Result already exists: pred_linear_B_headlines_pca.json
   ✅ Cached Test MSE: 1105.7666

⏭️  SKIPPING: B_headlines_orig
   Result already exists: pred_linear_B_headlines_orig.json
   ✅ Cached Test MSE: 1098.6073

⏭️  SKIPPING: B_chunking_pca
   Result already exists: pred_linear_B_chunking_pca.json
   ✅ Cached Test MSE: 1104.6235

⏭️  SKIPPING: B_chunking_orig
   Result already exists: pred_linear_B_chunking_orig.json
   ✅ Cached Test MSE: 1099.9708

⏭️  SKIPPING: B_bodyText_pca
   Result already exists: pred_linear_B_bodyText_pca.json
   ✅ Cached Test MSE: 1096.2533

⏭️  SKIPPING: B_bodyText_orig
   Result already exists: pred_linear_B_bodyText_orig.json
   ✅ Cached Test MSE: 1098.0591

⏭️  SKIPPING: B_paragraphs_pca
   Result already exists: pred_linear_B_paragraphs_pca.json
   ✅ Cached Test MSE: 1101.5742

⏭️  SKIPPING: B_paragraphs_ori

KeyboardInterrupt: 