In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Display settings: show up to 120 columns and use a 140-character wide text display.
pd.options.display.max_columns = 120
pd.options.display.width = 140

# Path to an external CSV/Excel dataset. Leave as None to fall back to the
# synthetic dataset built by the data-loading cell.
DATA_PATH = None

In [2]:
# Build the working frame `df`: read it from DATA_PATH when that points to an
# existing file, otherwise generate a seeded synthetic dataset and write a copy
# to disk. `data_source` records which branch ran.
if DATA_PATH is not None and Path(DATA_PATH).exists():
    p = Path(DATA_PATH)
    if p.suffix.lower() in {'.xlsx', '.xls'}:
        # pandas chooses and imports the Excel engine itself, so no explicit
        # engine import is done here; forcing `import openpyxl` would make
        # legacy .xls files (read through a different engine) fail needlessly.
        df = pd.read_excel(p)
    else:
        # Every non-Excel suffix is treated as CSV.
        df = pd.read_csv(p)
    data_source = 'external'
else:
    # Fixed seed so the synthetic dataset is identical on every run.
    rng = np.random.default_rng(42)
    n = 1000
    date = pd.date_range('2023-01-01', periods=n, freq='D')
    category = rng.choice(['A', 'B', 'C', 'D'], size=n, p=[0.4, 0.3, 0.2, 0.1])
    # abs() keeps the normally distributed prices non-negative.
    price = np.abs(rng.normal(100, 20, n)).round(2)
    volume = rng.lognormal(mean=10, sigma=0.5, size=n).astype(int)
    returns = rng.normal(0.001, 0.02, n)
    df = pd.DataFrame({
        'date': date,
        'category': category,
        'price': price,
        'volume': volume,
        'returns': returns,
    })
    # Blank out 3% of the prices (sampled without replacement) to simulate
    # missing values.
    idx = rng.choice(df.index, size=int(0.03 * n), replace=False)
    df.loc[idx, 'price'] = np.nan
    # Persist the synthetic data so the same file can be inspected or reused.
    out = Path('data/synthetic_feature_eng.csv')
    out.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out, index=False)
    data_source = f'synthetic->{out.as_posix()}'
df.head(3)

Unnamed: 0,date,category,price,volume,returns
0,2023-01-01,C,85.16,21071,-0.012632
1,2023-01-02,B,118.49,17761,0.041628
2,2023-01-03,C,100.69,32802,0.000351


In [3]:
# Add `log_price` = log(1 + price) whenever a `price` column is available.
has_price = 'price' in df
if has_price:
    price_values = df['price']
    df['log_price'] = np.log1p(price_values)

In [4]:
# Derive `returns` from `price` only when the dataset does not already carry it.
if 'returns' not in df.columns and 'price' in df.columns:
    # Put the rows in observation order first (by date when available,
    # otherwise by index) so pct_change compares each price with the previous
    # observation. The returns are computed directly on the sorted frame
    # rather than merged back on `date`: a merge on a non-unique date key is a
    # many-to-many join and would multiply rows.
    if 'date' in df.columns:
        # mergesort is stable, so rows sharing a date keep their relative order.
        df = df.sort_values('date', kind='mergesort')
    else:
        df = df.sort_index()
    df['returns'] = df['price'].pct_change()

# 5-observation rolling mean of returns, taken in chronological order.
if 'returns' in df.columns:
    if 'date' in df.columns:
        df = df.sort_values('date', kind='mergesort')
    # min_periods=1 yields a value from the first row on instead of NaN for
    # the first four rows.
    df['rolling_return_mean_5'] = df['returns'].rolling(5, min_periods=1).mean()

In [5]:
# One-hot encode `category` into `cat_<level>` indicator columns appended to df.
if 'category' in df.columns:
    dummies = pd.get_dummies(df['category'], prefix='cat')
    # Remove indicator columns left by an earlier execution of this cell before
    # appending the fresh ones; otherwise re-running the cell would add
    # duplicate `cat_*` column labels.
    df = pd.concat(
        [df.drop(columns=dummies.columns, errors='ignore'), dummies],
        axis=1,
    )

In [6]:
# Correlation overview restricted to the numeric columns of df.
# NOTE(review): the one-hot `cat_*` columns are absent from the displayed
# matrix, presumably because get_dummies produced boolean columns that
# np.number does not select - confirm if they are meant to be included.
num_df = df.select_dtypes(include=[np.number])
corr = num_df.corr(numeric_only=True)
if 'target' in num_df.columns:
    # With a target present, reduce the matrix to each feature's correlation
    # with the target, strongest positive correlation first.
    corr = corr['target'].sort_values(ascending=False)
# Both the full matrix and the target Series support .head(), so no fallback
# branch is needed for the display.
corr.head()

Unnamed: 0,price,volume,returns,log_price,rolling_return_mean_5
price,1.0,-0.011582,-0.006722,0.987946,0.005524
volume,-0.011582,1.0,0.028983,-0.023193,0.033549
returns,-0.006722,0.028983,1.0,-0.0005,0.450038
log_price,0.987946,-0.023193,-0.0005,1.0,0.004003
rolling_return_mean_5,0.005524,0.033549,0.450038,0.004003,1.0


In [7]:
# Write the engineered frame to data/processed/feature_engineered.csv,
# creating the directory if needed, and display the resulting path.
out = Path('data') / 'processed'
out.mkdir(parents=True, exist_ok=True)
out_file = out.joinpath('feature_engineered.csv')
df.to_csv(out_file, index=False)  # index omitted from the file
out_file.as_posix()

'data/processed/feature_engineered.csv'