<a href="https://colab.research.google.com/github/noamgafni/STA160Project/blob/main/STA160_Project_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files
import os, json

uploaded = files.upload()

os.makedirs('/root/.kaggle', exist_ok=True)
os.replace('kaggle.json', '/root/.kaggle/kaggle.json')
os.chmod('/root/.kaggle/kaggle.json', 0o600)

with open('/root/.kaggle/kaggle.json','r') as f:
    creds = json.load(f)
os.environ['KAGGLE_USERNAME'] = creds['username']
os.environ['KAGGLE_KEY'] = creds['key']

print("Kaggle API set up ✔")


Saving kaggle.json to kaggle.json
Kaggle API set up ✔


In [3]:
!pip -q install kagglehub
import kagglehub, os, glob

DS = "isaaclopgu/cryptocurrency-historical-prices-top-100-2025"
path = kagglehub.dataset_download(DS)

print("Path to dataset files:", path)
print("Sample files:")
for p in glob.glob(os.path.join(path, "**/*"), recursive=True)[:10]:
    print("  ", p)


Downloading from https://www.kaggle.com/api/v1/datasets/download/isaaclopgu/cryptocurrency-historical-prices-top-100-2025?dataset_version_number=67...


100%|██████████| 12.5M/12.5M [00:00<00:00, 77.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/isaaclopgu/cryptocurrency-historical-prices-top-100-2025/versions/67
Sample files:
   /root/.cache/kagglehub/datasets/isaaclopgu/cryptocurrency-historical-prices-top-100-2025/versions/67/Crypto_historical_data.csv


In [4]:
import pandas as pd, glob, os

csv_paths = glob.glob(os.path.join(path, "**/*.csv"), recursive=True)
print("CSV count:", len(csv_paths))
pd.read_csv(csv_paths[0]).head()


CSV count: 1


Unnamed: 0,Date,Open,High,Low,Close,Volume,ticker,name
0,2017-11-09 00:00:00+00:00,0.001207,0.001415,0.001181,0.001415,6259550,DOGE-USD,Dogecoin
1,2017-11-10 00:00:00+00:00,0.001421,0.001431,0.001125,0.001163,4246520,DOGE-USD,Dogecoin
2,2017-11-11 00:00:00+00:00,0.001146,0.001257,0.001141,0.001201,2231080,DOGE-USD,Dogecoin
3,2017-11-12 00:00:00+00:00,0.001189,0.00121,0.001002,0.001038,3288960,DOGE-USD,Dogecoin
4,2017-11-13 00:00:00+00:00,0.001046,0.001212,0.001019,0.001211,2481270,DOGE-USD,Dogecoin


In [5]:
import pandas as pd, numpy as np, re

def symbol_from_filename(p):
    b = os.path.basename(p)
    s = re.sub(r'\.csv$','', b)
    s = re.sub(r'[^A-Za-z0-9]','', s)
    return s.upper()

def load_and_standardize(p):
    df = pd.read_csv(p)
    df.columns = [c.strip().lower() for c in df.columns]

    rename = {}
    for c in df.columns:
        if c in ["date","datetime","timestamp","time"]: rename[c] = "timestamp"
        elif c in ["open","o"]: rename[c] = "open"
        elif c in ["high","h"]: rename[c] = "high"
        elif c in ["low","l"]: rename[c] = "low"
        elif c in ["close","c","price","adj close","adjusted close"]: rename[c] = "close"
        elif c in ["volume","vol","base volume","volume usd","quote_volume"]: rename[c] = "volume"
        elif c in ["symbol","ticker","asset","coin"]: rename[c] = "symbol"

    df = df.rename(columns=rename)
    if "symbol" not in df.columns:
        df["symbol"] = symbol_from_filename(p)

    keep = [c for c in ["timestamp","open","high","low","close","volume","symbol"] if c in df.columns]
    df = df[keep].copy()

    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
    for c in ["open","high","low","close","volume"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

frames = []
for p in csv_paths:
    try:
        frames.append(load_and_standardize(p))
    except Exception as e:
        print("Skip:", p, "->", e)

raw = pd.concat(frames, ignore_index=True)
raw.shape


(343372, 7)

In [6]:
df = raw.dropna(subset=["timestamp","open","high","low","close"]).copy()
if "volume" in df.columns:
    df["volume"] = df["volume"].fillna(0)

# Sort & drop duplicates
df = df.sort_values(["symbol","timestamp"]).drop_duplicates(subset=["symbol","timestamp"])

# Hard rule violations
bad = (
    (df["high"] < df["low"]) |
    (df["open"] < df["low"]) | (df["open"] > df["high"]) |
    (df["close"] < df["low"]) | (df["close"] > df["high"]) |
    (df[["open","high","low","close"]] <= 0).any(axis=1) |
    (df["volume"] < 0)
)
print("Dropping bad rows:", int(bad.sum()))
df = df.loc[~bad].copy()

# Align to daily frequency (7 days/week)
def to_daily(g):
    g = g.set_index("timestamp").sort_index().asfreq("D")
    return g

df = (df.groupby("symbol", group_keys=False)
        .apply(to_daily)
        .reset_index())

# Remove rows with missing prices after alignment
df = df.dropna(subset=["open","high","low","close"]).copy()

print("Clean shape:", df.shape)
df.head()

Dropping bad rows: 95
Clean shape: (311917, 7)


  .apply(to_daily)


Unnamed: 0,timestamp,open,high,low,close,volume,symbol
0,2025-10-02 00:00:00+00:00,0.2308,0.750199,0.2308,0.60125,1397452000.0,2Z-USD
1,2025-10-03 00:00:00+00:00,0.601254,0.616968,0.511807,0.522214,568815600.0,2Z-USD
2,2025-10-04 00:00:00+00:00,0.522214,0.527714,0.475263,0.4975,235916100.0,2Z-USD
3,2025-10-05 00:00:00+00:00,0.497499,0.510207,0.482347,0.491295,241544900.0,2Z-USD
4,2025-10-06 00:00:00+00:00,0.491295,0.495996,0.476693,0.495729,287302000.0,2Z-USD


In [7]:
df = df.sort_values(['symbol','timestamp']).reset_index(drop=True)

# Log return
df['log_ret'] = df.groupby('symbol')['close'].transform(
    lambda s: np.log(s / s.shift(1))
)

# Next-day return
df['log_ret_t1'] = df.groupby('symbol')['log_ret'].shift(-1)

first_rows = df.groupby('symbol', as_index=False).head(1)
assert first_rows['log_ret'].isna().all()

df = df.replace([np.inf, -np.inf], np.nan)

df.tail()


Unnamed: 0,timestamp,open,high,low,close,volume,symbol,log_ret,log_ret_t1
311912,2024-09-19 00:00:00+00:00,7.988627,7.988627,7.988627,7.988627,0.0,ZORA-USD,0.0,0.0
311913,2024-09-20 00:00:00+00:00,7.988627,7.988627,7.988627,7.988627,0.0,ZORA-USD,0.0,0.0
311914,2024-09-21 00:00:00+00:00,7.988627,7.988627,7.988627,7.988627,0.0,ZORA-USD,0.0,0.0
311915,2024-09-22 00:00:00+00:00,7.988627,7.988627,7.988627,7.988627,0.0,ZORA-USD,0.0,0.0
311916,2024-09-23 00:00:00+00:00,7.988627,7.988627,7.988627,7.988627,0.0,ZORA-USD,0.0,


In [8]:
# Rolling 95th percentile of absolute returns per symbol
def rolling_abs_q95(s, lookback=252):
    return s.abs().rolling(lookback, min_periods=100).quantile(0.95)

df['abs_q95'] = df.groupby('symbol')['log_ret'].transform(rolling_abs_q95)

# Tail-event label
df['tail_event_t1'] = (
    df['log_ret_t1'].abs() >= df.groupby('symbol')['abs_q95'].shift(1)
).astype('Int64')

# Drop rows without targets
df_model = df.dropna(subset=['log_ret', 'log_ret_t1', 'abs_q95']).copy()

print(df_model[['symbol','timestamp','log_ret','log_ret_t1','abs_q95','tail_event_t1']].head())


    symbol                 timestamp   log_ret  log_ret_t1   abs_q95  \
132  A-USD 2019-07-25 00:00:00+00:00  0.011479    0.347248  1.058644   
133  A-USD 2019-07-26 00:00:00+00:00  0.347248   -0.101958  1.057311   
134  A-USD 2019-07-27 00:00:00+00:00 -0.101958    0.327572  1.056068   
135  A-USD 2019-07-28 00:00:00+00:00  0.327572   -0.202722  1.054825   
136  A-USD 2019-07-29 00:00:00+00:00 -0.202722   -0.003537  1.053581   

     tail_event_t1  
132              0  
133              0  
134              0  
135              0  
136              0  


In [9]:
if 'df' in globals():
    base = df
elif 'df_model' in globals():
    base = df_model
elif 'out' in globals():
    base = out
else:
    raise RuntimeError("No in-memory DataFrame found. Make sure `df` (or `df_model`/`out`) exists.")

oh = base[['timestamp','symbol','volume']].copy()
oh['timestamp'] = pd.to_datetime(oh['timestamp'], errors='coerce', utc=True)
oh['volume'] = pd.to_numeric(oh['volume'], errors='coerce')

vol_profile = (
    oh.assign(is_zero = (oh['volume'].fillna(0) == 0))
      .groupby('symbol', as_index=False)
      .agg(days=('timestamp','count'),
           zero_days=('is_zero','sum'),
           zero_share=('is_zero','mean'),
           med_vol=('volume','median'),
           mean_vol=('volume','mean'))
      .sort_values(['zero_days','zero_share'], ascending=False)
)

bad_syms = vol_profile.loc[vol_profile['zero_days'] > 0, 'symbol'].tolist()

print(f"Total symbols: {vol_profile['symbol'].nunique()}")
print(f"Symbols with ANY zero-volume day: {len(bad_syms)}")
print("First 20 flagged symbols:", bad_syms[:20])

display(vol_profile.head(20))

Total symbols: 212
Symbols with ANY zero-volume day: 87
First 20 flagged symbols: ['ARB-USD', 'A-USD', 'UBTC-USD', 'USDF-USD', 'TIA-USD', 'SKY-USD', 'JUP-USD', 'MNT-USD', 'MYX-USD', 'TRUMP-USD', 'APE-USD', 'SBTC-USD', 'HYPE-USD', 'NFT-USD', 'GRT-USD', 'USDE-USD', 'VSN-USD', 'OSETH-USD', 'WAL-USD', 'ETHX-USD']


Unnamed: 0,symbol,days,zero_days,zero_share,med_vol,mean_vol
10,ARB-USD,2898,2158,0.744651,0.0,46449.48
1,A-USD,2065,1282,0.620823,0.0,20503.66
166,UBTC-USD,2877,1147,0.398679,19584.0,124030.6
175,USDF-USD,1889,1137,0.601906,0.0,521214.8
159,TIA-USD,1344,640,0.47619,128.5,18992110.0
143,SKY-USD,2916,555,0.190329,129709.5,794033.4
91,JUP-USD,2845,509,0.17891,6661.0,425917.4
107,MNT-USD,468,419,0.895299,0.0,29473.02
110,MYX-USD,731,369,0.504788,0.0,26580.11
162,TRUMP-USD,2722,363,0.133358,401.0,15133.69


In [10]:
# === Export only the cleaned OHLCV columns (timestamp, open, high, low, close, volume, symbol) ===
import os

# Pick whichever DataFrame you have in memory
base = None
if 'df' in globals():
    base = df
elif 'df_model' in globals():
    base = df_model
elif 'raw' in globals():
    base = raw
else:
    raise RuntimeError("No DataFrame found (df / df_model / raw).")

# Select ONLY the OHLCV + symbol columns
cols = ["timestamp", "open", "high", "low", "close", "volume", "symbol"]
out = base[cols].copy()

# Save
os.makedirs("/content/share", exist_ok=True)
out.to_parquet("/content/share/ohlcv_clean.parquet", index=False)
out.to_csv("/content/share/ohlcv_clean.csv", index=False)
out["symbol"].drop_duplicates().sort_values().to_csv("/content/share/symbols.txt", index=False, header=False)

print("Saved to /content/share:")
!ls -lh /content/share


Saved to /content/share:
total 46M
-rw-r--r-- 1 root root  36M Nov  9 01:20 ohlcv_clean.csv
-rw-r--r-- 1 root root  11M Nov  9 01:20 ohlcv_clean.parquet
-rw-r--r-- 1 root root 1.9K Nov  9 01:20 symbols.txt


In [11]:
# ============================================
# NEW CELL: VOLATILITY FEATURE ENGINEERING
# Insert this after Cell 5 (returns) and before Cell 6 (tail labeling)
# ============================================

import numpy as np
import pandas as pd

print("=== Creating Volatility Features ===\n")

# Ensure data is sorted
df = df.sort_values(['symbol', 'timestamp']).reset_index(drop=True)

# ============================================
# 1. CLOSE-TO-CLOSE VOLATILITY (Standard Historical Vol)
# ============================================
def calc_close_volatility(group, windows=[7, 21, 30, 60]):
    """Rolling standard deviation of log returns"""
    log_ret = group['log_ret']
    result = {}
    for w in windows:
        result[f'vol_close_{w}d'] = log_ret.rolling(w, min_periods=max(5, w//2)).std()
    return pd.DataFrame(result, index=group.index)

close_vol = df.groupby('symbol', group_keys=False).apply(calc_close_volatility)
for col in close_vol.columns:
    df[col] = close_vol[col].values

print(f"✓ Close-to-close volatility: {len(close_vol.columns)} features")

# ============================================
# 2. PARKINSON VOLATILITY (High-Low Range Estimator)
# More efficient than close-to-close, uses intraday range
# ============================================
def calc_parkinson_vol(group, windows=[7, 21, 30, 60]):
    """
    Parkinson (1980) volatility estimator
    More efficient than close-to-close when no drift
    Formula: sqrt(1/(4*ln(2)) * (ln(H/L))^2)
    """
    hl_ratio = np.log(group['high'] / group['low'])
    parkinson_sq = (hl_ratio ** 2) / (4 * np.log(2))

    result = {}
    for w in windows:
        result[f'vol_parkinson_{w}d'] = np.sqrt(
            parkinson_sq.rolling(w, min_periods=max(5, w//2)).mean()
        )
    return pd.DataFrame(result, index=group.index)

park_vol = df.groupby('symbol', group_keys=False).apply(calc_parkinson_vol)
for col in park_vol.columns:
    df[col] = park_vol[col].values

print(f"✓ Parkinson volatility: {len(park_vol.columns)} features")

# ============================================
# 3. GARMAN-KLASS VOLATILITY (Uses OHLC)
# Most efficient unbiased estimator using all OHLC data
# ============================================
def calc_garman_klass_vol(group, windows=[7, 21, 30, 60]):
    """
    Garman-Klass (1980) volatility estimator
    Uses OHLC data for more efficient estimation
    """
    h = np.log(group['high'] / group['open'])
    l = np.log(group['low'] / group['open'])
    c = np.log(group['close'] / group['open'])

    # Garman-Klass formula
    gk_sq = 0.5 * (h - l)**2 - (2*np.log(2) - 1) * c**2

    result = {}
    for w in windows:
        result[f'vol_gk_{w}d'] = np.sqrt(
            gk_sq.rolling(w, min_periods=max(5, w//2)).mean()
        )
    return pd.DataFrame(result, index=group.index)

gk_vol = df.groupby('symbol', group_keys=False).apply(calc_garman_klass_vol)
for col in gk_vol.columns:
    df[col] = gk_vol[col].values

print(f"✓ Garman-Klass volatility: {len(gk_vol.columns)} features")

# ============================================
# 4. EWMA VOLATILITY (Exponentially Weighted Moving Average)
# Gives more weight to recent observations
# ============================================
def calc_ewma_vol(group, spans=[7, 21, 60]):
    """
    EWMA volatility - weights recent data more heavily
    Common in risk management (e.g., RiskMetrics uses span=94 for daily)
    """
    log_ret_sq = group['log_ret'] ** 2
    result = {}
    for span in spans:
        result[f'vol_ewma_{span}d'] = np.sqrt(
            log_ret_sq.ewm(span=span, min_periods=max(3, span//3)).mean()
        )
    return pd.DataFrame(result, index=group.index)

ewma_vol = df.groupby('symbol', group_keys=False).apply(calc_ewma_vol)
for col in ewma_vol.columns:
    df[col] = ewma_vol[col].values

print(f"✓ EWMA volatility: {len(ewma_vol.columns)} features")

# ============================================
# 5. REALIZED VOLATILITY (Intraday Range as %)
# Simple but effective: (High - Low) / Close
# ============================================
def calc_realized_vol(group, windows=[7, 21, 30, 60]):
    """
    Realized intraday volatility as percentage
    Measures average daily range
    """
    daily_range_pct = (group['high'] - group['low']) / group['close']
    result = {}
    for w in windows:
        result[f'vol_realized_{w}d'] = daily_range_pct.rolling(w, min_periods=max(5, w//2)).mean()
    return pd.DataFrame(result, index=group.index)

real_vol = df.groupby('symbol', group_keys=False).apply(calc_realized_vol)
for col in real_vol.columns:
    df[col] = real_vol[col].values

print(f"✓ Realized volatility: {len(real_vol.columns)} features")

# ============================================
# 6. VOLATILITY REGIME FEATURES
# Detect high/low volatility periods
# ============================================
def calc_vol_regime(group):
    """
    Volatility regime indicators:
    - Short/long volatility ratios
    - Percentile ranks
    - Regime changes
    """
    vol_7d = group['vol_close_7d']
    vol_60d = group['vol_close_60d']

    result = pd.DataFrame(index=group.index)

    # Ratio of short to long-term vol (vol clustering indicator)
    result['vol_ratio_7_60'] = vol_7d / vol_60d

    # Percentile rank of current vol in 252-day window (1 year)
    result['vol_percentile'] = vol_7d.rolling(252, min_periods=100).apply(
        lambda x: pd.Series(x).rank(pct=True).iloc[-1] if len(x) > 0 else np.nan,
        raw=False
    )

    # High volatility regime indicator (>75th percentile)
    vol_75th = vol_7d.rolling(252, min_periods=100).quantile(0.75)
    result['vol_regime_high'] = (vol_7d > vol_75th).astype(int)

    return result

vol_regime = df.groupby('symbol', group_keys=False).apply(calc_vol_regime)
for col in vol_regime.columns:
    df[col] = vol_regime[col].values

print(f"✓ Volatility regime features: {len(vol_regime.columns)} features")

# ============================================
# 7. VOLATILITY OF VOLATILITY (Vol-of-Vol)
# Measures stability/instability of volatility itself
# ============================================
def calc_vol_of_vol(group, windows=[21, 60]):
    """
    Volatility of volatility - measures how unstable vol is
    High vol-of-vol often precedes tail events
    """
    vol = group['vol_close_7d']
    result = {}
    for w in windows:
        result[f'volvol_{w}d'] = vol.rolling(w, min_periods=max(10, w//2)).std()
    return pd.DataFrame(result, index=group.index)

vv = df.groupby('symbol', group_keys=False).apply(calc_vol_of_vol)
for col in vv.columns:
    df[col] = vv[col].values

print(f"✓ Vol-of-vol features: {len(vv.columns)} features")

# ============================================
# 8. VOLUME-WEIGHTED VOLATILITY
# Combines price volatility with volume patterns
# ============================================
def calc_volume_vol(group, windows=[7, 21, 30]):
    """
    Volume-weighted volatility indicators
    High vol + high volume = more reliable signal
    """
    abs_ret = group['log_ret'].abs()
    vol = group['volume']
    vol_normalized = vol / vol.rolling(60, min_periods=30).mean()  # Normalize volume

    result = {}
    for w in windows:
        # Volatility weighted by relative volume
        result[f'vol_volume_wtd_{w}d'] = (
            (abs_ret * vol_normalized).rolling(w, min_periods=max(3, w//2)).mean()
        )
    return pd.DataFrame(result, index=group.index)

vol_vol = df.groupby('symbol', group_keys=False).apply(calc_volume_vol)
for col in vol_vol.columns:
    df[col] = vol_vol[col].values

print(f"✓ Volume-weighted volatility: {len(vol_vol.columns)} features")

# ============================================
# CLEAN UP & VALIDATION
# ============================================

# Replace infinities with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# Count total volatility features added
vol_cols = [c for c in df.columns if c.startswith('vol_') or c.startswith('volvol_')]
print(f"\n{'='*50}")
print(f"TOTAL VOLATILITY FEATURES CREATED: {len(vol_cols)}")
print(f"{'='*50}\n")

# Show summary statistics for key features
print("Sample volatility feature statistics:")
sample_cols = ['vol_close_21d', 'vol_parkinson_21d', 'vol_gk_21d',
               'vol_ewma_21d', 'vol_ratio_7_60', 'volvol_21d']
existing_sample = [c for c in sample_cols if c in df.columns]
if existing_sample:
    print(df[existing_sample].describe().round(6))

# Validate no lookahead bias
print("\n=== Lookahead Bias Check ===")
first_rows = df.groupby('symbol').head(1)
print(f"First row per symbol with non-NaN vol features: {first_rows[vol_cols].notna().any(axis=1).sum()}")
print("(Should be 0 or very few - features need historical data)")

print("\n✓ Volatility feature engineering complete!")
print(f"Shape: {df.shape}")

=== Creating Volatility Features ===



  close_vol = df.groupby('symbol', group_keys=False).apply(calc_close_volatility)


✓ Close-to-close volatility: 4 features


  park_vol = df.groupby('symbol', group_keys=False).apply(calc_parkinson_vol)


✓ Parkinson volatility: 4 features


  gk_vol = df.groupby('symbol', group_keys=False).apply(calc_garman_klass_vol)


✓ Garman-Klass volatility: 4 features


  ewma_vol = df.groupby('symbol', group_keys=False).apply(calc_ewma_vol)


✓ EWMA volatility: 3 features


  real_vol = df.groupby('symbol', group_keys=False).apply(calc_realized_vol)


✓ Realized volatility: 4 features


  vol_regime = df.groupby('symbol', group_keys=False).apply(calc_vol_regime)


✓ Volatility regime features: 3 features


  vv = df.groupby('symbol', group_keys=False).apply(calc_vol_of_vol)


✓ Vol-of-vol features: 2 features


  vol_vol = df.groupby('symbol', group_keys=False).apply(calc_volume_vol)


✓ Volume-weighted volatility: 3 features

TOTAL VOLATILITY FEATURES CREATED: 27

Sample volatility feature statistics:
       vol_close_21d  vol_parkinson_21d     vol_gk_21d   vol_ewma_21d  \
count  309808.000000      310018.000000  310018.000000  310439.000000   
mean        0.066441           0.067144       0.066602       0.068444   
std         0.127231           0.099616       0.095731       0.124613   
min         0.000000           0.000000       0.000000       0.000000   
25%         0.027073           0.027468       0.026661       0.028581   
50%         0.044559           0.045401       0.044966       0.045737   
75%         0.068972           0.072356       0.072835       0.070852   
max         4.652812           2.960816       2.929033       5.472524   

       vol_ratio_7_60     volvol_21d  
count   303776.000000  308968.000000  
mean         0.894183       0.028020  
std          0.479042       0.087720  
min          0.000000       0.000000  
25%          0.565602       