#Problem Statement

1. Data Acquisition (Using yfinance in Colab)

In [1]:
# Install the library in your Colab cell
!pip install yfinance

import yfinance as yf
import pandas as pd

# List of the 26 cryptocurrencies symbols (example list)
crypto_list = ['BTC-USD', 'ETH-USD', 'ADA-USD', 'XRP-USD'] # Add all 26 here

# Fetch historical data
data = yf.download(crypto_list, start='YYYY-MM-DD', end='YYYY-MM-DD')['Close']
# 'data' will be a pandas DataFrame with daily closing prices for all 26 coins




  data = yf.download(crypto_list, start='YYYY-MM-DD', end='YYYY-MM-DD')['Close']
[*********************100%***********************]  4 of 4 completed
ERROR:yfinance:
4 Failed downloads:
ERROR:yfinance:['XRP-USD', 'ADA-USD', 'ETH-USD', 'BTC-USD']: ValueError("time data 'YYYY-MM-DD' does not match format '%Y-%m-%d'")


2. Feature Engineering (Calculating Volatility)


In [2]:
import numpy as np

# Calculate daily logarithmic returns
log_returns = np.log(data / data.shift(1))

# Calculate a 30-day rolling standard deviation (volatility)
# This is a key feature you'll need to predict
volatility = log_returns.rolling(window=30).std() * np.sqrt(30) # Annualize if needed


In [10]:
# =========================
# 1. Setup
# =========================
!pip install yfinance ta scikit-learn pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# For reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# =========================
# 2. Load data
# =========================
# Option 1: load your own CSV from Colab files
# from google.colab import files
# uploaded = files.upload()
# df = pd.read_csv(list(uploaded.keys())[0])

# Option 2: direct path if file is already in environment
df = pd.read_csv("/content/crypto_prices.csv")  # change to your file name

# Expect columns like: Date, Symbol, Open, High, Low, Close, Volume, Marketcap
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values(["Symbol", "Date"]).reset_index(drop=True)

print(df.head())
print(df.isna().sum())

# =========================
# 3. Basic cleaning
# =========================
# Forward fill per symbol, then drop any remaining missing rows
df = df.groupby("Symbol").apply(lambda x: x.ffill().bfill()).reset_index(drop=True)
df = df.dropna().reset_index(drop=True)

# Keep a subset of columns
df = df[["Date", "Symbol", "Open", "High", "Low", "Close", "Volume", "Marketcap"]]

# =========================
# 4. Feature engineering
# =========================
def add_features(group, window=7):
    group = group.sort_values("Date")
    # Daily log returns
    group["log_return"] = np.log(group["Close"]).diff()
    # Rolling volatility (target): std of returns over 'window' days
    group["volatility_target"] = group["log_return"].rolling(window).std()
    # Moving averages
    group["ma_close"] = group["Close"].rolling(window).mean()
    group["ma_volume"] = group["Volume"].rolling(window).mean()
    # Liquidity ratio
    group["liquidity_ratio"] = group["Volume"] / group["Marketcap"]
    # Shift target so we predict next‑day volatility
    group["volatility_target"] = group["volatility_target"].shift(-1)
    return group

df_feat = df.groupby("Symbol").apply(add_features, window=7).reset_index(drop=True)

# Drop first/last rows with NaNs due to rolling/shift
df_feat = df_feat.dropna().reset_index(drop=True)

print(df_feat.head())

# =========================
# 5. Train/test split (time based)
# =========================
# Use one or a few symbols (optional)
symbols_to_use = df_feat["Symbol"].unique()[:5]
data = df_feat[df_feat["Symbol"].isin(symbols_to_use)].copy()

# Sort by date
data = data.sort_values("Date")

# Features and target
feature_cols = [
    "Open", "High", "Low", "Close", "Volume", "Marketcap",
    "log_return", "ma_close", "ma_volume", "liquidity_ratio"
]
X = data[feature_cols].values
y = data["volatility_target"].values

# Split by index (no shuffling)
split_idx = int(len(data) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# =========================
# 6. Scaling
# =========================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =========================
# 7. Model training
# =========================
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=RANDOM_STATE,
    n_jobs=-1
)
model.fit(X_train_scaled, y_train)

# =========================
# 8. Evaluation
# =========================
y_pred = model.predict(X_test_scaled)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")
print(f"R^2: {r2:.4f}")

# =========================
# 9. Simple plots
# =========================
test_dates = data["Date"].iloc[split_idx:]

plt.figure(figsize=(12, 5))
plt.plot(test_dates, y_test, label="True volatility")
plt.plot(test_dates, y_pred, label="Predicted volatility")
plt.xlabel("Date")
plt.ylabel("Volatility (rolling std)")
plt.title("True vs Predicted Volatility")
plt.legend()
plt.tight_layout()
plt.show()

# =========================
# 10. Predict function example
# =========================
def predict_next_volatility(latest_row):
    """
    latest_row: pandas Series with the same feature columns as in training.
    """
    x = latest_row[feature_cols].values.reshape(1, -1)
    x_scaled = scaler.transform(x)
    return model.predict(x_scaled)[0]

# Example usage on last row of dataset:
example_pred = predict_next_volatility(data.iloc[-1])
print("Example next‑day volatility prediction:", example_pred)




FileNotFoundError: [Errno 2] No such file or directory: '/content/crypto_prices.csv'