# In [None]:
import yfinance as yf
import torch
import torch.nn as nn
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, mean_absolute_percentage_error
from datetime import datetime, timedelta
import warnings
import ta
warnings.filterwarnings('ignore')

# ===== 0. Set Random Seeds for Reproducibility =====
# A single seed is shared by the PyTorch and NumPy random generators.
seed_value = 42
torch.manual_seed(seed_value)
np.random.seed(seed_value)
if torch.cuda.is_available():
    # Turn off cuDNN benchmark mode and request deterministic behaviour.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # Seed the CUDA generators too (all devices, then the current one).
    torch.cuda.manual_seed_all(seed_value)
    torch.cuda.manual_seed(seed_value)

# ===== 1. Configuration: Define Stocks and Timeframe =====
# Symbols the model is trained on; a symbol's position in this list is its integer id.
tickers = ['AAPL', 'MSFT', 'GOOGL', 'JPM', 'V', 'JNJ', 'TSLA']
id_to_stock = dict(enumerate(tickers))
stock_to_id = {symbol: idx for idx, symbol in id_to_stock.items()}

# Number of consecutive rows that make up one model input window.
seq_len = 60
# History window: from the start of 2015 up to the moment the script runs.
start_date = datetime(2015, 1, 1)
end_date = datetime.now()

# ===== 2. Data Processing Function for a Single Ticker =====
def process_single_ticker(ticker, start, end):
    """Download, clean, and engineer features for a single stock.

    Relies on the module-level ``seq_len`` (window length) and ``stock_to_id``
    (ticker -> integer id) settings.

    Args:
        ticker: Symbol to download; must be a key of ``stock_to_id``.
        start: Start of the history window handed to ``yfinance``.
        end: End of the history window handed to ``yfinance``.

    Returns:
        A 7-tuple ``(X, y, stock_ids, close_scaler, currency, data, scaled_data)``:

        * ``X`` - array of shape ``(n, seq_len, n_features)`` holding the
          min-max scaled feature windows.
        * ``y`` - array of length ``n`` with the scaled ``Close`` of the row
          that immediately follows each window.
        * ``stock_ids`` - array of length ``n`` filled with ``stock_to_id[ticker]``.
        * ``close_scaler`` - ``MinMaxScaler`` fitted on the ``Close`` column only,
          used to map scaled predictions back to prices.
        * ``currency`` - the ``'currency'`` entry of the ticker metadata, or
          ``'$'`` when it is unavailable.
        * ``data`` - the cleaned DataFrame including the engineered columns.
        * ``scaled_data`` - ``data`` after min-max scaling, as a 2-D array.

        When there is not enough data at any stage, a tuple of seven ``None``
        values is returned instead.
    """
    no_result = (None,) * 7

    print(f"\nProcessing {ticker}...")
    stock_info = yf.Ticker(ticker)
    data = stock_info.history(start=start, end=end)

    if data.empty or len(data) < seq_len + 5:
        print(f"No/insufficient data for {ticker} in the given range. Skipping.")
        return no_result

    # The currency is display-only metadata fetched through a separate lookup.
    # It is read only once the price history is known to be usable, and a
    # failed lookup falls back to '$' rather than discarding good price data.
    try:
        currency = (stock_info.info or {}).get('currency', '$')
    except Exception as exc:
        print(f"Could not read currency for {ticker} ({exc}); defaulting to '$'.")
        currency = '$'

    # Explicit copy: the engineered columns below are added to an independent frame.
    data = data[['Open', 'High', 'Low', 'Close', 'Volume']].dropna().copy()

    # --- Outlier Removal using Z-Score on Daily Returns ---
    # Rows whose daily return is more than 3 rolling standard deviations away
    # from the rolling mean are dropped. Rows with an undefined z-score (the
    # first rows of the frame) fail the comparison and are dropped as well.
    daily_return = data['Close'].pct_change()
    rolling = daily_return.rolling(window=252, min_periods=1)
    z_score = (daily_return - rolling.mean()) / rolling.std()
    data = data[z_score.abs() <= 3.0].copy()

    # --- Technical indicators and calendar feature ---
    data['sma_10'] = ta.trend.SMAIndicator(close=data['Close'], window=10).sma_indicator()
    data['ema_10'] = ta.trend.EMAIndicator(close=data['Close'], window=10).ema_indicator()
    data['rsi'] = ta.momentum.RSIIndicator(close=data['Close'], window=14).rsi()
    data['macd'] = ta.trend.MACD(close=data['Close']).macd()
    data['bb_bbm'] = ta.volatility.BollingerBands(close=data['Close']).bollinger_mavg()
    data['day_of_week'] = data.index.dayofweek

    # Indicator warm-up rows contain NaN and cannot be used.
    data = data.dropna()
    if data.empty or len(data) < seq_len:
        print(f"Not enough data for {ticker} after feature engineering. Skipping.")
        return no_result

    # One scaler for every feature column, plus a Close-only scaler so that
    # predictions can be inverse-transformed without the other columns.
    feature_scaler = MinMaxScaler()
    close_scaler = MinMaxScaler()
    scaled_data = feature_scaler.fit_transform(data)
    close_scaler.fit(data[['Close']])

    # Window i covers rows [i - seq_len, i); its target is the Close of row i.
    X = [scaled_data[i - seq_len:i] for i in range(seq_len, len(scaled_data))]
    if not X:
        print(f"Not enough data to create sequences for {ticker}. Skipping.")
        return no_result

    close_idx = data.columns.get_loc('Close')
    y = scaled_data[seq_len:, close_idx].copy()
    stock_ids = np.full(len(X), stock_to_id[ticker])

    return np.array(X), y, stock_ids, close_scaler, currency, data, scaled_data

# ===== 3. Collate Data from All Tickers =====
# Per-ticker arrays are collected in lists and concatenated once at the end.
all_X, all_y, all_stock_ids = [], [], []
scalers = {}      # ticker -> Close-only scaler, used later to un-scale predictions
currencies = {}   # ticker -> currency label used for display

print("--- Starting Data Collection and Processing ---")
for ticker in tickers:
    X, y, ids, scaler, currency, _, _ = process_single_ticker(ticker, start_date, end_date)
    if X is None:
        # The ticker was skipped by process_single_ticker (not enough data).
        continue
    all_X.append(X)
    all_y.append(y)
    all_stock_ids.append(ids)
    scalers[ticker] = scaler
    currencies[ticker] = currency

# Fail with an explicit message instead of the opaque error np.concatenate
# raises for an empty list when every ticker was skipped.
if not all_X:
    raise ValueError("No usable data was collected for any ticker; cannot build the dataset.")

X_combined = np.concatenate(all_X, axis=0)
y_combined = np.concatenate(all_y, axis=0)
stock_ids_combined = np.concatenate(all_stock_ids, axis=0)

print(f"\n--- Data Processing Complete ---")
# Report the number of tickers that actually contributed data.
print(f"Total sequences from {len(all_X)} stocks: {len(X_combined)}")


# ===== 4. Data Splitting & DataLoader Creation =====
# Chronological 80/20 split, done separately for every ticker.
# The samples are overlapping sliding windows, so a random shuffle before the
# split would put near-identical windows (and future prices) into both sets
# and inflate the test metrics. Within one ticker the windows are stored in
# the order they were generated (assumes the price history is returned in
# ascending date order), so the first 80% is the past and the rest the future.
train_parts, test_parts = [], []
for stock_id in np.unique(stock_ids_combined):
    ticker_rows = np.flatnonzero(stock_ids_combined == stock_id)
    cut = int(0.8 * len(ticker_rows))
    train_parts.append(ticker_rows[:cut])
    test_parts.append(ticker_rows[cut:])

train_idx = np.concatenate(train_parts)
test_idx = np.concatenate(test_parts)
train_size = len(train_idx)

X_train, y_train, ids_train = X_combined[train_idx], y_combined[train_idx], stock_ids_combined[train_idx]
X_test, y_test, ids_test = X_combined[test_idx], y_combined[test_idx], stock_ids_combined[test_idx]

# Each dataset item is (feature window, stock id, scaled target close).
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                              torch.tensor(ids_train, dtype=torch.long),
                              torch.tensor(y_train, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32),
                             torch.tensor(ids_test, dtype=torch.long),
                             torch.tensor(y_test, dtype=torch.float32))

# Training batches are still drawn in random order; evaluation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
print(f"Data split into {len(X_train)} training and {len(X_test)} testing samples.")


# ===== 5. Model Definition: LSTM + Transformer with Stock Embedding =====
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence tensor.

    Even feature columns hold sines and odd columns hold cosines of
    ``position * div_term``. The table is precomputed for ``max_len``
    positions and stored as a (non-trainable) buffer named ``pe`` of shape
    ``(1, max_len, d_model)``.

    Args:
        d_model: Size of the feature dimension; may be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one more even column than odd columns,
        # so the cosine block is trimmed to the d_model // 2 odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq, d_model)``; the table is sliced along
        the sequence axis and broadcast over the batch axis.
        """
        return x + self.pe[:, :x.size(1)]

class LSTMTransformer(nn.Module):
    """LSTM followed by a Transformer encoder, conditioned on a learned stock embedding.

    Args:
        input_dim: Number of features per time step.
        num_stocks: Number of distinct stock ids in the embedding table.
        model_dim: Width of the projection that feeds the LSTM.
        lstm_hidden: LSTM hidden size; also the Transformer model width.
        n_heads: Attention heads per Transformer encoder layer.
        num_layers: Number of stacked Transformer encoder layers.
        embedding_dim: Size of the per-stock embedding vector.
    """

    def __init__(self, input_dim, num_stocks, model_dim=64, lstm_hidden=64, n_heads=4, num_layers=2, embedding_dim=16):
        super().__init__()
        # Sub-modules are created in a fixed order so seeded initialisation is stable.
        self.stock_embedding = nn.Embedding(num_embeddings=num_stocks, embedding_dim=embedding_dim)
        self.input_proj = nn.Linear(in_features=input_dim + embedding_dim, out_features=model_dim)
        # The LSTM depth is fixed at two layers, independent of `num_layers`.
        self.lstm = nn.LSTM(
            input_size=model_dim,
            hidden_size=lstm_hidden,
            num_layers=2,
            dropout=0.1,
            batch_first=True,
        )
        self.pos_enc = PositionalEncoding(lstm_hidden)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=lstm_hidden,
                nhead=n_heads,
                activation='gelu',
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        self.fc = nn.Linear(in_features=lstm_hidden, out_features=1)

    def forward(self, x, stock_id):
        """Map ``x`` of shape (batch, seq, input_dim) and ``stock_id`` of shape (batch,) to (batch, 1)."""
        steps = x.size(1)
        # Broadcast each sample's stock embedding across every time step.
        per_step_emb = self.stock_embedding(stock_id)[:, None, :].expand(-1, steps, -1)
        projected = self.input_proj(torch.cat((x, per_step_emb), dim=2))
        recurrent_out, _ = self.lstm(projected)
        encoded = self.transformer(self.pos_enc(recurrent_out))
        # Only the representation of the final time step feeds the regression head.
        return self.fc(encoded[:, -1, :])


# ===== 6. Model Training =====
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"\nUsing device: {device}")

# The feature count comes from the training windows; all other sizes are fixed here.
model_kwargs = dict(
    input_dim=X_train.shape[2],
    num_stocks=len(tickers),
    model_dim=128,
    lstm_hidden=128,
    n_heads=8,
    num_layers=3,
    embedding_dim=32,
)
model = LSTMTransformer(**model_kwargs).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
# Halve the learning rate every 10 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

epochs = 30
print(f"--- Starting training for {epochs} epochs ---")

for epoch in range(epochs):
    model.train()
    batch_losses = []
    for batch_x, batch_ids, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_ids = batch_ids.to(device)
        # Targets get a trailing axis so they match the (batch, 1) model output.
        targets = batch_y.to(device).unsqueeze(1)

        optimizer.zero_grad()
        loss = criterion(model(batch_x, batch_ids), targets)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        batch_losses.append(loss.item())

    scheduler.step()
    total_train_loss = sum(batch_losses)
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1:2d}/{epochs} | Train Loss: {avg_train_loss:.6f} | LR: {scheduler.get_last_lr()[0]:.6f}")

print("--- Training Complete ---")


# ===== 7. Evaluation on the Entire Test Set =====
model.eval()
all_preds, all_actuals, all_eval_ids = [], [], []
with torch.no_grad():
    for xb, id_b, yb in test_loader:
        pred = model(xb.to(device), id_b.to(device)).cpu().numpy()
        all_preds.extend(pred.flatten())
        all_actuals.extend(yb.numpy().flatten())
        all_eval_ids.extend(id_b.numpy().flatten())

# Every ticker has its own Close scaler, so predictions are mapped back to
# prices with one vectorised inverse_transform per ticker instead of two
# sklearn calls per sample. Results are written back at the samples' original
# positions, so the sample order is unchanged.
preds_scaled = np.asarray(all_preds, dtype=np.float32)
actuals_scaled = np.asarray(all_actuals, dtype=np.float32)
eval_ids = np.asarray(all_eval_ids)

unscaled_preds = np.empty_like(preds_scaled)
unscaled_actuals = np.empty_like(actuals_scaled)
has_scaler = np.zeros(len(preds_scaled), dtype=bool)
for stock_id in np.unique(eval_ids):
    scaler = scalers.get(id_to_stock[int(stock_id)])
    if scaler is None:
        # No scaler recorded for this ticker: its samples are left out.
        continue
    rows = eval_ids == stock_id
    unscaled_preds[rows] = scaler.inverse_transform(preds_scaled[rows].reshape(-1, 1)).ravel()
    unscaled_actuals[rows] = scaler.inverse_transform(actuals_scaled[rows].reshape(-1, 1)).ravel()
    has_scaler[rows] = True

unscaled_preds = unscaled_preds[has_scaler]
unscaled_actuals = unscaled_actuals[has_scaler]

# All metrics are computed on un-scaled prices, pooled across tickers.
r2 = r2_score(unscaled_actuals, unscaled_preds)
explained_variance = explained_variance_score(unscaled_actuals, unscaled_preds)
mae = mean_absolute_error(unscaled_actuals, unscaled_preds)
rmse = np.sqrt(mean_squared_error(unscaled_actuals, unscaled_preds))
mape = mean_absolute_percentage_error(unscaled_actuals, unscaled_preds) * 100
# SMAPE: |pred - actual| over the mean of |actual| and |pred|; pairs with a
# zero denominator are excluded.
smape_numerator = np.abs(unscaled_preds - unscaled_actuals)
smape_denominator = (np.abs(unscaled_actuals) + np.abs(unscaled_preds)) / 2
smape_mask = smape_denominator != 0
smape = np.mean(smape_numerator[smape_mask] / smape_denominator[smape_mask]) * 100 if np.any(smape_mask) else 0.0

print("\n--- Overall Model Evaluation on Test Set ---")
print(f"  Goodness of Fit:")
print(f"    R-squared (R²):                 {r2:.4f}")
print(f"    Explained Variance:             {explained_variance:.4f}")
print(f"\n  Average Error:")
print(f"    Mean Absolute Error (MAE):      {mae:.4f}")
print(f"    Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"    Mean Absolute % Error (MAPE):   {mape:.2f}%")
print(f"    Symmetric MAPE (SMAPE):         {smape:.2f}%")


# ===== 8. Prediction Function for a Specific Stock =====
def predict_next_day(ticker):
    """Predict the next trading day's close for ``ticker`` and plot it against recent history.

    Uses the module-level ``model``, ``device``, ``stock_to_id``, ``tickers``,
    ``seq_len`` and ``start_date``. Prints the last actual close and the
    predicted close, then shows a matplotlib figure comparing actual and
    predicted closes over roughly the last three years.

    Args:
        ticker: Symbol to predict; must be a key of ``stock_to_id``.

    Returns:
        None. The function also returns early (after printing a message) when
        the ticker is unknown or when too little data is available.
    """
    print(f"\n--- Generating Prediction for {ticker.upper()} ---")
    model.eval()

    if ticker not in stock_to_id:
        print(f"Error: Model was not trained on {ticker}. Please choose from: {list(tickers)}")
        return

    # Process the same window that was used for training (from start_date).
    # process_single_ticker re-fits its min-max scalers on whatever window it
    # is given; a shorter window would rescale every feature differently from
    # what the model saw during training.
    processed_data = process_single_ticker(ticker, start_date, datetime.now())

    if processed_data[0] is None:
        return

    X_hist, y_hist, _, scaler, currency, data_df, full_scaled_data = processed_data

    # Window i predicts row seq_len + i, so the targets start at row seq_len.
    date_range = data_df.index[seq_len:]

    # Only the most recent three years are evaluated and plotted.
    recent = np.asarray(date_range >= date_range[-1] - pd.DateOffset(years=3))
    X_hist, y_hist, date_range = X_hist[recent], y_hist[recent], date_range[recent]

    # --- 1. Generate predictions for the historical data for plotting comparison ---
    historical_X_tensor = torch.tensor(X_hist, dtype=torch.float32).to(device)
    historical_id_tensor = torch.full((len(X_hist),), stock_to_id[ticker], dtype=torch.long, device=device)
    with torch.no_grad():
        historical_preds_scaled = model(historical_X_tensor, historical_id_tensor).cpu().numpy().flatten()

    historical_preds_unscaled = scaler.inverse_transform(historical_preds_scaled.reshape(-1, 1)).flatten()
    historical_actuals_unscaled = scaler.inverse_transform(y_hist.reshape(-1, 1)).flatten()

    # --- 2. Generate the TRUE future prediction ---
    # The last seq_len rows (including the latest close) form the input window.
    sequence_for_tomorrow = full_scaled_data[-seq_len:]
    # Add the batch axis in NumPy; building a tensor from a list of arrays is slow.
    future_seq_tensor = torch.tensor(sequence_for_tomorrow[np.newaxis, ...], dtype=torch.float32).to(device)
    future_id_tensor = torch.tensor([stock_to_id[ticker]], dtype=torch.long).to(device)

    with torch.no_grad():
        future_pred_scaled = model(future_seq_tensor, future_id_tensor).cpu().numpy()[0, 0]

    predicted_price_tomorrow = scaler.inverse_transform(np.array([[future_pred_scaled]]))[0, 0]
    last_actual_price = historical_actuals_unscaled[-1]

    print(f"Last Actual Close Price: {currency}{last_actual_price:.2f}")
    print(f"Predicted Next Day's Close Price: {currency}{predicted_price_tomorrow:.2f}")

    # --- 3. Plotting ---
    plt.style.use('seaborn-v0_8-darkgrid')
    plt.figure(figsize=(15, 7))

    plt.plot(date_range, historical_actuals_unscaled, label='Actual Prices', color='dodgerblue', linewidth=2)
    plt.plot(date_range, historical_preds_unscaled, label='Predicted Prices (Historical)', color='darkorange', linestyle='--', linewidth=1.5)

    # Put the forecast on the next business day rather than the next calendar
    # day, which may be a weekend (exchange holidays are not taken into account).
    future_date = date_range[-1] + pd.tseries.offsets.BDay(1)
    plt.plot(future_date, predicted_price_tomorrow, 'go', markersize=12, label=f'Predicted Next Close ({predicted_price_tomorrow:.2f})', zorder=5)

    plt.title(f"{ticker.upper()} Price: 3-Year Historical Comparison & Next Day Prediction (LSTM-Transformer)", fontsize=16)
    plt.xlabel("Date", fontsize=12)
    plt.ylabel(f"Price ({currency})", fontsize=12)
    plt.legend(fontsize=11)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()

# ===== 9. Generate Predictions for All Trained Stocks =====
def _predict_or_report(ticker):
    """Run predict_next_day for one ticker, printing any failure instead of raising."""
    try:
        predict_next_day(ticker)
    except Exception as e:
        print(f"An error occurred while predicting for {ticker}: {e}")


print("\n--- Generating predictions for all trained stocks ---")
# A failure for one ticker must not prevent predictions for the others.
for ticker in tickers:
    _predict_or_report(ticker)

print("\n--- All predictions complete. ---")
