#### 01 — GMF Master Pipeline (Scripts-Driven)

This notebook orchestrates the full pipeline using the modular scripts created:

1) Data fetch → `scripts/fetch_data.py`  
2) EDA & stationarity → `scripts/eda.py`  
3) ARIMA model → `scripts/model_arima.py`  
4) LSTM model → `scripts/model_lstm.py`  
5) Portfolio optimization → `scripts/portfolio.py`  
6) Backtesting → `scripts/backtest.py`

**Date range:** 2015-07-01 → 2025-07-31  
**Assets:** TSLA, SPY, BND


##### 0) Setup & constants

In [None]:
from pathlib import Path
import sys, os
import pandas as pd
import numpy as np

# Make sure the 'scripts' directory is importable
scripts_path = Path('scripts')
if scripts_path.exists():
    sys.path.append(str(scripts_path.resolve()))

# Create output folders
Path('data').mkdir(exist_ok=True)
Path('results/plots').mkdir(parents=True, exist_ok=True)
Path('results/forecasts').mkdir(parents=True, exist_ok=True)
Path('results/backtests').mkdir(parents=True, exist_ok=True)

TICKERS = ['TSLA', 'SPY', 'BND']
START = '2015-07-01'
END = '2025-07-31'
TRAIN_END = '2023-12-31'
TEST_START = '2024-01-01'
BACKTEST_START = '2024-08-01'
BACKTEST_END = '2025-07-31'

print('Environment ready.')


##### 1) Fetch data with yfinance (via `scripts/fetch_data.py`)

In [None]:
from fetch_data import download_data, save_data

all_data = download_data(TICKERS, start=START, end=END)
save_data(all_data, path='data/')

print('Saved CSVs to data/:', [f'data/{t}.csv' for t in TICKERS])


##### 2) Load datasets and quick sanity checks

In [None]:
tsla = pd.read_csv('data/TSLA.csv', index_col=0, parse_dates=True)
spy  = pd.read_csv('data/SPY.csv', index_col=0, parse_dates=True)
bnd  = pd.read_csv('data/BND.csv', index_col=0, parse_dates=True)

# Ensure expected columns
for name, df in [('TSLA', tsla), ('SPY', spy), ('BND', bnd)]:
    assert 'Close' in df.columns, f"{name} CSV missing Close column"

# Merge for later use
prices = pd.concat([tsla['Close'], spy['Close'], bnd['Close']], axis=1)
prices.columns = ['TSLA','SPY','BND']
prices = prices.dropna()
prices.head()


##### 3) EDA & stationarity (via `scripts/eda.py`)

In [None]:
from eda import plot_rolling_stats, test_stationarity
import matplotlib.pyplot as plt

# Rolling stats & ADF on TSLA
plot_rolling_stats(tsla['Close'], window=30)
is_stat = test_stationarity(tsla['Close'])
print('TSLA Close stationary:', is_stat)

# Save a simple closing price plot
plt.figure(figsize=(12,5))
plt.plot(prices.index, prices['TSLA'], label='TSLA')
plt.plot(prices.index, prices['SPY'], label='SPY')
plt.plot(prices.index, prices['BND'], label='BND')
plt.title('Closing Prices (2015-07-01 to 2025-07-31)')
plt.legend()
plt.tight_layout()
plt.savefig('results/plots/closing_prices.png')
plt.show()


##### 4) ARIMA modeling (via `scripts/model_arima.py`)

In [None]:
from model_arima import train_arima, forecast_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Train/test split on TSLA
tsla_full = tsla['Close'].copy().asfreq('B')
tsla_full = tsla_full.interpolate()

train_tsla = tsla_full.loc[:TRAIN_END]
test_tsla  = tsla_full.loc[TEST_START:]

# Train ARIMA (on log-transformed series to stabilize variance)
train_log = np.log(train_tsla.dropna())
arima_model = train_arima(train_log)

# Forecast length equals test set length
n_periods = len(test_tsla)
fc_log = forecast_arima(arima_model, periods=n_periods)
fc = np.exp(fc_log)  # invert log

# Evaluate
mae = mean_absolute_error(test_tsla, fc)
rmse = mean_squared_error(test_tsla, fc, squared=False)
print('ARIMA Test MAE:', mae)
print('ARIMA Test RMSE:', rmse)

# Save forecast
pd.Series(fc, index=test_tsla.index).to_csv('results/forecasts/tsla_arima_test_forecast.csv')
print('Saved ARIMA test forecast -> results/forecasts/tsla_arima_test_forecast.csv')


##### 5) LSTM modeling (via `scripts/model_lstm.py`)

In [9]:
from model_lstm import prepare_lstm_data, build_lstm_model
import matplotlib.pyplot as plt

# Prepare data
series = tsla_full  # business-day frequency, interpolated
X, y, scaler = prepare_lstm_data(series, look_back=60)
split_idx = int(len(X) * 0.8)

X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

# Build & train
model = build_lstm_model((X_train.shape[1], 1))
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.1, verbose=0)

# Predict
pred_scaled = model.predict(X_test).flatten()
pred = scaler.inverse_transform(pred_scaled.reshape(-1,1)).flatten()

true = scaler.inverse_transform(y_test.reshape(-1,1)).flatten()

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae_lstm = mean_absolute_error(true, pred)
rmse_lstm = mean_squared_error(true, pred, squared=False)
print('LSTM Test MAE:', mae_lstm)
print('LSTM Test RMSE:', rmse_lstm)

# Plot predictions vs truth (last portion)
plt.figure(figsize=(12,5))
plt.plot(true, label='True')
plt.plot(pred, label='LSTM Pred')
plt.title('LSTM Test Predictions (scaled back to price)')
plt.legend()
plt.tight_layout()
plt.savefig('results/plots/tsla_lstm_test.png')
plt.show()


NameError: name 'tsla_full' is not defined

##### 6) Portfolio optimization (via `scripts/portfolio.py`)

In [None]:
from portfolio import optimize_portfolio

weights = optimize_portfolio(prices)
print('Tangency (Max Sharpe) Weights:', weights)

# Save weights
pd.Series(weights).to_csv('results/forecasts/weights_tangency.csv')
print('Saved weights -> results/forecasts/weights_tangency.csv')


##### 7) Backtesting (via `scripts/backtest.py`)

In [None]:
from backtest import calculate_returns, backtest_strategy
import matplotlib.pyplot as plt

# Restrict to backtest window
bt_prices = prices.loc[BACKTEST_START:BACKTEST_END]
bt_returns = calculate_returns(bt_prices)

# Use optimized weights from above
w_series = pd.read_csv('results/forecasts/weights_tangency.csv', index_col=0, header=None, squeeze=True)
# For pandas >=1.5, squeeze arg is deprecated; handle accordingly
if not hasattr(w_series, 'index') or w_series.shape[1] if hasattr(w_series, 'shape') and len(w_series.shape)>1 else False:
    w_series = pd.read_csv('results/forecasts/weights_tangency.csv', index_col=0, header=None)[1]

# Align order
w = [w_series.get('TSLA', 0.0), w_series.get('SPY', 0.0), w_series.get('BND', 0.0)]

cum_strategy = backtest_strategy(bt_returns, w)

# Benchmark 60/40 SPY/BND
bench_w = [0.0, 0.6, 0.4]
cum_bench = backtest_strategy(bt_returns, bench_w)

# Plot
plt.figure(figsize=(12,5))
plt.plot(cum_strategy.index, cum_strategy.values, label='Strategy')
plt.plot(cum_bench.index, cum_bench.values, label='60/40 Benchmark')
plt.title('Cumulative Returns (Backtest)')
plt.legend()
plt.tight_layout()
plt.savefig('results/backtests/cumulative_returns.png')
plt.show()

# Simple performance summary
def annualized_sharpe(returns, rf=0.0):
    mean = returns.mean() * 252
    std = returns.std() * (252 ** 0.5)
    return (mean - rf) / std

print('Strategy Sharpe:', annualized_sharpe(bt_returns @ pd.Series(w, index=['TSLA','SPY','BND'])))
print('Benchmark Sharpe:', annualized_sharpe(bt_returns @ pd.Series(bench_w, index=['TSLA','SPY','BND'])))

# Save curves
cum_df = pd.DataFrame({'Strategy': cum_strategy, 'Benchmark_60_40': cum_bench})
cum_df.to_csv('results/backtests/cumulative_returns.csv')
print('Saved cumulative returns -> results/backtests/cumulative_returns.csv')
