In [3]:
# ---------------------------------------------------------------------------------
# PLEASE NOTE:
# 1) This is a simple demonstration of a daily backtest using a "dumb" MACD+RSI+Oscillator signal.
# 2) The code downloads historical data for a given ticker (default: AAPL) from Yahoo Finance.
# 3) No advanced error handling or transaction cost modeling is included here.
# ---------------------------------------------------------------------------------

import yfinance as yf  # For downloading Yahoo Finance data
import pandas as pd
import numpy as np

# --------------------
#  DOWNLOAD HISTORICAL DATA FROM YAHOO FINANCE
# --------------------
ticker = "AAPL"       # Replace with your desired ticker
start_date = "2020-01-01"
end_date = "2021-01-01"
df = yf.download(ticker, start=start_date, end=end_date)

# Ensure the DataFrame has the expected columns: 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'
# We'll work with 'Close', 'High', and 'Low' for our indicators.

# ---------------------------------------------------------------------------------
#  HELPER FUNCTIONS: MACD, RSI, STOCHASTIC OSCILLATOR
# ---------------------------------------------------------------------------------

def compute_MACD(data, short_window=12, long_window=26, signal_window=9):
    """
    Compute MACD (Moving Average Convergence Divergence).
    MACD = EMA(short_window) - EMA(long_window)
    signal = EMA(MACD, signal_window)
    """
    short_ema = data['Close'].ewm(span=short_window, adjust=False).mean()
    long_ema = data['Close'].ewm(span=long_window, adjust=False).mean()
    macd_line = short_ema - long_ema
    signal_line = macd_line.ewm(span=signal_window, adjust=False).mean()
    macd_hist = macd_line - signal_line
    return macd_line, signal_line, macd_hist

def compute_RSI(data, period=14):
    """
    Compute the Relative Strength Index (RSI).
    RSI < 30 => oversold
    RSI > 70 => overbought
    """
    delta = data['Close'].diff()
    gain = np.where(delta > 0, delta, 0.0)
    loss = np.where(delta < 0, -delta, 0.0)
    avg_gain = pd.Series(gain).rolling(window=period).mean()
    avg_loss = pd.Series(loss).rolling(window=period).mean()
    rs = avg_gain / (avg_loss + 1e-10)  # Prevent division by zero
    rsi = 100.0 - (100.0 / (1.0 + rs))
    return rsi

def compute_Stoch_Osc(data, k_period=14, d_period=3):
    """
    Compute the Stochastic Oscillator.
    %K = (Close - LowestLow) / (HighestHigh - LowestLow) * 100
    %D = SMA of %K over d_period
    """
    low_min = data['Low'].rolling(k_period).min()
    high_max = data['High'].rolling(k_period).max()
    stoch_k = 100 * (data['Close'] - low_min) / (high_max - low_min + 1e-10)
    stoch_d = stoch_k.rolling(d_period).mean()
    return stoch_k, stoch_d


[*********************100%***********************]  1 of 1 completed


In [4]:

# ----------------------------------
#  CALCULATE INDICATORS
# ----------------------------------
df['MACD_Line'], df['MACD_Signal'], df['MACD_Hist'] = compute_MACD(df)
df['RSI'] = compute_RSI(df)
df['Stoch_K'], df['Stoch_D'] = compute_Stoch_Osc(df)

# ---------------------------------------------------------------------------------
#  STRATEGY LOGIC (DUMB EXAMPLE):
#  1) BUY condition:
#      - MACD_Line > MACD_Signal (simple bullish MACD crossover)
#      - RSI < 70 (avoid overbought conditions)
#      - Stoch_K < 80 (avoid overbought conditions, dumb threshold)
#  2) SELL condition:
#      - Otherwise, be flat (no position).
# ---------------------------------------------------------------------------------

# Initialize columns for signals and positions
df['Signal'] = 0
df['Position'] = 0

for i in range(1, len(df)):
    # "Dumb" buy rule for day i
    if (df['MACD_Line'].iloc[i] > df['MACD_Signal'].iloc[i]) \
       and (df['RSI'].iloc[i] < 70) \
       and (df['Stoch_K'].iloc[i] < 80):
        # Buy signal = 1
        df.at[df.index[i], 'Signal'] = 1
    else:
        # Sell/Flat signal = 0
        df.at[df.index[i], 'Signal'] = 0

# For simplicity, we assume the strategy is either fully invested (1) or flat (0).
# We use the previous day's signal to set today's position.
df['Position'] = df['Signal'].shift(1).fillna(0)

# ---------------------------------------------------------------------------------
#  CALCULATE RETURNS
# ---------------------------------------------------------------------------------
df['Market_Return'] = df['Close'].pct_change()
df['Strategy_Return'] = df['Position'] * df['Market_Return']

# Compute cumulative returns
df['Cumulative_Market'] = (1 + df['Market_Return']).cumprod() - 1
df['Cumulative_Strategy'] = (1 + df['Strategy_Return']).cumprod() - 1

# ---------------------------------------------------------------------------------
#  OUTPUT PERFORMANCE
# ---------------------------------------------------------------------------------
final_market_return = df['Cumulative_Market'].iloc[-1] * 100
final_strategy_return = df['Cumulative_Strategy'].iloc[-1] * 100

print(f"Final Market Return     : {final_market_return:.2f}%")
print(f"Final Strategy Return   : {final_strategy_return:.2f}%")

# Optional: Inspect final few rows of the DataFrame
print(df.tail(10))

# ---------------------------------------------------------------------------------
# END OF DEMO
# ---------------------------------------------------------------------------------


ValueError: Data must be 1-dimensional, got ndarray of shape (253, 1) instead

In [None]:
# --- Required Libraries ---
import yfinance as yf
import pandas as pd
import numpy as np
import ta # Technical Analysis library: https://github.com/bukosabino/ta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

print(f"PyTorch version: {torch.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")
# print(f"TA library version: {ta.__version__}") # Removed as requested
print(f"yfinance version: {yf.__version__}")

# --- Configuration ---
TICKER = 'AAPL'         # Example Stock ticker
START_DATE = '2018-01-01'
# Use current date or a recent date for END_DATE if you want up-to-date data for prediction step
# For training/testing on historical data, keep a fixed end date
END_DATE = '2024-12-31'
PREDICTION_HORIZON = 1  # Predict 1 day ahead (simple binary up/down)
TEST_SIZE = 0.2         # 20% of data for testing
RANDOM_STATE = 42       # For reproducible train/test split

# NN Hyperparameters
INPUT_SIZE = 0          # Will be set based on number of features
HIDDEN_SIZE = 32        # Arbitrary number of neurons in hidden layer
OUTPUT_SIZE = 1         # Binary classification (Up=1, Down=0)
LEARNING_RATE = 0.001
EPOCHS = 100

# --- 1. Data Acquisition ---
print(f"\n[1] Downloading data for {TICKER}...")
try:
    # Use auto_adjust=True to get adjusted prices ('close' will be adj close)
    df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
    if df.empty:
        raise ValueError("No data downloaded. Check ticker and date range.")
    print(f"Downloaded {len(df)} data points.")
    # yfinance with auto_adjust=True already gives lowercase column names
    # df.columns = [str(col).lower().replace(' ', '_') for col in df.columns] # Less needed now
    print("Original Columns:", df.columns)
    print(df.tail())
except Exception as e:
    print(f"Error downloading data: {e}")
    exit()

# --- 2. Feature Engineering (Add Technical Indicators) ---
print("\n[2] Calculating Technical Indicators...")

# Add TA features using the 'ta' library
# Ensure the necessary columns ('open', 'high', 'low', 'close', 'volume') exist
required_cols = ['open', 'high', 'low', 'close', 'volume']
if not all(col in df.columns for col in required_cols):
    print(f"Error: DataFrame missing required columns for TA. Found: {df.columns}. Need: {required_cols}")
    exit()

try:
    df = ta.add_all_ta_features(
        df, open="open", high="high", low="low", close="close", volume="volume", fillna=True
    )
except Exception as e:
    print(f"Error during ta.add_all_ta_features: {e}")
    print("DataFrame columns before error:", df.columns)
    exit()

# --- FIX for MultiIndex Columns and Column Cleaning ---
# Check if 'ta' library created a MultiIndex and flatten if necessary
if isinstance(df.columns, pd.MultiIndex):
    print("Detected MultiIndex columns after TA, flattening...")
    # Join levels with an underscore, ensuring all parts are strings
    df.columns = ['_'.join(map(str, col)).strip().strip('_') for col in df.columns.values] # Added strip('_')

# Clean column names (lowercase, replace invalid chars) - applied AFTER potential flattening
# Make sure this cleaning process doesn't mangle essential names like 'close'
df.columns = [str(col).lower().replace(' ', '_').replace('-', '_').replace(':', '_').replace('%', 'perc') for col in df.columns]
# --- End FIX ---

print("Columns after TA and cleaning:", df.columns) # Show columns after processing

# --- *** ADDED CHECK/FIX FOR 'close' COLUMN *** ---
# Ensure 'close' column exists after all manipulations
if 'close' not in df.columns:
    print("Warning: 'close' column not found directly after cleaning. Checking for variations...")
    # Common variations if flattening occurred (e.g., from ('Close', ''))
    potential_close_names = ['close_', '_close']
    found_close = False
    for name in potential_close_names:
        if name in df.columns:
            print(f"Found '{name}', renaming to 'close'.")
            df.rename(columns={name: 'close'}, inplace=True)
            found_close = True
            break
    if not found_close:
        print("\nFATAL Error: Cannot find 'close' column or known variations after processing.")
        print("This is needed for creating the target variable.")
        print("Available columns:", sorted(list(df.columns)))
        exit()
# --- *** END CHECK/FIX *** ---


# Manually select a smaller subset of features for simplicity
feature_columns = [
    'momentum_rsi',        # Relative Strength Index
    'trend_macd_diff',     # MACD Difference
    'volatility_bbhi',     # Bollinger Band High Indicator (1 if close > high band)
    'volatility_bbli',     # Bollinger Band Low Indicator (1 if close < low band)
    'momentum_stoch',      # Stochastic Oscillator %K
]

# Ensure selected feature columns exist after ta.add_all_ta_features() and cleaning
available_cols = [col for col in feature_columns if col in df.columns]
if len(available_cols) != len(feature_columns):
    print("\nWarning: Some selected feature columns are not available in the DataFrame.")
    missing = set(feature_columns) - set(available_cols)
    print(f"Missing or differently named features: {missing}")
    print("Re-check feature_columns list against 'Columns after TA and cleaning' printed above.")

feature_columns = available_cols # Use only available cols
INPUT_SIZE = len(feature_columns) # Update input size for NN

if INPUT_SIZE == 0:
    print("\nError: No valid feature columns selected or generated. Cannot proceed.")
    exit()

print(f"\nSelected {INPUT_SIZE} features for model: {feature_columns}")


# --- 3. Data Preparation ---
print("\n[3] Preparing data for Neural Network...")

# 3.1 Create Target Variable: Predict if next day's close is higher than today's close
# This line should now work as we ensured 'close' column exists
try:
    df['target'] = (df['close'].shift(-PREDICTION_HORIZON) > df['close']).astype(int)
except KeyError as e:
     print(f"\nFATAL KeyError: Still cannot find 'close' column right before creating target, even after checks.")
     print("Columns at point of error:", df.columns)
     # It might be useful to see the last few rows where 'close' might be missing
     print("\nTail of DataFrame before error:")
     print(df.tail())
     raise e # Re-raise the error after printing context

# 3.2 Drop rows with NaN values
original_len = len(df)
df.dropna(inplace=True)
print(f"Dropped {original_len - len(df)} rows with NaN values (from TA calculations and target shift).")

if df.empty:
    print("\nError: DataFrame is empty after dropping NaN values. Need more data or different indicators.")
    exit()

# 3.3 Select Features (X) and Target (y)
X = df[feature_columns]
y = df['target']

# 3.4 Split Data into Training and Testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=False # DO NOT shuffle time series data
)
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# 3.5 Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3.6 Convert data to PyTorch Tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# --- 4. Define the Simple Neural Network ---
print("\n[4] Defining the Neural Network...")
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer_1(x)
        x = self.relu(x)
        x = self.layer_2(x)
        x = self.sigmoid(x)
        return x

model = SimpleNN(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
print(model)

# --- 5. Training the Model ---
print("\n[5] Training the Model...")
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}')

print("Training finished.")

# --- 6. Evaluate the Model (Basic Accuracy) ---
print("\n[6] Evaluating the Model on Test Data...")
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    predicted = (test_outputs > 0.5).float()
    accuracy = (predicted == y_test_tensor).float().mean()
    print(f'Test Accuracy: {accuracy.item():.4f}')
    naive_accuracy = max(y_test.mean(), 1 - y_test.mean()) if not y_test.empty else 0.5
    print(f'Naive Benchmark Accuracy (predict majority class): {naive_accuracy:.4f}')

# --- 7. "Dumb" Prediction for the Next Day (Illustrative) ---
print("\n[7] Making a 'Dumb' Prediction for the Day After Last Data Point...")
# Re-fetch or use the full dataframe state *before* dropna was called to get the last features
try:
    # Option 1: Re-create the full feature set (safer if df was modified extensively)
    df_full = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
    # df_full.columns = [str(col).lower().replace(' ', '_') for col in df_full.columns] # Less needed
    required_cols = ['open', 'high', 'low', 'close', 'volume']
    if not all(col in df_full.columns for col in required_cols):
         raise ValueError("Required columns missing in re-downloaded data for prediction.")
    df_full = ta.add_all_ta_features(df_full, open="open", high="high", low="low", close="close", volume="volume", fillna=True)
    if isinstance(df_full.columns, pd.MultiIndex):
        df_full.columns = ['_'.join(map(str, col)).strip().strip('_') for col in df_full.columns.values]
    df_full.columns = [str(col).lower().replace(' ', '_').replace('-', '_').replace(':', '_').replace('%', 'perc') for col in df_full.columns]

    # Ensure the required feature_columns exist in this df_full
    if not all(col in df_full.columns for col in feature_columns):
        missing_pred_features = set(feature_columns) - set(df_full.columns)
        raise ValueError(f"Features needed for prediction missing after re-processing: {missing_pred_features}")

    # Select the last row that has valid values for the chosen features
    last_valid_index = df_full[feature_columns].last_valid_index()

    if last_valid_index is not None:
        last_data_point = df_full.loc[[last_valid_index]][feature_columns]

        # Scale the last data point using the *same* scaler fitted on training data
        last_data_scaled = scaler.transform(last_data_point) # scaler expects a 2D array
        last_data_tensor = torch.tensor(last_data_scaled, dtype=torch.float32)

        # Make prediction
        model.eval()
        with torch.no_grad():
            prediction_prob = model(last_data_tensor)
            prediction_class = (prediction_prob > 0.5).int().item()

        print(f"Features for last available data point ({last_valid_index.date()}):")
        print(last_data_point)
        print(f"\nModel Output Probability: {prediction_prob.item():.4f}")
        print(f"Prediction for next day: {'UP' if prediction_class == 1 else 'DOWN'} ({prediction_class})")
    else:
         print("Could not find a valid last data point (index) with all required features in the re-processed data.")

except Exception as e:
    print(f"\nError during prediction phase: {e}")
    print("Could not make prediction for the next day.")


print("\nScript finished. Remember: This is a highly simplified example and NOT financial advice.")