<a href="https://colab.research.google.com/github/michaelHalloran21/Stock_Prediction_with_Machine_Learning/blob/main/Team_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Required libraries
!pip install yfinance pandas numpy matplotlib seaborn scikit-learn quantstats tqdm vectorbt


Collecting vectorbt
  Downloading vectorbt-0.27.1-py3-none-any.whl.metadata (12 kB)
Collecting dill (from vectorbt)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting dateparser (from vectorbt)
  Downloading dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting schedule (from vectorbt)
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Collecting mypy_extensions (from vectorbt)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading vectorbt-0.27.1-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.5/527.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.9-py3-none-any.whl (119 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m11

In [None]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import quantstats as qs
import vectorbt as vbt

from datetime import datetime
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Directory used for downloaded data; it is created if missing and reused if present
data_folder = "stock_data"
os.makedirs(name=data_folder, exist_ok=True)

In [None]:
# Function to download stock data
def download_data(ticker, start, end):
    """Download price history for one ticker and add a log-return column.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start: Start of the date range, passed through to ``yf.download``.
        end: End of the date range, passed through to ``yf.download``.

    Returns:
        DataFrame with single-level price columns plus ``Ticker`` and
        ``Log Returns``; rows containing any NaN are dropped. An empty
        DataFrame is returned when nothing was downloaded or an error occurred.
    """
    try:
        data = yf.download(ticker, start=start, end=end)
        if data.empty:
            print(f"No data returned for {ticker}")
            return pd.DataFrame()

        # Some yfinance releases label columns with a two-level (field, ticker)
        # MultiIndex even for a single symbol. Drop the ticker level so every
        # ticker yields the same flat columns ('Close', 'Adj Close', ...) and the
        # per-ticker frames can be stacked row-wise.
        if isinstance(data.columns, pd.MultiIndex):
            level = 'Ticker' if 'Ticker' in data.columns.names else -1
            data = data.droplevel(level, axis=1)

        if 'Adj Close' not in data.columns:
            data['Adj Close'] = data['Close']  # Fallback to Close if Adj Close is missing
        data['Ticker'] = ticker  # Add ticker column for identification
        # The first row has no previous price, so its log return is NaN and the
        # dropna() below removes it.
        data['Log Returns'] = np.log(data['Adj Close'] / data['Adj Close'].shift(1))
        return data.dropna()
    except Exception as e:
        # Best effort: one failing ticker must not abort a multi-ticker download.
        print(f"Error downloading data for {ticker}: {e}")
        return pd.DataFrame()  # Return empty DataFrame on failure

In [None]:
# Download and combine data
def download_and_combine_data(tickers, start, end):
    """Fetch every ticker via ``download_data`` and stack the non-empty results.

    Args:
        tickers: Iterable of ticker symbols.
        start: Start of the date range, forwarded to ``download_data``.
        end: End of the date range, forwarded to ``download_data``.

    Returns:
        The per-ticker frames concatenated row-wise, or an empty DataFrame
        when no ticker produced any rows.
    """
    frames = []
    progress = tqdm(tickers, desc="Downloading stock data")
    for symbol in progress:
        frame = download_data(symbol, start, end)
        if frame.empty:
            # Skip tickers whose download produced nothing.
            continue
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Fetch S&P 500 tickers
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# The first table on the page is read and its 'Symbol' column used.
# Wikipedia writes share classes with a dot (e.g. BRK.B) while Yahoo Finance
# uses a dash (BRK-B), so normalise the symbols before they are downloaded.
sp500_tickers = (
    pd.read_html(url)[0]['Symbol']
    .astype(str)
    .str.strip()
    .str.replace('.', '-', regex=False)
    .tolist()
)

In [None]:
# Date range for the download: fixed start, end is today's local date as YYYY-MM-DD
start_date = "2015-01-01"
end_date = f"{datetime.now():%Y-%m-%d}"

In [None]:
# Download and save data
print("Downloading stock data...")
stock_data = download_and_combine_data(sp500_tickers[:50], start_date, end_date)  # Limit to 50 for demo
if stock_data.empty:
    # Raise rather than call exit(): exit() ends the whole interpreter/kernel
    # session, whereas an exception stops the run here with a clear message.
    raise RuntimeError("No data downloaded.")

# Keep the row index in the file: download_data only adds 'Ticker' and
# 'Log Returns' columns, so the index is the only place the observation
# dates are stored and index=False would discard them.
stock_data.to_csv("stock_data/combined_stock_data.csv")
print("Data saved to 'stock_data/combined_stock_data.csv'")

Downloading stock data...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Data saved to 'stock_data/combined_stock_data.csv'


In [None]:
# Collapse tuple column labels such as ('Close', 'AAPL') into 'Close_AAPL',
# skipping empty levels. Plain string labels are kept as they are: iterating a
# string yields its characters, so joining them would turn 'Close' into
# 'C_l_o_s_e'.
stock_data.columns = [
    '_'.join(str(level) for level in col if level) if isinstance(col, tuple) else col
    for col in stock_data.columns
]

In [None]:
# Ensure no duplicate column names
if stock_data.columns.duplicated().any():
    print("Duplicate column names found. Renaming...")
    # Rename repeats as name, name.1, name.2, ... using only public APIs.
    # The earlier version went through a private pandas parser helper, which is
    # not a stable interface.
    taken = set()
    repeat_counts = {}
    deduped_columns = []
    for name in stock_data.columns:
        candidate = name
        # Keep bumping the suffix until the label is unused, so a generated
        # name can never collide with a column that already exists.
        while candidate in taken:
            repeat_counts[name] = repeat_counts.get(name, 0) + 1
            candidate = f"{name}.{repeat_counts[name]}"
        taken.add(candidate)
        deduped_columns.append(candidate)
    stock_data.columns = deduped_columns


In [None]:
def add_features(stock_data):
    """Return ``stock_data`` with moving-average, MACD and RSI columns added.

    All indicators are derived from the ``Adj Close`` column. When the frame
    also has a ``Ticker`` column, every rolling / exponentially weighted /
    differencing step is computed separately for each ticker, so the windows of
    one ticker never include prices of the ticker stacked above it.

    Args:
        stock_data: DataFrame with an ``Adj Close`` column and, optionally, a
            ``Ticker`` column identifying which rows belong together.

    Returns:
        A new DataFrame holding the input columns plus ``SMA_10``, ``SMA_50``,
        ``EMA_12``, ``EMA_26``, ``MACD`` and ``RSI``. Columns with these names
        that already exist are overwritten, so the function can be re-run on
        its own output. The input frame is not modified.
    """
    prices = stock_data['Adj Close']
    # Group keys are taken positionally (as an array) so that repeated index
    # labels, e.g. the same dates for several tickers, cannot misalign them.
    ticker_keys = stock_data['Ticker'].to_numpy() if 'Ticker' in stock_data.columns else None

    def per_ticker(series, func):
        """Apply func to the whole series, or to each ticker's slice if tickers are known."""
        if ticker_keys is None:
            return func(series)
        return series.groupby(ticker_keys, sort=False).transform(func)

    # Moving averages and MACD
    sma_10 = per_ticker(prices, lambda s: s.rolling(window=10).mean())
    sma_50 = per_ticker(prices, lambda s: s.rolling(window=50).mean())
    ema_12 = per_ticker(prices, lambda s: s.ewm(span=12, adjust=False).mean())
    ema_26 = per_ticker(prices, lambda s: s.ewm(span=26, adjust=False).mean())
    macd = ema_12 - ema_26

    # RSI from 14-row simple averages of gains and losses
    delta = per_ticker(prices, lambda s: s.diff(1))
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = per_ticker(gain, lambda s: s.rolling(window=14).mean())
    avg_loss = per_ticker(loss, lambda s: s.rolling(window=14).mean())
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    # assign() returns a copy; raw arrays are attached by position, which stays
    # correct even when the index contains repeated labels.
    return stock_data.assign(
        SMA_10=sma_10.to_numpy(),
        SMA_50=sma_50.to_numpy(),
        EMA_12=ema_12.to_numpy(),
        EMA_26=ema_26.to_numpy(),
        MACD=macd.to_numpy(),
        RSI=rsi.to_numpy(),
    )

In [None]:
# Add features
# Rebinds stock_data to the frame returned by add_features, i.e. the existing
# columns together with the indicator columns.
stock_data = add_features(stock_data)

In [None]:
# Add labels for machine learning
def define_labels(data):
    """Add a binary ``Target`` column: 1 when the next row's log return is positive.

    When the frame has a ``Ticker`` column, "next row" means the next row of
    the same ticker, so the last observation of one ticker is never labelled
    with the first return of the ticker stacked below it.

    Rows that have no following return (the last row overall, or the last row
    of each ticker) receive 0, because the comparison with a missing value is
    false.

    Args:
        data: DataFrame with a ``Log Returns`` column and, optionally, a
            ``Ticker`` column. The ``Target`` column is written to this frame
            in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    returns = data['Log Returns']
    if 'Ticker' in data.columns:
        # Positional group keys keep this correct when index labels repeat.
        next_returns = returns.groupby(data['Ticker'].to_numpy(), sort=False).shift(-1)
    else:
        next_returns = returns.shift(-1)
    data['Target'] = np.where(next_returns > 0, 1, 0)
    return data

In [None]:
# Add labels
# define_labels writes the binary 'Target' column and returns the frame.
stock_data = define_labels(stock_data)

In [None]:
# Model inputs: the indicator columns fed to the classifier
features = ['SMA_10', 'SMA_50', 'RSI', 'MACD']
# Keep only rows that have a log return
stock_data = stock_data[stock_data['Log Returns'].notna()]
# Training table: feature columns plus the label, without any row that has a gap,
# so features and target stay row-aligned
data = stock_data.loc[:, [*features, 'Target']].dropna()

In [None]:
# Split the training table into the feature matrix X and the label vector y
X, y = data[features], data['Target']

In [None]:
# Sanity check: the feature matrix and the label vector have the same number of rows
assert X.shape[0] == y.shape[0], "X and y are not aligned!"

In [None]:
# Chronological hold-out instead of a shuffled split. The features are rolling
# windows over neighbouring rows and the label is the next row's return, so a
# random split puts near-duplicates of test rows (and their outcomes) into the
# training set, and it leaves the test rows in an order that cannot be
# back-tested. Rows whose index label sorts before the 80% cut-off train the
# model; the remainder is the test set.
# NOTE(review): assumes the index labels are the observation dates - confirm.
split_label = X.index.sort_values()[int(len(X) * 0.8)]
is_train = X.index < split_label
X_train, X_test = X[is_train], X[~is_train]
y_train, y_test = y[is_train], y[~is_train]
stock_data = stock_data.loc[X.index]  # Align stock_data to training/testing indices

In [None]:
# Fit a 100-tree random forest on the training split (seeded for repeatability);
# fit() returns the estimator itself, and the bare name below displays it
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
model

In [None]:
# Score the model on the held-out rows and print the per-class report
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Wrap the raw predictions in a named Series that carries the test rows' index labels
predictions = pd.Series(data=y_pred, index=X_test.index).rename("Predicted_Signal")

In [None]:
# If an index label occurs more than once, keep only its first prediction
if predictions.index.has_duplicates:
    print("Duplicate indices found in predictions. Dropping duplicates...")
    predictions = predictions[~predictions.index.duplicated(keep='first')]

In [None]:
# Restrict both objects to the index labels they have in common
aligned_indices = predictions.index.intersection(stock_data.index)
predictions, stock_data = predictions.loc[aligned_indices], stock_data.loc[aligned_indices]


In [None]:
# Backtest
def backtest(stock_data, predictions):
    """Plot a signal-driven strategy against holding the asset, and return the data.

    The signal of one row decides whether the *next* row's return is earned
    (signal 1) or skipped (signal 0); this matches a label that describes the
    following row's return. Rows are processed in index order.

    Args:
        stock_data: DataFrame with a ``Log Returns`` column.
        predictions: Series of 0/1 signals whose index labels refer to rows of
            ``stock_data``. It is not modified.

    Returns:
        A copy of the rows of ``stock_data`` that have a prediction, sorted by
        index, with the added columns ``Strategy`` (per-row strategy log
        return), ``Cumulative Market Returns`` and
        ``Cumulative Strategy Returns`` (growth of 1 unit).

    Raises:
        ValueError: If the index labels do not select exactly one row of
            ``stock_data`` per prediction (e.g. repeated labels).
    """
    # Work only on labels present in both inputs, in sorted order so that
    # "previous row" in shift(1) really is the preceding observation.
    shared = predictions.index.intersection(stock_data.index)
    signals = predictions.loc[shared].sort_index()
    backtest_data = stock_data.loc[signals.index].copy()
    if len(backtest_data) != len(signals):
        raise ValueError("stock_data and predictions need unique, matching index labels")

    # Strategy returns based on predictions; there is no earlier signal for the
    # first row, so nothing is held there (return 0).
    held = signals.shift(1).fillna(0)
    backtest_data['Strategy'] = held * backtest_data['Log Returns']

    # Log returns add up over time: cumulative growth is exp(sum), not prod(1 + r).
    backtest_data['Cumulative Market Returns'] = np.exp(backtest_data['Log Returns'].cumsum())
    backtest_data['Cumulative Strategy Returns'] = np.exp(backtest_data['Strategy'].cumsum())

    # Plot cumulative returns
    plt.figure(figsize=(12, 6))
    plt.plot(backtest_data.index, backtest_data['Cumulative Strategy Returns'], label='Strategy', color='blue')
    plt.plot(backtest_data.index, backtest_data['Cumulative Market Returns'], label='Market', color='orange')
    plt.title('Backtest: Strategy vs. Market')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Returns')
    plt.legend()
    plt.grid()
    plt.show()

    return backtest_data

In [None]:
# Selecting stock_data by the prediction labels must yield exactly one row per prediction
assert stock_data.loc[predictions.index].shape[0] == predictions.shape[0], "Length mismatch between stock_data and predictions!"

In [None]:
# Run the updated backtest
# Draws the strategy-vs-market chart; backtested_data is bound to whatever backtest() returns.
backtested_data = backtest(stock_data, predictions)

In [None]:
# Use Quantstats for advanced analysis
# backtest() writes the strategy's per-row returns to a 'Strategy' column, and
# nothing in this notebook creates 'Strategy_Returns'. So use the backtest
# result when it is a frame with that column, and fall back to the previous
# lookup in stock_data otherwise.
if isinstance(backtested_data, pd.DataFrame) and 'Strategy' in backtested_data.columns:
    # 'Strategy' is a product of signals and log returns; expm1 turns those into
    # simple returns. NOTE(review): assumes quantstats expects simple returns - confirm.
    qs.reports.full(np.expm1(backtested_data['Strategy'].dropna()))
elif 'Strategy_Returns' in stock_data.columns:
    qs.reports.full(stock_data['Strategy_Returns'].dropna())
else:
    print("No Strategy_Returns found in stock_data for Quantstats analysis.")