In [None]:
import random

from sklearn.metrics import mean_squared_error
# Imports 
import matplotlib.pyplot as plt
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy as np
from tensorflow.python.ops.gen_nn_ops import LeakyRelu


In [None]:
def plot_training_history(history):
    """Plot the training and validation loss curves stored in ``history``.

    Args:
        history: Object whose ``history`` attribute is a mapping that holds
            the per-epoch ``'loss'`` and ``'val_loss'`` sequences (e.g. the
            value returned by a Keras ``model.fit`` call).
    """
    curves = (
        ('loss', 'Training Loss'),
        ('val_loss', 'Validation Loss'),
    )

    plt.figure(figsize=(12, 6))
    for metric_key, curve_label in curves:
        plt.plot(history.history[metric_key], label=curve_label)

    plt.title('Loss Over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss (MSE)')
    plt.legend()
    plt.grid(True)
    plt.show()


def check_stationarity(series):
    """Run an Augmented Dickey-Fuller test on ``series`` and print the verdict.

    Prints the ADF statistic, the p-value, and whether the series is treated
    as stationary (p-value <= 0.05) or not.

    Args:
        series: Sequence of observations handed to ``adfuller`` unchanged.
    """
    # adfuller returns a tuple whose first two entries are the test statistic
    # and the p-value; the remaining entries are not used here.
    statistic, p_value = adfuller(series)[:2]

    print("ADF Statistic:", statistic)
    print("p-value:", p_value)

    verdict = "stationary" if p_value <= 0.05 else "not stationary"
    print(f"The series is {verdict}.")

from sklearn.preprocessing import MinMaxScaler
import numpy as np

def window_generator(data, feature_columns, target_column, input_size, output_size, stride):
    """Min-max scale a frame and cut it into sliding input/target windows.

    The frame is sorted by its index, two fresh ``MinMaxScaler`` objects are
    fitted (one on the feature columns, one on the target columns), and the
    scaled values are sliced into consecutive windows: each input window holds
    ``input_size`` rows and its target holds the ``output_size`` rows that
    immediately follow it.

    Args:
        data: DataFrame holding both the feature and the target columns.
        feature_columns: Column name, or list-like of column names, used as
            model inputs.
        target_column: Column name, or list-like of column names, to predict.
        input_size: Number of rows in each input window.
        output_size: Number of rows in each target window.
        stride: Step between the starts of consecutive windows.

    Returns:
        Tuple ``(X, y, feature_scaler, target_scaler)``. With at least one
        window, ``X`` has shape ``(n_windows, input_size, n_features)`` and
        ``y`` has shape ``(n_windows, output_size, n_targets)``. When ``data``
        is shorter than ``input_size + output_size`` both arrays are empty.
        The fitted scalers are returned so the same scaling can be applied to
        other data and inverted on predictions.
    """
    # A bare column name would select a 1-D Series, which MinMaxScaler rejects
    # (it requires 2-D input); wrap it so a DataFrame is always selected.
    if isinstance(feature_columns, str):
        feature_columns = [feature_columns]
    if isinstance(target_column, str):
        target_column = [target_column]

    data = data.sort_index()

    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()

    scaled_features = feature_scaler.fit_transform(data[feature_columns])
    scaled_target = target_scaler.fit_transform(data[target_column])

    X = []
    y = []

    # The last valid start leaves room for a full input AND a full target window.
    for start in range(0, len(data) - input_size - output_size + 1, stride):
        end_input = start + input_size
        end_output = end_input + output_size

        X.append(scaled_features[start:end_input])
        y.append(scaled_target[end_input:end_output])

    return np.array(X), np.array(y), feature_scaler, target_scaler



def test_window_generator(test_data, feature_columns, target_column, input_size, output_size, stride, feature_scaler, target_scaler):
    X_test = []
    y_test = []

    test_data = test_data.sort_index()

    scaled_features = feature_scaler.transform(test_data[feature_columns])
    scaled_target = target_scaler.transform(test_data[target_column])

    for start in range(0, len(test_data) - input_size - output_size + 1, stride):
        end_input = start + input_size
        end_output = end_input + output_size

        X_test.append(scaled_features[start:end_input])
        y_test.append(scaled_target[end_input:end_output])

    return np.array(X_test), np.array(y_test)



In [None]:
# Read the ticker symbols, one per line. Blank lines are skipped so they are
# not sent to yfinance as empty symbols, and repeated symbols are collapsed
# (first occurrence wins) because duplicate (Date, Ticker) rows would make the
# pivot step later in this notebook fail.
with open("tickers.txt", "r") as file:
    tickers = list(dict.fromkeys(line.strip() for line in file if line.strip()))

# Start the output CSV with its header row; per-ticker rows are appended below.
# NOTE: the header keeps the leading space in " Volume" on purpose -- the pivot
# step later in this notebook selects the column under that exact name.
output_file = "sp500_data.csv"
with open(output_file, "w") as f:
    f.write("Ticker,Date, Volume,Price_Change\n")

# Minimum number of rows a ticker must have to be kept: about 9 years' worth
# (assuming ~252 trading days per year) out of the 10-year window requested.
expected_days = 252 * 9

for ticker in tickers:
    try:
        print(f"Fetching data for {ticker}...")
        stock = yf.Ticker(ticker)
        index = stock.history(start="2010-01-01", end="2020-01-01")

        if index.empty:
            print(f"No data for {ticker}. Skipping...")
            continue

        if len(index) < expected_days:
            print(f"Incomplete data for {ticker} ({len(index)} days). Skipping...")
            continue

        # Fractional day-over-day change of the close (first row is NaN).
        index['Price_Change'] = index['Close'].pct_change()

        # Turn the date index into a regular 'Date' column before writing.
        index.reset_index(inplace=True)
        index['Ticker'] = ticker
        index[['Ticker', 'Date', 'Volume', 'Price_Change']].to_csv(
            output_file, mode='a', header=False, index=False
        )

    except Exception as e:
        # Best effort: one failing ticker must not abort the whole download.
        print(f"Failed to fetch data for {ticker}: {e}")
print(f"Data collection complete! Saved to {output_file}.")

In [None]:
# Load the per-ticker rows collected earlier and report their dimensions.
clean_data = pd.read_csv("sp500_data.csv")
print(clean_data.shape)

# Per-column count of missing values, showing only columns that have any.
missing_summary = clean_data.isna().sum()
has_missing = missing_summary > 0
print(missing_summary.loc[has_missing])

# Distinct ticker symbols present in the file, and how many there are.
unique_tickers = clean_data['Ticker'].unique()
print(f"Number of unique tickers: {len(unique_tickers)}")

In [None]:

# Download daily S&P 500 index (^GSPC) history for 2010-2019.
data = yf.download("^GSPC", start="2010-01-01", end="2020-01-01")

# Select relevant columns. Take an explicit copy: adding a column to a bare
# column selection of `data` triggers pandas' SettingWithCopyWarning.
index_data = data[["Volume", "Close"]].copy()

# Fractional day-over-day change of the close (first row is NaN).
index_data['Price_Change'] = index_data['Close'].pct_change()

# Flatten the column labels to plain names (the download may come back with
# MultiIndex columns).
index_data.columns = ['Volume', 'Close', 'Price_Change']

# Check for rows with NaN values
print("Rows with NaN values:")
print(index_data[index_data.isna().any(axis=1)])

# Drop rows where any column has NaN
index_data = index_data.dropna(how='any')

# Save cleaned data to CSV, keeping the date index as the first column
index_data.to_csv("sp500_index_volume.csv", mode='w', header=True, index=True)

# Display the cleaned data
print("Cleaned Data:")
print(index_data.head())

In [None]:
from matplotlib.ticker import FuncFormatter


def format_large_values(x, pos):
    """Format a tick value as a truncated integer with thousands separators.

    Args:
        x: Tick value; truncated toward zero with ``int``.
        pos: Tick position. Part of the ``FuncFormatter`` callback signature
            but not used.

    Returns:
        The integer part of ``x`` with ``,`` separators followed by ``$``.
    """
    whole_part = int(x)
    return format(whole_part, ',') + '$'

# Two stacked panels sharing the date axis: volume on top, close price below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(18, 12), sharex=True)

# --- Volume panel, with a dashed line at the mean volume ---
# NOTE(review): format_large_values appends "$" although this axis shows
# volume -- confirm that suffix is intended.
ax1.plot(index_data.index, index_data['Volume'].values, label="Volume", linewidth=0.8, color='blue')
ax1.set_title("S&P 500 Index Volume (2010-2020)", fontsize=14)
ax1.set_ylabel("Volume", fontsize=12)
ax1.grid(True, linestyle="--", alpha=0.6)
ax1.yaxis.set_major_formatter(FuncFormatter(format_large_values))
ax1.axhline(y=index_data['Volume'].values.mean(), color='red', linestyle='--', label="Mean Volume Over Period")
ax1.legend()

# --- Close price panel, with a dashed line at the mean close ---
ax2.plot(index_data.index, index_data['Close'].values, label="Close Price", linewidth=0.8, color='green')
# The mean line here is the mean CLOSE, so label it as such (it was previously
# labelled as the mean volume).
ax2.axhline(y=index_data['Close'].values.mean(), color='red', linestyle='--', label="Mean Close Price Over Period")
ax2.set_title("S&P 500 Index Close Price (2010-2020)", fontsize=14)
ax2.set_xlabel("Date", fontsize=12)
ax2.set_ylabel("Close Price", fontsize=12)
ax2.grid(True, linestyle="--", alpha=0.6)
ax2.legend()

plt.tight_layout()
plt.show()

In [None]:
# Display the (rows, columns) of the index frame and of the per-ticker frame.
index_data.shape, clean_data.shape

In [None]:
# Reshape the long per-ticker rows into a wide frame: one row per Date and one
# column per (value, Ticker) pair. ' Volume' carries a leading space because
# that is how the column is named in the CSV header written earlier.
ticker_data = clean_data.pivot(index='Date', columns='Ticker', values=[' Volume', 'Price_Change'])

# Flatten the (value, ticker) column tuples into "<value>_<ticker>" names;
# strip() removes the leading space inherited from ' Volume'.
ticker_data.columns = ['_'.join(col).strip() for col in ticker_data.columns.values]

ticker_data = ticker_data.reset_index()

# errors='coerce' turns unparseable dates into NaT instead of raising.
ticker_data['Date'] = pd.to_datetime(ticker_data['Date'], errors='coerce')  # Handle malformed dates gracefully


ticker_data = ticker_data.set_index('Date')
# NOTE(review): this preview is not the cell's last expression, so its result
# is discarded and nothing is displayed.
ticker_data.head(2)

# Drop rows whose date failed to parse (NaT index entries).
ticker_data = ticker_data[~ticker_data.index.isna()]

# NOTE: the CSV is written BEFORE the index normalisation and dropna below, so
# the file still contains rows with missing values.
ticker_data.to_csv("ticker_data.csv")

# Normalise the index to UTC, then keep only the calendar date of each entry.
ticker_data.index = pd.to_datetime(ticker_data.index, utc=True)
ticker_data.index = ticker_data.index.date
# Keep only dates on which every column has a value.
ticker_data.dropna(how='any', inplace=True)


In [None]:
import matplotlib.pyplot as plt

# Daily volume against daily price change. Price_Change holds the fractional
# returns produced by pct_change(), so scale by 100 to match the "(%)" in the
# title and axis label.
plt.figure(figsize=(10, 6))
plt.scatter(index_data["Volume"], index_data["Price_Change"] * 100, alpha=0.5)
plt.title("Volume vs. Price Change (%)", fontsize=14)
plt.xlabel("Volume", fontsize=12)
plt.ylabel("Price Change (%)", fontsize=12)
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()

**High Volume and High Price Change**

Outliers at the top-right or bottom-right of the graph (large volume combined with significant price changes) could indicate market-moving events.

**Market Stability**

The concentration of points near the origin suggests that most days see stable prices and moderate trading activity.

**Event-Driven Volatility**

The outliers (large price changes or unusually high volumes) may help you identify event-driven periods that warrant further investigation.


In [None]:
# Reload the saved index data and run the ADF stationarity check on the
# volume series first, then on the price-change series.
index_datas = pd.read_csv('sp500_index_volume.csv')

for series_name in ('Volume', 'Price_Change'):
    check_stationarity(index_datas[series_name].values)

In [None]:
# Build 60-row input windows and 3-row target windows (stride 1) using every
# column of ticker_data as a feature and Price_Change_AAPL as the only target.
# NOTE(review): the scalers returned here are fitted on ALL of ticker_data.
X,y,feature_scaler, target_scaler = window_generator(ticker_data,ticker_data.columns ,['Price_Change_AAPL'],60,3,1)

In [None]:
# Display the shapes of the windowed inputs and targets.
X.shape, y.shape

In [None]:
print("Input shape (X):", X.shape)  # (n_windows, input_size, n_feature_columns)
print("Target shape (y):", y.shape) # (n_windows, output_size, n_target_columns)

In [None]:
# Load the saved index data and index it by date so it can be resampled.
file_path = 'sp500_index_volume.csv'
index_volume = pd.read_csv(file_path)

index_volume['Date'] = pd.to_datetime(index_volume['Date'])
index_volume.set_index('Date', inplace=True)

# Total traded volume per calendar week.
# NOTE: despite its name this series is WEEKLY ('1W'), not bi-weekly; the name
# is kept so any later use of it keeps working.
biweekly_data = index_volume['Volume'].resample('1W').sum()

train_data = biweekly_data['2010-01-01':'2015-01-01']
test_data = biweekly_data['2015-01-01':'2019-12-31']

# ARIMA(1, 0, 0): a single autoregressive term, no differencing, no MA term.
fixed_order = (1, 0, 0)
initial_model = ARIMA(train_data, order=fixed_order)
fitted_model = initial_model.fit()

# Walk forward through the test period: forecast one step ahead, then reveal
# the actual value for that step and refit before forecasting the next one.
predictions_recursive = []
train_data_dynamic = train_data.copy()

for i in range(len(test_data)):
    forecast = fitted_model.get_forecast(steps=1)
    predicted_value = forecast.predicted_mean.iloc[0]
    predictions_recursive.append(predicted_value)

    # Update training data with the actual value from the test set
    new_data = test_data.iloc[i:i+1]
    train_data_dynamic = pd.concat([train_data_dynamic, new_data])

    # Refit the model using the updated training data
    fitted_model = ARIMA(train_data_dynamic, order=fixed_order).fit()

# Convert predictions to a pandas Series aligned with the test dates
predictions_recursive = pd.Series(predictions_recursive, index=test_data.index)


# Evaluate predictions
mse_recursive = mean_squared_error(test_data, predictions_recursive)
print(f"Mean Squared Error (Recursive ARMA): {mse_recursive}")

# Plot results
plt.figure(figsize=(12, 6))
plt.plot(train_data, label="Training Data")
plt.plot(test_data, label="Actual Test Data", color="green")
plt.plot(predictions_recursive, label="Recursive Predictions", color="red")
# The series is resampled weekly, so the title says so.
plt.title("Recursive ARMA Model Predictions (Weekly Volume)")
plt.xlabel("Date")
plt.ylabel("Volume")
plt.legend()
plt.grid()
plt.show()

# Close price over the whole period. The plotted column is 'Close', so the
# title and legend name it as such rather than "Price Change".
plt.figure(figsize=(12, 6))
index_volume['Close'].plot(title="Close Price Over Time", color='blue', label='Close')
plt.xlabel("Date")
plt.ylabel("Close")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Fractional change between consecutive predicted values (first entry is NaN).
predictions_recursive.pct_change()


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed, Input, LeakyReLU
from keras.optimizers import Adam


# Stacked-LSTM regressor, assembled layer by layer.
model = Sequential()

# Each sample is a window of 5 time steps with 882 features; this must match
# the windowed data the model is later fitted on.
model.add(Input(shape=(5, 882)))

# First recurrent layer emits the full sequence so a second LSTM can consume it.
model.add(LSTM(25, activation='tanh', return_sequences=True))
model.add(Dropout(0.2))

# Second recurrent layer keeps only its final state.
model.add(LSTM(10, activation='tanh', return_sequences=False))
model.add(Dropout(0.2))

# Linear head producing 2 values per sample.
model.add(Dense(2, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['MAE'])
model.summary()

In [None]:
# All wide-format column names, of the form "Volume_<TICKER>" and
# "Price_Change_<TICKER>".
columns = ticker_data.columns.to_list()

# The ticker is whatever follows the LAST underscore. Splitting on the first
# underscores instead would yield "Change" for every "Price_Change_<TICKER>"
# column and could then be sampled as if it were a ticker. Sorting makes the
# list order deterministic (set iteration order is not).
tickers = sorted({col.rsplit('_', 1)[-1] for col in columns})

# Pick two tickers at random as prediction targets.
# NOTE(review): no seed is set, so the chosen targets differ between runs.
random_tickers = random.sample(tickers, 2)

X_columns = [f"Volume_{ticker}" for ticker in tickers] + [f"Price_Change_{ticker}" for ticker in tickers]
y_columns = [f"Price_Change_{ticker}" for ticker in random_tickers]

ticker_data.index = pd.to_datetime(ticker_data.index)

# Chronological split. The training slice ends the day before the test slice
# starts so that no date can ever fall into both (label slices are inclusive).
train_data = ticker_data.loc['2010-01-01':'2017-12-31']
test_data = ticker_data.loc['2018-01-01':'2020-01-01']

# Windows of 5 input rows -> 1 target row, stride 1. The scalers are fitted on
# the training slice only and reused, unchanged, for the test slice.
X_train, y_train, feature_scaler, target_scaler = window_generator(train_data, columns, y_columns, 5, 1, 1)
X_test, y_test = test_window_generator(test_data, columns, y_columns, 5, 1, 1, feature_scaler, target_scaler)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from keras.src.callbacks import Callback
from keras.callbacks import ReduceLROnPlateau

# Halve the learning rate when validation loss has not improved for 2 epochs,
# never going below 1e-6.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

# window_generator returns targets shaped (samples, output_size, n_targets),
# whereas the model's final Dense layer emits (samples, 2). Flatten the targets
# to 2-D so the loss compares each prediction with its own target; a rank-3
# target against a rank-2 prediction would be broadcast across the batch
# instead of being compared element-wise.
y_train_flat = y_train.reshape(y_train.shape[0], -1)

history = model.fit(X_train, y_train_flat, epochs=10, validation_split=0.2, callbacks=[lr_scheduler])

In [None]:
# Run the model on the held-out windows.
y_pred = model.predict(X_test)

# Collapse every axis after the first so the ground truth is 2-D like y_pred.
n_test_windows = y_test.shape[0]
y_test_reshaped = y_test.reshape(n_test_windows, -1)

# Map predictions and ground truth back to the targets' original scale.
y_pred_original = target_scaler.inverse_transform(y_pred)
y_test_original = target_scaler.inverse_transform(y_test_reshaped)

# Score the predictions with a single MSE over all targets.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test_original.ravel(), y_pred_original.ravel())
print(f"Mean Squared Error: {mse}")

# Overlay true and predicted series for every target on one figure.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
for i, (true_series, predicted_series) in enumerate(zip(y_test_original.T, y_pred_original.T)):
    plt.plot(true_series, label=f'True Values - Target {i + 1}')
    plt.plot(predicted_series, label=f'Predicted Values - Target {i + 1}', linestyle='dashed')
plt.legend()
plt.show()

# Tabular views of the same values, one column per target.
ticker_labels = ["Ticker 0", "Ticker 1"]
predictions_df = pd.DataFrame(y_pred_original, columns=ticker_labels)
targets_df = pd.DataFrame(y_test_original, columns=ticker_labels)



In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Wrap predictions and ground truth in DataFrames, one column per target.
target_labels = ["Ticker 0", "Ticker 1"]
predictions_df = pd.DataFrame(y_pred_original, columns=target_labels)
targets_df = pd.DataFrame(y_test_original, columns=target_labels)

# One actual-vs-predicted figure per target column.
for target_label in target_labels:
    plt.figure(figsize=(20, 6))
    plt.plot(targets_df[target_label], label=f"Actual Values - {target_label}", color="blue")
    plt.plot(predictions_df[target_label], label=f"Predicted Values - {target_label}", linestyle="dashed", color="orange")
    plt.title(f"{target_label}: Actual vs Predicted")
    plt.xlabel("Time Steps")
    plt.ylabel("Values")
    plt.legend()
    plt.show()


In [None]:
# Display the predictions after inverse scaling (the full array is shown).
y_pred_original

In [None]:

import matplotlib.pyplot as plt

# Extract the per-epoch loss and MAE curves from the history object.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
# NOTE: despite the "mape" in their names, these two hold the MAE metric
# (keys 'MAE' / 'val_MAE'), not a percentage error.
train_mape = history.history['MAE']
val_mape = history.history['val_MAE']

# Plot Loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label='Training Loss', marker='o')
plt.plot(val_loss, label='Validation Loss', marker='x')
plt.title('Training and Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

# Plot MAE
plt.figure(figsize=(10, 6))
plt.plot(train_mape, label='Training MAE', marker='o')
plt.plot(val_mape, label='Validation MAE', marker='x')
plt.title('Training and Validation MAE over Epochs')
plt.xlabel('Epochs')
plt.ylabel('MAE')
plt.legend()
plt.grid(True)
plt.show()