# Stock Price Prediction (Research)

## Imports

In [None]:
import yfinance as yf
import pandas as pd
import datetime
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split



## Loading in the stock data

In [None]:
# Read the HTML tables from Wikipedia's S&P 500 constituents page and take
# the 'Symbol' column of the first table as the list of tickers.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
data = pd.read_html(url)
constituents = data[0]
sp500_tickers = constituents['Symbol'].tolist()


In [None]:
# Download window: from the start of 1990 up to today's date (YYYY-MM-DD).
start_date = '1990-01-01'
end_date = datetime.date.today().isoformat()

# Maps ticker symbol -> DataFrame holding that ticker's downloaded history.
sp500_data = {}

for ticker in sp500_tickers:
    # One Yahoo Finance request per ticker over the window defined above.
    data = yf.download(ticker, start=start_date, end=end_date)

    # Keep the result as a DataFrame, keyed by the ticker symbol.
    sp500_data[ticker] = pd.DataFrame(data)




In [None]:
# Preview the top rows of every downloaded frame, one ticker at a time.
for ticker, frame in sp500_data.items():
    print(f"First few rows of {ticker}:", frame.head(), sep="\n")


## Checking for missing values

In [None]:
# Report the per-column count of missing values for every ticker.
# The frames already stored in sp500_data are reused: downloading the same
# history from Yahoo Finance a second time would only repeat the network work
# without changing what is being checked.
for ticker, data_frame in sp500_data.items():
    # isnull().sum() yields, for each column, the number of missing entries
    missing_values = data_frame.isnull().sum()

    # missing values for each ticker
    print(f"Missing values for {ticker}:")
    print(missing_values)


In [None]:
# Inspect the 'Open' column of the AAPL frame.
aapl_frame = sp500_data['AAPL']
print(aapl_frame['Open'])

No timezone or price data was found for 'BRK.B' and 'BF.B', which is why we are going to remove these tickers.

In [None]:
#['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')
#['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 1990-01-01 -> 2024-02-11)')

In [None]:
# Remove the tickers whose download failed.
# sp500_data is keyed by the full scraped symbols (e.g. 'BRK.B'), so those
# are the names that must be listed; the shortened 'BRK' / 'BF' never match
# a key, which made pop() silently do nothing.
tickers_to_drop = ['BRK.B', 'BF.B']

for symbol in tickers_to_drop:
    # The None default keeps pop() from raising when the key is already gone
    sp500_data.pop(symbol, None)


In [None]:
# Show the dictionary keys, delete the unwanted tickers, then show the keys again.
print("Keys before removal:", sp500_data.keys())

# Symbols that are not present in the dictionary are simply skipped.
tickers_to_drop = ['BRK', 'BF','BRK.B','BF.B','IQV']
for symbol in tickers_to_drop:
    if symbol in sp500_data:
        del sp500_data[symbol]

print("Keys after removal:", sp500_data.keys())


In [None]:
# Confirm that none of the removed tickers is still a key of sp500_data.
# Chaining string literals with `and` evaluates to the last literal only
# ('IQV'), so every ticker is tested explicitly here. Prints False when all
# of them are gone.
tickers_to_check = ['BRK.B', 'BF.B', 'IQV']
print(any(symbol in sp500_data for symbol in tickers_to_check))

There are no missing values for any of the stock tickers. We can proceed.

## Visualization of Stock Prices

Before engineering the features and applying the machine learning models, we are going to plot the closing prices of each ticker to see the general movement of the stocks.

In [None]:
# Plot the closing-price history of every ticker still held in sp500_data.
# Iterating the dictionary (instead of the scraped sp500_tickers list) avoids
# a KeyError for tickers that were removed from sp500_data in earlier cells.
for ticker, data_frame in sp500_data.items():
    # closing prices for the current ticker
    closing_prices = data_frame['Close']

    # One figure per ticker
    plt.figure(figsize=(10, 6))
    plt.plot(closing_prices, label=ticker)
    plt.title(f"Closing Prices for {ticker}")
    plt.xlabel("Date")
    plt.ylabel("Closing Price")
    plt.legend()
    plt.grid(True)
    plt.show()

## Feature engineering

First of all, we are going to predict the stock prices by just simply using the available features. This will serve as a baseline for the upcoming improvements that will be applied.

In the following, a target variable column is created. By calculating the difference between the closing and opening prices of a stock, we aim to capture the daily price movement, which serves as the target variable for the machine learning algorithms. Shifting the target values by one row ensures that each day's target corresponds to the price movement on the subsequent trading day, which allows the predictive models to be trained to forecast future price changes from historical data.

Furthermore, we are going to use the target column to define a new column, 'Direction'. The direction column will indicate whether the price is going to go up (+1) or down (-1).

In [None]:
#for ticker, df in sp500_data.items():
    #sp500_data[ticker]['Target'] = sp500_data[ticker]['Adj Close'] - sp500_data[ticker]['Open']
    #sp500_data[ticker]['Target'] = sp500_data[ticker]['Target'].shift(-1)
    
    # Add a new column to indicate the direction of price change
    #sp500_data[ticker]['Direction'] = np.where(sp500_data[ticker]['Target'] > 0, 1, -1)


In [None]:
# Label every row with the direction of the next row's adjusted close.
for ticker, df in sp500_data.items():
    adj_close = df['Adj Close']
    # 'Tomorrow' holds the following row's adjusted close (NaN on the final row).
    df['Tomorrow'] = adj_close.shift(-1)
    # +1 when tomorrow's adjusted close is strictly higher than today's, otherwise -1.
    df['Target'] = np.where(df['Tomorrow'] > adj_close, 1, -1)

In [None]:
# Display the AAPL frame.
aapl_frame = sp500_data['AAPL']
print(aapl_frame)

We will use this as a basis for predicting the price movement for the next day.

### Scaling

We are going to scale the data using MinMaxScaler

The last row of each ticker has no next-day price, so its 'Tomorrow' value is NaN and the stock movement it should represent cannot be determined. We are going to remove that row from each ticker and then apply the models.

In [None]:
# One MinMaxScaler instance is reused; fit_transform refits it for each ticker.
scaler = MinMaxScaler()

for ticker, df in sp500_data.items():
    # Rows without a next-day price cannot be labelled, so drop them in place.
    df.dropna(subset=['Tomorrow'], inplace=True)

    # Nothing left to scale for this ticker.
    if df.empty:
        continue

    # Scale every numeric column except the label and the raw next-day price.
    # NOTE(review): the scaler is fit on each ticker's full history, i.e.
    # before any train/test split is made in later cells.
    numeric_cols = df.select_dtypes(include='number').columns.drop(['Target', 'Tomorrow'])
    scaled_values = scaler.fit_transform(df[numeric_cols])
    df[numeric_cols] = scaled_values


In [None]:
# Count NaN values per column, summed over every ticker's frame.
# Reading the leftover loop variable `df` would inspect only the last ticker
# that the previous loop happened to visit.
nan_counts = pd.DataFrame(
    {ticker: frame.isna().sum() for ticker, frame in sp500_data.items()}
).sum(axis=1)

# Print the number of NaN values in each column
print("Number of NaN values in each column:")
print(nan_counts)


In [None]:
# Remove, for each ticker, the rows that have no next-day price ('Tomorrow' is NaN).
# Dropping the last row unconditionally would discard a valid, labelled
# observation whenever the NaN row has already been removed, and would shave
# off one more row every time this cell is re-run. dropna makes the step
# idempotent.
for ticker, df in sp500_data.items():
    sp500_data[ticker] = df.dropna(subset=['Tomorrow'])

In [None]:
# Display the AAPL frame.
aapl_frame = sp500_data['AAPL']
print(aapl_frame)

In [None]:
# For every ticker, report whether its target column contains any NaN.
for ticker, df in sp500_data.items():
    # Features are every column except the label; y is the label itself.
    X = df.drop(columns=['Target'])
    y = df['Target']

    has_nan = y.isna().any()
    message = (
        f"NaN values found in the target variable for ticker {ticker}."
        if has_nan
        else f"No NaN values found in the target variable for ticker {ticker}."
    )
    print(message)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Dictionary to store, per ticker, the fitted model and its hold-out accuracy
models = {}

# Fit one Logistic Regression classifier per ticker
for ticker, df in sp500_data.items():
    # A ticker with no rows, or whose labels all belong to one class, cannot
    # be split and fitted; skip it instead of aborting the whole run.
    if df.empty or df['Target'].nunique() < 2:
        print(f"Skipping {ticker}: not enough data to train a classifier.")
        continue

    # Features are every column except the label and the raw next-day price
    # ('Tomorrow' is what the label was derived from).
    X = df.drop(columns=['Target','Tomorrow'])
    y = df['Target']

    # Chronological split: the first 80% of the rows train the model and the
    # remaining 20% evaluate it. Rows are assumed to be in date order (the
    # 'Tomorrow' shift relies on that as well); a shuffled split would test
    # the model on days that precede part of its training data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # The classifier needs both classes in the training portion
    if y_train.nunique() < 2:
        print(f"Skipping {ticker}: training split contains a single class.")
        continue

    # Initialize Logistic Regression model
    model = LogisticRegression()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Evaluate the model on the testing data
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Store the model and evaluation results
    models[ticker] = {'model': model, 'accuracy': accuracy}

# Access individual model and evaluation results
for ticker, info in models.items():
    print(f"Ticker: {ticker}, Accuracy: {info['accuracy']}")


## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Per-ticker fitted model and hold-out accuracy.
# NOTE: this rebinds `models`, so results stored under that name by an
# earlier cell are no longer reachable afterwards.
models = {}

# Fit one Random Forest classifier per ticker
for ticker, df in sp500_data.items():
    # A ticker with no rows, or whose labels all belong to one class, cannot
    # be split and fitted; skip it instead of aborting the whole run.
    if df.empty or df['Target'].nunique() < 2:
        print(f"Skipping {ticker}: not enough data to train a classifier.")
        continue

    # Features are every column except the label and the raw next-day price
    X = df.drop(columns=['Target','Tomorrow'])
    y = df['Target']

    # Chronological split: the first 80% of the rows train the model and the
    # remaining 20% evaluate it. Rows are assumed to be in date order (the
    # 'Tomorrow' shift relies on that as well); a shuffled split would test
    # the model on days that precede part of its training data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # The classifier needs both classes in the training portion
    if y_train.nunique() < 2:
        print(f"Skipping {ticker}: training split contains a single class.")
        continue

    # Initialize Random Forest Classifier model (seeded for reproducibility)
    model = RandomForestClassifier(n_estimators=10, random_state=42)

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Evaluate the model on the testing data
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Store the model and evaluation results
    models[ticker] = {'model': model, 'accuracy': accuracy}

# Access individual model and evaluation results
for ticker, info in models.items():
    print(f"Ticker: {ticker}, Accuracy: {info['accuracy']}")


In [None]:
# Collect the accuracy recorded for each ticker in `models`
accuracy_scores = [info['accuracy'] for info in models.values()]

# Calculate the average accuracy. An empty `models` dictionary would make the
# division raise ZeroDivisionError, so NaN is reported in that case instead.
if accuracy_scores:
    average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
else:
    average_accuracy = float('nan')

print(f"Average Accuracy: {average_accuracy}")


## LSTM

Next, we are going to apply LSTM. 

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

# Data Preparation
# The sliding windows built below are only meaningful when consecutive rows
# are consecutive in time. Take a chronological 80/20 split directly from X
# (the feature frame of the last ticker processed in the previous cell), so
# the rows are contiguous regardless of how X_train / X_test were produced.
split_at = int(len(X) * 0.8)

# Scale the data: fit on the training portion only, then apply to the test portion
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X.iloc[:split_at])
X_test_scaled = scaler.transform(X.iloc[split_at:])

# Convert data to sequences
def create_sequences(data, sequence_length):
    """Slice *data* into overlapping windows and their next-step targets.

    Window ``i`` is ``data[i:i + sequence_length]`` and its target is the row
    that immediately follows it, ``data[i + sequence_length]``.

    Args:
        data: Array-like whose first axis is the step axis, e.g. shape
            ``(n_steps, n_features)`` or ``(n_steps,)``.
        sequence_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. With ``n = len(data) -
        sequence_length`` windows, ``X`` has shape
        ``(n, sequence_length, *data.shape[1:])`` and ``y`` has shape
        ``(n, *data.shape[1:])``. When *data* is too short to form a single
        window, both arrays are empty but keep these trailing dimensions, so
        code reading ``X.shape[2]`` keeps working.

    Raises:
        ValueError: If *sequence_length* is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    data = np.asarray(data)
    row_shape = data.shape[1:]
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # Not enough rows for even one window: return correctly shaped empties
        # rather than a shape-(0,) array that loses the feature dimension.
        empty_X = np.empty((0, sequence_length) + row_shape, dtype=data.dtype)
        empty_y = np.empty((0,) + row_shape, dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[i:i + sequence_length] for i in range(n_windows)])
    # The target of window i is row i + sequence_length, i.e. the tail of data;
    # copy so the result does not alias the caller's array.
    y = data[sequence_length:].copy()
    return X, y

sequence_length = 60  # Define the length of input sequences
X_train_seq, y_train_seq = create_sequences(X_train_scaled, sequence_length)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, sequence_length)

# create_sequences returns the whole next-step feature row as the target,
# while the network below ends in a single output unit. Keep only the scaled
# 'Close' value so the targets have one value per sample, matching Dense(units=1).
# Assumes the scaled arrays keep the column order of X_train and that 'Close'
# is a plain (non-MultiIndex) column - TODO confirm.
close_idx = X_train.columns.get_loc('Close')
y_train_seq = y_train_seq[:, close_idx]
y_test_seq = y_test_seq[:, close_idx]

# Model Architecture: three stacked LSTM layers, each followed by dropout.
# The first two return the full sequence so the next LSTM receives one
# vector per time step; the last returns only its final state.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(sequence_length, X_train_seq.shape[2])),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(units=1)  # Output layer: one predicted (scaled) closing value
])

# Model Compilation
model.compile(optimizer='adam', loss='mean_squared_error')

# Model Training
model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=32)

# Model Evaluation (mean squared error on the held-out windows)
loss = model.evaluate(X_test_seq, y_test_seq)

# Prediction (values are in the scaled units of the 'Close' column)
predictions = model.predict(X_test_seq)
