In [None]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM
import joblib

def download_and_clean_stock_data(stock):
    """
    Download daily price history for one or more tickers and save it as CSV.

    Each ticker is written to ``raw_stock_data/<TICKER>.csv`` with the columns
    Date, Open, High, Low, Close, Volume and Ticker. The directory is created
    if it does not exist.

    Args:
        stock: A single ticker symbol (e.g. ``"AAPL"``) or an iterable of
            ticker symbols.
    """
    output_dir = "raw_stock_data"
    os.makedirs(output_dir, exist_ok=True)

    # Use the argument instead of a module-level list, so the function works
    # on its own and a per-ticker call downloads only that ticker.
    tickers = [stock] if isinstance(stock, str) else list(stock)

    def save_cleaned_data(data, ticker, output_path):
        # reset_index() returns a new frame, so the caller's data is untouched.
        data = data.reset_index()
        # yfinance may return two-level (field, ticker) columns. Keeping only
        # the field level yields a single CSV header row, so no data row has
        # to be dropped after the fact.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)
        # .copy() makes the selection independent of the source frame, which
        # avoids pandas' SettingWithCopyWarning on the assignment below.
        data = data[["Date", "Open", "High", "Low", "Close", "Volume"]].copy()
        data["Ticker"] = ticker
        data.to_csv(output_path, index=False)

    for ticker in tickers:
        data = yf.download(ticker, start="2015-01-01", end="2025-01-01")
        output_path = os.path.join(output_dir, f"{ticker}.csv")
        save_cleaned_data(data, ticker, output_path)

def preprocess_data(stock=None):
    """
    Add derived features to raw stock CSVs and save the processed result.

    Reads ``raw_stock_data/<TICKER>.csv``, converts ``Date`` to datetime,
    sorts by date, adds ``Daily Return``, ``5-Day Moving Avg`` and
    ``10-Day Volatility`` (all computed from ``Close``), drops rows containing
    NaN, and writes ``processed_stock_data/<TICKER>_processed.csv``.

    Args:
        stock: A ticker symbol, an iterable of ticker symbols, or ``None`` to
            process every ``.csv`` file found in ``raw_stock_data``.

    Raises:
        FileNotFoundError: If a requested ticker has no raw CSV file.
    """
    data_dir = "raw_stock_data"
    output_dir = "processed_stock_data"
    os.makedirs(output_dir, exist_ok=True)

    def preprocess_stock_data(file_path):
        data = pd.read_csv(file_path)
        # Rolling statistics below are order-dependent, so sort by date first.
        data["Date"] = pd.to_datetime(data["Date"])
        data = data.sort_values(by="Date")
        data["Daily Return"] = data["Close"].pct_change()
        data["5-Day Moving Avg"] = data["Close"].rolling(window=5).mean()
        data["10-Day Volatility"] = data["Close"].rolling(window=10).std()
        # The first rows have no complete rolling window and hold NaN.
        return data.dropna()

    # Only process what was asked for; previously the argument was ignored and
    # every file was reprocessed on every call.
    if stock is None:
        stock_names = sorted(
            os.path.splitext(file_name)[0]
            for file_name in os.listdir(data_dir)
            if file_name.endswith(".csv")
        )
    elif isinstance(stock, str):
        stock_names = [stock]
    else:
        stock_names = list(stock)

    for stock_name in stock_names:
        stock_file_path = os.path.join(data_dir, f"{stock_name}.csv")
        if not os.path.isfile(stock_file_path):
            raise FileNotFoundError(
                f"No raw data found for {stock_name!r} at {stock_file_path}"
            )
        processed_data = preprocess_stock_data(stock_file_path)
        output_path = os.path.join(output_dir, f"{stock_name}_processed.csv")
        processed_data.to_csv(output_path, index=False)

def load_stock_data(file_path):
    """
    Read a processed stock CSV and build the model inputs and labels.

    The label for each row is the ``Close`` value of the following row, so the
    final row (which has no following row) is excluded from both outputs.

    Returns:
        A ``(features, target)`` pair: a DataFrame with the seven feature
        columns and a Series of next-row closing prices.
    """
    feature_columns = [
        "Open",
        "High",
        "Low",
        "Close",
        "Volume",
        "5-Day Moving Avg",
        "10-Day Volatility",
    ]
    frame = pd.read_csv(file_path)
    next_close = frame["Close"].shift(-1)
    # The shift leaves NaN in the last position; trim it from both outputs.
    return frame[feature_columns].iloc[:-1], next_close.iloc[:-1]

def train_model(ticker, epochs=1000, batch_size=32):
    """
    Train an LSTM regressor on one ticker's processed data and save it.

    Reads ``processed_stock_data/<ticker>_processed.csv`` via
    ``load_stock_data``, holds out 20% of the rows for evaluation, fits a
    two-layer LSTM followed by a single-unit Dense layer, and writes the model
    to ``models/<ticker>_model.joblib``.

    Args:
        ticker: Ticker symbol whose processed CSV should be used.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.
    """
    processed_data_dir = "processed_stock_data"
    model_dir = "models"
    # Create the output directory up front, so a missing folder fails before
    # training starts rather than after it finishes.
    os.makedirs(model_dir, exist_ok=True)

    stock_file_path = os.path.join(processed_data_dir, f"{ticker}_processed.csv")
    features, target = load_stock_data(stock_file_path)

    # NOTE(review): this is a shuffled split on time-ordered rows, so the test
    # rows are interleaved with training rows; consider shuffle=False if the
    # test loss is meant to reflect out-of-sample performance.
    X_train, X_test, y_train, y_test = train_test_split(
        features.values, target.values, test_size=0.2, random_state=42
    )

    # Reshape to (samples, timesteps, features): each feature column becomes
    # one timestep carrying a single value.
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(optimizer="adam", loss="mean_squared_error")

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

    # Evaluate on the held-out split; the returned loss is not used further.
    model.evaluate(X_test, y_test)

    # NOTE(review): joblib pickles the Keras object. The format is kept so
    # existing consumers of the .joblib files keep working, but Keras' own
    # model.save() is the documented way to persist models.
    joblib.dump(model, os.path.join(model_dir, f"{ticker}_model.joblib"))

def evaluate_model(ticker="AAPL", model_path=None):
    """
    Evaluate a saved model on one ticker's processed data.

    The last 10 usable rows are held back as an evaluation window; error
    metrics are computed on all earlier rows, and predictions are produced for
    the held-back rows.

    Args:
        ticker: Ticker symbol whose processed CSV (and, by default, whose
            trained model) is used.
        model_path: Optional explicit model file. When omitted,
            ``models/<ticker>_model.joblib`` (the file ``train_model`` writes)
            is used if it exists, otherwise the legacy
            ``stock_price_predictor.h5``.

    Returns:
        A ``(metrics, results)`` pair. ``metrics`` is a dict with the keys
        ``"mse"``, ``"mae"`` and ``"r2"``. ``results`` is a DataFrame with the
        columns ``Index``, ``Actual Price`` and ``Predicted Price`` for the
        evaluation window.
    """
    processed_data_dir = "processed_stock_data"
    legacy_model_path = "stock_price_predictor.h5"
    eval_rows = 10

    if model_path is None:
        trained_path = os.path.join("models", f"{ticker}_model.joblib")
        model_path = trained_path if os.path.exists(trained_path) else legacy_model_path

    if str(model_path).endswith(".joblib"):
        # joblib.load unpickles the file: only load model files you trust.
        model = joblib.load(model_path)
    else:
        model = load_model(model_path)

    # Reuse the same feature/target construction that training uses.
    stock_file_path = os.path.join(processed_data_dir, f"{ticker}_processed.csv")
    all_features, all_target = load_stock_data(stock_file_path)

    # Hold back the final rows as the evaluation window.
    eval_features = all_features.iloc[-eval_rows:]
    eval_target = all_target.iloc[-eval_rows:]
    features = all_features.iloc[:-eval_rows]
    target = all_target.iloc[:-eval_rows]

    # Reshape to (samples, timesteps, features), matching train_model.
    X_train = np.reshape(features.values, (features.shape[0], features.shape[1], 1))
    X_eval = np.reshape(eval_features.values, (eval_features.shape[0], eval_features.shape[1], 1))

    # Metrics on the rows preceding the evaluation window.
    predicted_train = model.predict(X_train).flatten()
    metrics = {
        "mse": mean_squared_error(target, predicted_train),
        "mae": mean_absolute_error(target, predicted_train),
        "r2": r2_score(target, predicted_train),
    }

    # Predictions for the held-back rows, next to the actual next-row closes.
    predicted_eval = model.predict(X_eval)
    results = pd.DataFrame({
        "Index": eval_features.index,
        "Actual Price": eval_target.values,
        "Predicted Price": predicted_eval.flatten()
    })

    return metrics, results

def main(ticker):
    """Run the pipeline for one ticker: download, preprocess, then train."""
    pipeline_steps = (
        download_and_clean_stock_data,
        preprocess_data,
        train_model,
    )
    for run_step in pipeline_steps:
        run_step(ticker)
    # evaluate_model() is currently not part of the pipeline.


# Define the ticker list before the pipeline runs, so any function that looks
# up the module-level `stocks` name finds it even on a fresh kernel.
stocks = ["AAPL", "MSFT", "GOOGL", "AMZN", "META", "NVDA", "TSLA"]

for ticker in stocks:
    main(ticker)



[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Ticker"] = stock
[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Ticker"] = stock
[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

Epoch 1/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 12643.4326 
Epoch 2/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 10722.3262
Epoch 3/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 10446.5098
Epoch 4/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9651.2930
Epoch 5/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9742.2588
Epoch 6/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 8992.3408
Epoch 7/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9080.9004
Epoch 8/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 8505.6045
Epoch 9/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7902.9692
Epoch 10/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Ticker"] = stock
[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Ticker"] = stock
[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

Epoch 1/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 46140.3320 
Epoch 2/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 43854.6250
Epoch 3/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 42469.1172
Epoch 4/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 42317.9414
Epoch 5/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 40241.2812
Epoch 6/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 39448.0781
Epoch 7/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 39347.2930
Epoch 8/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 37280.5273
Epoch 9/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 36369.5742
Epoch 10/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━

[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Ticker"] = stock
[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Ticker"] = stock
[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

Epoch 1/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8169.9595
Epoch 2/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6856.8442
Epoch 3/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6257.7749
Epoch 4/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5915.4692
Epoch 5/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5707.6567
Epoch 6/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5392.3569
Epoch 7/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4763.9761
Epoch 8/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4407.0298
Epoch 9/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4278.1274
Epoch 10/1000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [None]:
stocks = [
    "AAPL",
    "MSFT",
    "GOOGL",
    "AMZN",
    "META",
    "NVDA",
    "TSLA",
]

# Export the last 20 processed rows of each ticker to the backend's
# predicting_data folder.
for stock in stocks:
    predicting_data = pd.read_csv(f"processed_stock_data/{stock}_processed.csv").tail(20)
    predicting_data.to_csv(
        f"/Users/noel_personal/Repos/TIKR/my-react-app/backend/predicting_data/{stock}.csv",
        index=False,
    )