In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

# Load data
def load_data(file_name):
    df = pd.read_csv(file_name)
    print("Loaded data:")
    print(df.head())
    return df

# Preprocess data
def preprocess_data(df, future_days):
    # Create a new column (the target or dependent variable) shifted 'future_days' units up
    df['Prediction'] = df[['close']].shift(-future_days)
    df = df[:-future_days]
    print("Data after shift operation:")
    print(df.head())
    return df

# Split data into train and test datasets
# Split data into train and test datasets
def split_data(df, future_days, test_size):
    # Create the independent data set (X)
    # Here, we will convert the dataframe to a numpy array and drop the prediction column
    X = np.array(df.drop(['Prediction', 'date'], 1))[:-future_days]

    # Create a new data frame with the scaled values
    data_scaled = df.copy()
    data_scaled = data_scaled.drop(['date'], 1)

    # Scale the data (values will be between 0 and 1) excluding the 'date' column
    scaler_X = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler_X.fit_transform(data_scaled), columns=data_scaled.columns, index=data_scaled.index)

    # Create the dependent data set (y)
    # Convert the dataframe to a numpy array
    y = np.array(data_scaled['Prediction'])

    scaler_y = MinMaxScaler()
    y = scaler_y.fit_transform(y.reshape(-1,1))

    # Split the data into 80% training and 20% testing
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)

    # Reshape the data to 3-Dimensional for LSTM
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

    return x_train, x_test, y_train, y_test, scaler_X, scaler_y
# Create the model
def create_model(x_train, lstm_units, dropout):
    model = Sequential()

    model.add(LSTM(units=lstm_units, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(Dropout(dropout))

    model.add(LSTM(units=lstm_units, return_sequences=False))
    model.add(Dropout(dropout))

    model.add(Dense(25))
    model.add(Dense(1))

    return model

# Train the model
def train_model(df, future_days, test_size, lstm_units, dropout, epochs, batch_size):
    df = preprocess_data(df, future_days)
    X_train, X_test, y_train, y_test, scaler_X, scaler_y = split_data(df, future_days, test_size)
    model = create_model(X_train, lstm_units, dropout)

    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

    return model, X_train, X_test, y_train, y_test, scaler_X, scaler_y

# Plot predictions
def plot_predictions(df, model, X_test, y_test, future_days, scaler_X, scaler_y):
    # Predictions for the earliest 'X_test' (the test dataset)
    y_pred_test = model.predict(X_test)
    y_pred_test_transformed = scaler_y.inverse_transform(y_pred_test)
    y_test_transformed = scaler_y.inverse_transform(y_test)

    # Predictions for the future
    X_future = df.drop(['Prediction', 'date'], axis=1)[:-future_days]
    X_future_scaled = scaler_X.transform(X_future)
    X_future_scaled = np.reshape(X_future_scaled, (X_future_scaled.shape[0], X_future_scaled.shape[1], 1))
    y_pred_future = model.predict(X_future_scaled)
    y_pred_future_transformed = scaler_y.inverse_transform(y_pred_future)

    # Getting the corresponding dates from the original dataframe
    dates = pd.to_datetime(df['date'])
    dates_test = dates[-len(y_test_transformed):]
    dates_future = dates[-len(y_pred_future_transformed):]

    # Plot
    plt.figure(figsize=(10,6))
    plt.plot(dates_test, y_test_transformed, color='blue', label='Real')
    plt.plot(dates_test, y_pred_test_transformed, color='red', label='Prediction')
    plt.plot(dates_future, y_pred_future_transformed, color='green', label='Forecast')
    plt.title('Stock Price Prediction')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.show()



if __name__ == "__main__":
    # Set your file path
    file_name = 'yhoofinance-daily-historical-data/TSLA_daily_data.csv'

    # Load data from S3
    df = load_data_from_s3(file_name)
    
    # Set your parameters
    future_days = 30
    test_size = 0.2
    lstm_units = 50
    dropout = 0.2
    epochs = 20
    batch_size = 32

    # Train the model
    model, X_train, X_test, y_train, y_test, scaler_X, scaler_y = train_model(df, future_days, test_size, lstm_units, dropout, epochs, batch_size)
    # Plot predictions
    plot_predictions(df, model, X_test, y_test, scaler_X, scaler_y)

Loaded data:
         date       open       high        low      close  adj_close    volume
0  2015-01-02  14.858000  14.883333  14.217333  14.620667  14.620667  71466000
1  2015-01-05  14.303333  14.433333  13.810667  14.006000  14.006000  80527500
2  2015-01-06  14.004000  14.280000  13.614000  14.085333  14.085333  93928500
3  2015-01-07  14.223333  14.318667  13.985333  14.063333  14.063333  44526000
4  2015-01-08  14.187333  14.253333  14.000667  14.041333  14.041333  51637500
Data after shift operation:
         date       open       high        low      close  adj_close  \
0  2015-01-02  14.858000  14.883333  14.217333  14.620667  14.620667   
1  2015-01-05  14.303333  14.433333  13.810667  14.006000  14.006000   
2  2015-01-06  14.004000  14.280000  13.614000  14.085333  14.085333   
3  2015-01-07  14.223333  14.318667  13.985333  14.063333  14.063333   
4  2015-01-08  14.187333  14.253333  14.000667  14.041333  14.041333   

     volume  Prediction  
0  71466000   13.623333  


  X = np.array(df.drop(['Prediction', 'date'], 1))[:-future_days]
  data_scaled = data_scaled.drop(['date'], 1)


ValueError: Found input variables with inconsistent numbers of samples: [2093, 2123]