In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---- ----------------------------------- 17.3/150.0 MB 99.1 MB/s eta 0:00:02
   ---------- ---------------------------- 39.1/150.0 MB 103.5 MB/s eta 0:00:02
   --------------- ----------------------- 61.3/150.0 MB 102.8 MB/s eta 0:00:01
   --------------------- ----------------- 84.1/150.0 MB 105.3 MB/s eta 0:00:01
   -------------------------- ----------- 106.4/150.0 MB 106.2 MB/s eta 0:00:01
   ------------------------------- ------ 124.0/150.0 MB 102.9 MB/s eta 0:00:01
   ------------------------------------ - 142.1/150.0 MB 100.9 MB/s eta 0:00:01
   --------------------------------------  149.9/150.0 MB 99.8 MB/s eta 0:00:01
   --------------------------------------- 150.0/150.0 MB 89.5 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed

In [4]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 74.3 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input

In [21]:
# 1. Load and Preprocess Your S&P 500 Data (Using your existing code)
try:
    gspc_df = pd.read_csv('gspc_daily_price.csv', parse_dates=['Date'], encoding='latin1')
except UnicodeDecodeError:
    gspc_df = pd.read_csv('gspc_daily_price.csv', parse_dates=['Date'], encoding='cp1252')
vix_df = pd.read_csv('vix_daily_price.csv', parse_dates=['Date'])
sentiment_df = pd.read_csv('news_sentiment_data.csv', parse_dates=['date'])

gspc_df.rename(columns={'Close ': 'Close', 'Open ': 'Open', 'High ': 'High', 'Low ': 'Low', 'Volume ': 'Volume'}, inplace=True)
vix_df.rename(columns={'Close ': 'Close', 'Open ': 'Open', 'High ': 'High', 'Low ': 'Low'}, inplace=True)
sentiment_df.rename(columns={'date': 'Date'}, inplace=True)

gspc_df.sort_values(by='Date', inplace=True)
vix_df.sort_values(by='Date', inplace=True)
sentiment_df.sort_values(by='Date', inplace=True)

df = pd.merge(gspc_df, vix_df, on='Date', suffixes=('_sp500', '_vix'))
df = pd.merge(df, sentiment_df, on='Date', how='left')
df.fillna(0, inplace=True)

df.columns = df.columns.str.replace('\xa0', '')

  gspc_df = pd.read_csv('gspc_daily_price.csv', parse_dates=['Date'], encoding='latin1')
  vix_df = pd.read_csv('vix_daily_price.csv', parse_dates=['Date'])


In [22]:
def calculate_rsi(data, window=14):
    delta = data['Close_sp500'].diff(1)
    gain = np.where(delta > 0, delta, 0)
    loss = np.where(delta < 0, -delta, 0)
    avg_gain = pd.Series(gain).rolling(window=window).mean()
    avg_loss = pd.Series(loss).rolling(window=window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
df['RSI_14'] = calculate_rsi(df)
df['Close_sp500'] = pd.to_numeric(df['Close_sp500'], errors='coerce')
df.fillna(0, inplace=True)

In [23]:
# 2. Prepare Data for LSTM (Only Close Price)
close_prices_SP500 = df['Close_sp500'].values.reshape(-1, 1)

In [24]:
# 3. Normalize Data
scaler = MinMaxScaler(feature_range=(0, 1))
data_normalized = scaler.fit_transform(close_prices_SP500)

In [25]:
# 4. Split Data into Training and Testing Sets
train_size = int(len(data_normalized) * 0.8)
train_data = data_normalized[:train_size]
test_data = data_normalized[train_size:]

In [26]:
# 5. LSTM Model Creation and Hyperparameter Tuning
def create_lstm_model(units, activation, learning_rate):
    model = Sequential()
    model.add(Input(shape=(1, 1))) # Added Input layer
    model.add(LSTM(units=units, activation=activation)) #removed input shape from LSTM
    model.add(Dense(units=1))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

lstm_units = [50, 100, 200]
lstm_activations = ['relu', 'tanh']
learning_rates = [0.001, 0.01, 0.1]
epochs = 100
batch_size = 32

best_rmse = float('inf')
best_lstm_model = None

for units in lstm_units:
    for activation in lstm_activations:
        for learning_rate in learning_rates:
            model = create_lstm_model(units=units, activation=activation, learning_rate=learning_rate)
            model.fit(train_data[:-1].reshape(-1, 1, 1), train_data[1:], epochs=epochs, batch_size=batch_size, verbose=0)

            test_predictions = model.predict(test_data[:-1].reshape(-1, 1, 1)).flatten()
            rmse = np.sqrt(mean_squared_error(test_data[1:], test_predictions))

            if rmse < best_rmse:
                best_rmse = rmse
                best_lstm_model = model

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98

In [29]:
# 6. Predict on the Entire Dataset with the Best LSTM Model
all_lstm_predictions_scaled = best_lstm_model.predict(data_normalized[:-1].reshape(-1, 1, 1)).flatten()

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [30]:
# 7. Inverse Transform LSTM Predictions
all_lstm_predictions = scaler.inverse_transform(all_lstm_predictions_scaled.reshape(-1,1)).flatten()

In [31]:
# 8. Support Vector Machines (SVM) Model
svm_model = SVR()

svm_params = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1]
}

svm_grid_search = GridSearchCV(svm_model, svm_params, scoring='neg_mean_squared_error')
svm_grid_search.fit(np.arange(len(close_prices_SP500)).reshape(-1, 1), close_prices_SP500.ravel()) #use ravel here
svm_best_model = svm_grid_search.best_estimator_
svm_predictions = svm_best_model.predict(np.arange(len(close_prices_SP500)).reshape(-1, 1))

In [32]:
# 9. Random Forest Model
rf_model = RandomForestRegressor()

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10]
}

rf_grid_search = GridSearchCV(rf_model, rf_params, scoring='neg_mean_squared_error')
rf_grid_search.fit(np.arange(len(close_prices_SP500)).reshape(-1, 1), close_prices_SP500.ravel()) #use ravel here
rf_best_model = rf_grid_search.best_estimator_
rf_predictions = rf_best_model.predict(np.arange(len(close_prices_SP500)).reshape(-1, 1))

In [33]:
# 10. Gradient Boosting Methods (XGBoost)
xgb_model = XGBRegressor()

xgb_params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

xgb_grid_search = GridSearchCV(xgb_model, xgb_params, scoring='neg_mean_squared_error')
xgb_grid_search.fit(np.arange(len(close_prices_SP500)).reshape(-1, 1), close_prices_SP500.ravel()) #use ravel here
xgb_best_model = xgb_grid_search.best_estimator_
xgb_predictions = xgb_best_model.predict(np.arange(len(close_prices_SP500)).reshape(-1, 1))

In [34]:
# 11. Gradient Boosting Methods (LightGBM)
lgbm_model = LGBMRegressor()

lgbm_params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_params, scoring='neg_mean_squared_error')
lgbm_grid_search.fit(np.arange(len(close_prices_SP500)).reshape(-1, 1), close_prices_SP500.ravel()) #use ravel here
lgbm_best_model = lgbm_grid_search.best_estimator_
lgbm_predictions = lgbm_best_model.predict(np.arange(len(close_prices_SP500)).reshape(-1, 1))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 1
[LightGBM] [Info] Start training from score 5742.132693
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 1
[LightGBM] [Info] Start training from score 5675.035750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 1
[LightGBM] [Info] Start training fro



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 1
[LightGBM] [Info] Start training from score 5551.615601
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 1
[LightGBM] [Info] Start training from score 5742.132693
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 1
[LightGBM] [Info] Start training fro



In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load and preprocess your data (using your existing code)
try:
    gspc_df = pd.read_csv('gspc_daily_price.csv', parse_dates=['Date'], encoding='latin1')
except UnicodeDecodeError:
    gspc_df = pd.read_csv('gspc_daily_price.csv', parse_dates=['Date'], encoding='cp1252')
vix_df = pd.read_csv('vix_daily_price.csv', parse_dates=['Date'])
sentiment_df = pd.read_csv('news_sentiment_data.csv', parse_dates=['date'])
gspc_df.rename(columns={'Close ': 'Close', 'Open ': 'Open', 'High ': 'High', 'Low ': 'Low', 'Volume ': 'Volume'}, inplace=True)
vix_df.rename(columns={'Close ': 'Close', 'Open ': 'Open', 'High ': 'High', 'Low ': 'Low'}, inplace=True)
sentiment_df.rename(columns={'date': 'Date'}, inplace=True)
gspc_df.sort_values(by='Date', inplace=True)
vix_df.sort_values(by='Date', inplace=True)
sentiment_df.sort_values(by='Date', inplace=True)
df = pd.merge(gspc_df, vix_df, on='Date', suffixes=('_sp500', '_vix'))
df = pd.merge(df, sentiment_df, on='Date', how='left')
df.fillna(0, inplace=True)
df.columns = df.columns.str.replace('\xa0', '')
def calculate_rsi(data, window=14):
    delta = data['Close_sp500'].diff(1)
    gain = np.where(delta > 0, delta, 0)
    loss = np.where(delta < 0, -delta, 0)
    avg_gain = pd.Series(gain).rolling(window=window).mean()
    avg_loss = pd.Series(loss).rolling(window=window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
df['RSI_14'] = calculate_rsi(df)
df['Close_sp500'] = pd.to_numeric(df['Close_sp500'], errors='coerce')
df.fillna(0, inplace=True)

# Prepare data for all models
merged_data = df.copy()
merged_data.rename(columns={'Close_sp500': 'Close_SP500', 'Open_sp500': 'Open_SP500', 'High_sp500': 'High_SP500', 'Low_sp500': 'Low_SP500', 'Volume': 'Volume_SP500', 'RSI_14': 'RSI', 'Close_vix': 'Close_VIX', 'News Sentiment': 'News Sentiment'}, inplace=True)

# Feature Selection
features = ['Close_SP500', 'Open_SP500', 'High_SP500', 'Low_SP500', 'Volume_SP500', 'RSI', 'Close_VIX', 'News Sentiment']
target = 'Close_SP500'

# Normalize data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(merged_data[features])
scaled_target = merged_data[target].values.reshape(-1, 1) # target needs to be 2D for inverse transform
scaled_target = scaler.fit_transform(scaled_target)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_data, scaled_target, test_size=0.2, shuffle=False)

# LSTM Model
look_back = 10
def create_lstm_dataset(X, y, look_back):
    Xs, ys = [], []
    for i in range(len(X) - look_back):
        v = X[i:(i + look_back)]
        Xs.append(v)
        ys.append(y[i + look_back])
    return np.array(Xs), np.array(ys)

X_train_lstm, y_train_lstm = create_lstm_dataset(X_train, y_train, look_back)
X_test_lstm, y_test_lstm = create_lstm_dataset(X_test, y_test, look_back)

model_lstm = Sequential([
    LSTM(100, return_sequences=True, input_shape=(look_back, X_train.shape[1])),
    Dropout(0.2),
    LSTM(100, return_sequences=False),
    Dropout(0.2),
    Dense(50, activation='relu'),
    Dense(1)
])
model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_lstm.fit(X_train_lstm, y_train_lstm, epochs=100, batch_size=32, validation_data=(X_test_lstm, y_test_lstm), callbacks=[early_stopping], verbose=0)

y_pred_lstm = model_lstm.predict(X_test_lstm, verbose=0)
y_pred_lstm_inv = scaler.inverse_transform(y_pred_lstm)
y_test_lstm_inv = scaler.inverse_transform(y_test_lstm)

# Other Models (SVR, RF, XGBoost, LightGBM)
models = {
    'SVR': (SVR(), {'C': [0.1, 1, 10], 'gamma': ['scale', 0.1, 1]}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [100, 200], 'max_depth': [None, 10]}),
    'XGBoost': (XGBRegressor(), {'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}),
    'LightGBM': (LGBMRegressor(), {'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]})
}

tscv = TimeSeriesSplit(n_splits=5) #time series cross validation

for name, (model, params) in models.items():
    grid_search = GridSearchCV(model, params, cv=tscv, scoring='neg_mean_squared_error', verbose=0)
    grid_search.fit(X_train, y_train.flatten())
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_pred_inv = scaler.inverse_transform(y_pred.reshape(-1, 1))
    y_test_inv = scaler.inverse_transform(y_test)
    print(f"{name} Best Params: {grid_search.best_params_}")
    print(f"{name} MAE: {mean_absolute_error(y_test_inv, y_pred_inv):.2f}")
    print(f"{name} MSE: {mean_squared_error(y_test_inv, y_pred_inv):.2f}")
    print(f"{name} R2: {r2_score(y_test_inv, y_pred_inv):.2f}")

# LSTM Evaluation
print(f"LSTM MAE: {mean_absolute_error(y_test_lstm_inv, y_pred_lstm_inv):.2f}")
print(f"LSTM MSE: {mean_squared_error(y_test_lstm_inv, y_pred_lstm_inv):.2f}")
print(f"LSTM R2: {r2_score(y_test_lstm_inv, y_pred_lstm_inv):.2f}")

  gspc_df = pd.read_csv('gspc_daily_price.csv', parse_dates=['Date'], encoding='latin1')
  vix_df = pd.read_csv('vix_daily_price.csv', parse_dates=['Date'])
  super().__init__(**kwargs)


SVR Best Params: {'C': 10, 'gamma': 0.1}
SVR MAE: 89.75
SVR MSE: 9539.76
SVR R2: 0.63
Random Forest Best Params: {'max_depth': 10, 'n_estimators': 200}
Random Forest MAE: 14.35
Random Forest MSE: 392.61
Random Forest R2: 0.98
XGBoost Best Params: {'learning_rate': 0.1, 'max_depth': 5}
XGBoost MAE: 9.27
XGBoost MSE: 208.28
XGBoost R2: 0.99
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 0
[LightGBM] [Info] Start training from score 0.144828
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 177
[LightGBM] [Info] Number of data points in the train set: 68, number of used features: 8
[LightGBM] [Info] Start training from score 0.222767
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove t



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 497
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 8
[LightGBM] [Info] Start training from score 0.496538
LightGBM Best Params: {'learning_rate': 0.1, 'max_depth': 3}
LightGBM MAE: 29.21
LightGBM MSE: 1797.61
LightGBM R2: 0.93
LSTM MAE: 116.14
LSTM MSE: 17610.55
LSTM R2: 0.43




###### 

In [None]:
import pandas as pd
# Load the stock data
file_path = r'C:\Users\numan.yaqoob\Desktop\UP\finance ML Articles\AAPL_short_volume.csv'
data = pd.read_csv(file_path)
close_prices_AAPL = data['Close']

# Reverse the order of the data
close_prices_AAPL_reverse = close_prices_AAPL.iloc[::-1]

# Reset index to maintain the correct time series order in the plot
close_prices_AAPL_reverse.reset_index(drop=True, inplace=True)

# Plot the line chart
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(close_prices_AAPL_reverse)
plt.xlabel('Time')
plt.ylabel('Close Prices')
plt.title('AAPL Stock Close Prices')
plt.grid(True)
plt.show()

# Data preprocessing
data = close_prices_AAPL_reverse.values.reshape(-1, 1)  # Reshape the data
data_normalized = data / np.max(data)  # Normalize the data

# Split the data into training and testing sets
train_size = int(len(data_normalized) * 0.8)
train_data = data_normalized[:train_size]
test_data = data_normalized[train_size:]

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Function to create LSTM model
def create_lstm_model(units, activation, learning_rate):
    model = Sequential()
    model.add(LSTM(units=units, activation=activation, input_shape=(1, 1)))
    model.add(Dense(units=1))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Define hyperparameters for tuning
lstm_units = [50, 100, 200]
lstm_activations = ['relu', 'tanh']
learning_rates = [0.001, 0.01, 0.1]
epochs = 100
batch_size = 32

# Perform hyperparameter tuning for LSTM model
best_rmse = float('inf')
best_lstm_model = None

from sklearn.metrics import mean_squared_error

for units in lstm_units:
    for activation in lstm_activations:
        for learning_rate in learning_rates:
            # Create and train LSTM model
            model = create_lstm_model(units=units, activation=activation, learning_rate=learning_rate)
            model.fit(train_data[:-1].reshape(-1, 1, 1), train_data[1:], epochs=epochs, batch_size=batch_size, verbose=0)

            # Predict on test data
            test_predictions = model.predict(test_data[:-1].reshape(-1, 1, 1)).flatten()

            # Calculate RMSE
            rmse = np.sqrt(mean_squared_error(test_data[1:], test_predictions))

            # Check if current model has lower RMSE
            if rmse < best_rmse:
                best_rmse = rmse
                best_lstm_model = model

# Predict on the entire dataset using the best LSTM model
all_lstm_predictions = best_lstm_model.predict(data_normalized[:-1].reshape(-1, 1, 1)).flatten()

# Inverse normalize the LSTM predictions
all_lstm_predictions = all_lstm_predictions * np.max(data)

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Support Vector Machines (SVM) Model
svm_model = SVR()

svm_params = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1]
}

svm_grid_search = GridSearchCV(svm_model, svm_params, scoring='neg_mean_squared_error')
svm_grid_search.fit(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1), close_prices_AAPL_reverse)
svm_best_model = svm_grid_search.best_estimator_
svm_predictions = svm_best_model.predict(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1))

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Random Forest Model
rf_model = RandomForestRegressor()

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10]
}

rf_grid_search = GridSearchCV(rf_model, rf_params, scoring='neg_mean_squared_error')
rf_grid_search.fit(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1), close_prices_AAPL_reverse)
rf_best_model = rf_grid_search.best_estimator_
rf_predictions = rf_best_model.predict(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1))

# Gradient Boosting Methods (XGBoost)
xgb_model = XGBRegressor()

xgb_params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

xgb_grid_search = GridSearchCV(xgb_model, xgb_params, scoring='neg_mean_squared_error')
xgb_grid_search.fit(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1), close_prices_AAPL_reverse)
xgb_best_model = xgb_grid_search.best_estimator_
xgb_predictions = xgb_best_model.predict(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1))

# Gradient Boosting Methods (LightGBM)
lgbm_model = LGBMRegressor()

lgbm_params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_params, scoring='neg_mean_squared_error')
lgbm_grid_search.fit(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1), close_prices_AAPL_reverse)
lgbm_best_model = lgbm_grid_search.best_estimator_
lgbm_predictions = lgbm_best_model.predict(np.arange(len(close_prices_AAPL_reverse)).reshape(-1, 1))

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# List of model names and predictions
model_names = ['Support Vector Machines (SVM)', 'Random Forest', 'XGBoost', 'LightGBM']
predictions = [svm_predictions, rf_predictions, xgb_predictions, lgbm_predictions]
best_models = [svm_best_model, rf_best_model, xgb_best_model, lgbm_best_model]

# Truncate actual values to match the length of predictions
actual_values_reverse = close_prices_AAPL_reverse[-len(svm_predictions):]

# Evaluate models and plot graphs
for i, model_name in enumerate(model_names):
    model_prediction = predictions[i]
    model_prediction_truncated = model_prediction[-len(actual_values_reverse):]  # Truncate predicted values
    model_rmse = rmse(actual_values_reverse, model_prediction_truncated)

    # Plotting actual and predicted values
    plt.figure(figsize=(10, 6))
    plt.plot(actual_values_reverse, label='Actual')
    plt.plot(model_prediction_truncated, label='Predicted')
    plt.title(f"{model_name} - RMSE: {model_rmse:.2f}")
    plt.xlabel('Time')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.show()

    # Print the best hyperparameters for the model
    best_model = best_models[i]
    print(f"Best Hyperparameters for {model_name}:")
    print(best_model)
    print("-----------------------------")

# Plotting LSTM predictions
plt.figure(figsize=(10, 6))
plt.plot(actual_values_reverse, label='Actual')
plt.plot(all_lstm_predictions, label='LSTM Predicted')
plt.title(f"LSTM Model - RMSE: {best_rmse:.2f}")
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Load the stock data
file_path = r'C:\Users\numan.yaqoob\Desktop\UP\finance ML Articles\AAPL_short_volume.csv'
data = pd.read_csv(file_path)
close_prices_AAPL = data['Close']

# Reverse the order of the data
close_prices_AAPL_reverse = close_prices_AAPL.iloc[::-1]

# Reset index to maintain the correct time series order in the plot
close_prices_AAPL_reverse.reset_index(drop=True, inplace=True)

# Data preprocessing
data = close_prices_AAPL_reverse.values.reshape(-1, 1)  # Reshape the data
data_normalized = data / np.max(data)  # Normalize the data

# Split the data into training and testing sets
train_size = int(len(data_normalized) * 0.8)
train_data = data_normalized[:train_size]
test_data = data_normalized[train_size:]

# Function to create LSTM model
def create_lstm_model(units, activation, learning_rate):
    model = Sequential()
    model.add(LSTM(units=units, activation=activation, input_shape=(1, 1)))
    model.add(Dense(units=1))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Define hyperparameters for tuning
lstm_units = [50, 100, 200]
lstm_activations = ['relu', 'tanh']
learning_rates = [0.001, 0.01, 0.1]
epochs = 100
batch_size = 32

# Perform hyperparameter tuning for LSTM model
best_rmse = float('inf')
best_lstm_model = None

for units in lstm_units:
    for activation in lstm_activations:
        for learning_rate in learning_rates:
            # Create and train LSTM model
            model = create_lstm_model(units=units, activation=activation, learning_rate=learning_rate)
            model.fit(train_data[:-1].reshape(-1, 1, 1), train_data[1:], epochs=epochs, batch_size=batch_size, verbose=0)

            # Predict on test data
            test_predictions = model.predict(test_data[:-1].reshape(-1, 1, 1)).flatten()

            # Calculate RMSE
            rmse = np.sqrt(mean_squared_error(test_data[1:], test_predictions))

            # Check if current model has lower RMSE
            if rmse < best_rmse:
                best_rmse = rmse
                best_lstm_model = model

# Predict on the entire dataset using the best LSTM model
all_lstm_predictions = best_lstm_model.predict(data_normalized[:-1].reshape(-1, 1, 1)).flatten()

# Inverse normalize the LSTM predictions
all_lstm_predictions = all_lstm_predictions * np.max(data)

# Calculate the scaling factor based on the maximum value of the original data
scaling_factor = np.max(close_prices_AAPL_reverse)

# Function to predict future stock prices using the LSTM model
def predict_future_lstm(model, data, num_predictions, scaling_factor):
    predictions = []

    # Get the last data point from the input data
    last_data_point = data[-1]

    for _ in range(num_predictions):
        # Predict the next time step
        prediction = model.predict(last_data_point.reshape(1, 1, 1))
        predictions.append(prediction[0, 0])

        # Update last_data_point to include the predicted value for the next iteration
        last_data_point = np.append(last_data_point[1:], prediction)

    # Inverse normalize the predictions
    predictions = np.array(predictions) * scaling_factor

    return predictions

# Predict the next 10 days using the LSTM model
num_predictions = 10
lstm_predictions = predict_future_lstm(best_lstm_model, data_normalized, num_predictions, scaling_factor)

# Plot the LSTM predictions for the next 10 days
plt.figure(figsize=(10, 6))
plt.plot(close_prices_AAPL_reverse, label='Actual')
plt.plot(np.arange(len(close_prices_AAPL_reverse), len(close_prices_AAPL_reverse) + num_predictions), lstm_predictions, label='LSTM Predicted')
plt.title(f"LSTM Model - RMSE: {best_rmse:.2f}")
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()

# Print the predicted stock prices for the next 10 days using LSTM
print("Predicted stock prices for the next 10 days:")
for i, prediction in enumerate(lstm_predictions, start=1):
    print(f"Day {i}: {prediction:.2f}")
