In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
from sklearn.metrics import mean_squared_error
from transformers import TFBertForSequenceClassification, BertTokenizerFast
from gensim.models import Word2Vec

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Function to calculate mean absolute percentage error
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
 # Read data from CSV file
df = pd.read_csv(r"C:\Users\ASatl\OneDrive\Desktop\let\AAPL.csv")
df = df[-200:]
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
5846,2023-03-29,159.369995,161.050003,159.350006,160.770004,160.120499,51305700
5847,2023-03-30,161.529999,162.470001,161.270004,162.360001,161.704056,49501700
5848,2023-03-31,162.440002,165.000000,161.910004,164.899994,164.233795,68749800
5849,2023-04-03,164.270004,166.289993,164.220001,166.169998,165.498672,56976200
5850,2023-04-04,166.600006,166.839996,165.110001,165.630005,164.960846,46278300
...,...,...,...,...,...,...,...
6041,2024-01-08,182.089996,185.600006,181.500000,185.559998,185.559998,59144500
6042,2024-01-09,183.919998,185.149994,182.729996,185.139999,185.139999,42841800
6043,2024-01-10,184.350006,186.399994,183.919998,186.190002,186.190002,46792900
6044,2024-01-11,186.539993,187.050003,183.619995,185.589996,185.589996,49128400


In [4]:
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])
df['Date']

5846   2023-03-29
5847   2023-03-30
5848   2023-03-31
5849   2023-04-03
5850   2023-04-04
          ...    
6041   2024-01-08
6042   2024-01-09
6043   2024-01-10
6044   2024-01-11
6045   2024-01-12
Name: Date, Length: 200, dtype: datetime64[ns]

In [5]:
# Extracting the target variable 'Close' (dependent variable) from the DataFrame
y = df['Close']

# Extracting the feature 'Open' (independent variable) from the DataFrame
X = df['Open']

In [6]:
# Reshape the data as MLPRegressor expects a 2D array
X = X.values.reshape(-1, 1)
y = y.values

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create and train MLPRegressor model
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train) #shape (160, 1)

In [9]:
# Forecast for the next day (t+1)
next_day_forecast = mlp_model.predict(X_test[-1].reshape(1, -1))
next_day_forecast

array([193.80098596])

In [10]:
# Forecast for the 7th day (t+7)
#Two model usage to avoid feature error message
seventh_day_forecast = mlp_model.predict(X_test[-1].reshape(1, -1))
values =[]
for i in range(6):
    seventh_day_forecast = mlp_model.predict(seventh_day_forecast.reshape(-1, 1))
    values.append(seventh_day_forecast)
    
print(values[-1])

[198.04694065]


In [11]:
df_Tweets = pd.read_csv(r"C:\Users\ASatl\OneDrive\Desktop\let\tweets.csv")
df_Tweets

Unnamed: 0,No.,Message
0,1.0,Excited to see what's next for $AAPL in the te...
1,2.0,$AAPL's stock performance reflects its solid f...
2,3.0,Wondering how $AAPL will capitalize on the lat...
3,4.0,$AAPL's innovation engine never ceases to amaze.
4,5.0,Anticipating the impact of $AAPL's upcoming pr...
...,...,...
201,,
202,,
203,,
204,,


In [12]:
# Remove rows with missing values in the 'Message' column
df_Tweets = df_Tweets.dropna(subset=['Message'])

In [13]:
# Sentiment analysis using Word2Vec
tweets_texts = df_Tweets['Message'].tolist()
tweets_tokens = [tweet_text.split() for tweet_text in tweets_texts]
Word2Vec_model = Word2Vec(tweets_tokens, vector_size=100, window=5, min_count=1, workers=4)

In [14]:
# Load pre-trained FinBERT model and tokenizer
finbert_model = TFBertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
finbert_tokenizer = BertTokenizerFast.from_pretrained('yiyanghkust/finbert-tone')




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at yiyanghkust/finbert-tone.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [15]:
# Tokenize tweet texts using FinBERT tokenizer
tokenized_tweets = finbert_tokenizer(tweets_texts, padding=True, truncation=True, return_tensors="tf")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
# Obtain predictions from FinBERT model
finbert_predictions = finbert_model.predict(tokenized_tweets)




In [17]:
# Extract sentiment scores from predictions
positive_scores = finbert_predictions[0][:, 0]  # Positive sentiment score
negative_scores = finbert_predictions[0][:, 1]  # Negative sentiment score
neutral_scores = finbert_predictions[0][:, 2]   # Neutral sentiment score

In [18]:
# Concatenate sentiment scores as additional features
additional_features = np.column_stack((positive_scores, negative_scores, neutral_scores))

In [19]:
# Concatenate additional features with existing features (X_train, X_test)
X_train_with_features = np.concatenate((X_train, additional_features[:len(X_train)]), axis=1)
X_test_with_features = np.concatenate((X_test, additional_features[len(X_train):]), axis=1)

In [20]:
mlp_model_with_features = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model_with_features.fit(X_train_with_features, y_train)

In [21]:
# Predictions with additional features
y_pred_test_with_features = mlp_model_with_features.predict(X_test_with_features)

In [22]:
# Predictions
y_pred_train = mlp_model.predict(X_train)
y_pred_test = mlp_model.predict(X_test)


In [23]:
# Calculate evaluation metrics for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
mae_test = np.mean(np.abs(y_test - y_pred_test))

In [24]:
# Print metrics for test set
print("Test Set Metrics:")
print(f'Root Mean Squared Error (RMSE): {rmse_test}')
print(f'Mean Absolute Percentage Error (MAPE): {mape_test}')
print(f'Mean Absolute Error (MAE): {mae_test}')

Test Set Metrics:
Root Mean Squared Error (RMSE): 1.8737916197324334
Mean Absolute Percentage Error (MAPE): 0.873180786287105
Mean Absolute Error (MAE): 1.585010003028995


In [25]:
#Calculate evaluation metrics for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_with_features))
mape_test = mean_absolute_percentage_error(y_test, y_pred_test_with_features)
mae_test = np.mean(np.abs(y_test - y_pred_test_with_features))

In [26]:
# Print metrics for test set with feature calculated accuracy
print("Test Set Metrics Feature model:")
print(f'Root Mean Squared Error (RMSE): {rmse_test}')
print(f'Mean Absolute Percentage Error (MAPE): {mape_test}')
print(f'Mean Absolute Error (MAE): {mae_test}')

Test Set Metrics Feature model:
Root Mean Squared Error (RMSE): 1.676658304051121
Mean Absolute Percentage Error (MAPE): 0.7468899944422069
Mean Absolute Error (MAE): 1.3526617865327253


In [27]:
# Forecast for the next day (t+1)
next_day_forecast = mlp_model_with_features.predict(X_test_with_features[-1].reshape(1, -1))
next_day_forecast

array([192.94264142])

In [28]:
# Forecast for the 7th day (t+7)
#Two model usage to avoid feature error message
seventh_day_forecast = mlp_model_with_features.predict(X_test_with_features[-1].reshape(1, -1))
values =[]
for i in range(6):
    seventh_day_forecast = mlp_model.predict(seventh_day_forecast.reshape(-1, 1))
    values.append(seventh_day_forecast)
    
print(values[-1])

[197.174544]
