In [2]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizerFast
from gensim.models import Word2Vec

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


In [4]:
df_stock = pd.read_csv(r"C:/Users/Asatl/onedrive/desktop/let/AAPL.csv")
df_stock = df_stock[-200:]
df_stock

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
5846,2023-03-29,159.369995,161.050003,159.350006,160.770004,160.120499,51305700
5847,2023-03-30,161.529999,162.470001,161.270004,162.360001,161.704056,49501700
5848,2023-03-31,162.440002,165.000000,161.910004,164.899994,164.233795,68749800
5849,2023-04-03,164.270004,166.289993,164.220001,166.169998,165.498672,56976200
5850,2023-04-04,166.600006,166.839996,165.110001,165.630005,164.960846,46278300
...,...,...,...,...,...,...,...
6041,2024-01-08,182.089996,185.600006,181.500000,185.559998,185.559998,59144500
6042,2024-01-09,183.919998,185.149994,182.729996,185.139999,185.139999,42841800
6043,2024-01-10,184.350006,186.399994,183.919998,186.190002,186.190002,46792900
6044,2024-01-11,186.539993,187.050003,183.619995,185.589996,185.589996,49128400


In [5]:
# Convert the 'Date' column to datetime format
df_stock['Date'] = pd.to_datetime(df_stock['Date'])


In [6]:
# Extracting the target variable 'Close' (dependent variable) from the DataFrame
y = df_stock['Close']

# Extracting the feature 'Open' (independent variable) from the DataFrame
X = df_stock['Open']

In [7]:
# Reshape the data as MLPRegressor expects a 2D array
X = X.values.reshape(-1, 1)
y = y.values

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
arima_model = ARIMA(y_train ,order=(5,1,0))
arima_model_fit = arima_model.fit()

In [10]:
# Forecasting
forecast_steps = len(y_test)  # Forecast for the length of the test set
forecast = arima_model_fit.forecast(steps=forecast_steps)

In [11]:
# Forecast T+1
forecast_steps = 1
forecast_T1 = arima_model_fit.forecast(steps=forecast_steps)
print("Forecasts for the T+1:")
print(forecast_T1)

Forecasts for the T+1:
[181.8143647]


In [12]:
# Forecast T+7
forecast_steps = 7
forecast_T1 = arima_model_fit.forecast(steps=forecast_steps)
print("Forecasts for the T+7:")
print(forecast_T1)

Forecasts for the T+7:
[181.8143647  180.01493013 182.41845257 183.23032458 182.07883239
 181.42262658 181.77849107]


In [13]:
# Calculate evaluation metrics for test set
rmse_test = np.sqrt(mean_squared_error(y_test, forecast))
mape_test = mean_absolute_percentage_error(y_test, forecast)
mae_test = np.mean(np.abs(y_test - forecast))

# Print metrics for test set
print("Test Set Metrics:")
print(f'Root Mean Squared Error (RMSE): {rmse_test}')
print(f'Mean Absolute Percentage Error (MAPE): {mape_test}')
print(f'Mean Absolute Error (MAE): {mae_test}')

Test Set Metrics:
Root Mean Squared Error (RMSE): 9.984752410044385
Mean Absolute Percentage Error (MAPE): 4.8608461227649045
Mean Absolute Error (MAE): 8.795999586135926


In [14]:
df_Tweets = pd.read_csv(r"C:\Users\ASatl\OneDrive\Desktop\let\tweets.csv")
df_Tweets

Unnamed: 0,No.,Message
0,1.0,Excited to see what's next for $AAPL in the te...
1,2.0,$AAPL's stock performance reflects its solid f...
2,3.0,Wondering how $AAPL will capitalize on the lat...
3,4.0,$AAPL's innovation engine never ceases to amaze.
4,5.0,Anticipating the impact of $AAPL's upcoming pr...
...,...,...
201,,
202,,
203,,
204,,


In [15]:
# Remove rows with missing values in the 'Message' column
df_Tweets = df_Tweets.dropna(subset=['Message'])

In [16]:
# Sentiment analysis using Word2Vec
tweets_texts = df_Tweets['Message'].tolist()
tweets_tokens = [tweet_text.split() for tweet_text in tweets_texts]
Word2Vec_model = Word2Vec(tweets_tokens, vector_size=100, window=5, min_count=1, workers=4)

In [17]:
# Load pre-trained FinBERT model and tokenizer
finbert_model = TFBertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
finbert_tokenizer = BertTokenizerFast.from_pretrained('yiyanghkust/finbert-tone')




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at yiyanghkust/finbert-tone.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [18]:
# Tokenize tweet texts using FinBERT tokenizer
tokenized_tweets = finbert_tokenizer(tweets_texts, padding=True, truncation=True, return_tensors="tf")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [19]:
# Obtain predictions from FinBERT model
finbert_predictions = finbert_model.predict(tokenized_tweets)




In [20]:
# Extract sentiment scores from predictions
positive_scores = finbert_predictions[0][:, 0]  # Positive sentiment score
negative_scores = finbert_predictions[0][:, 1]  # Negative sentiment score
neutral_scores = finbert_predictions[0][:, 2]   # Neutral sentiment score

In [21]:
# Concatenate sentiment scores as additional features
additional_features = np.column_stack((positive_scores, negative_scores, neutral_scores))

In [22]:
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [23]:
# Concatenate additional features with existing features (y_train, y_test)
y_train_with_features = np.concatenate((y_train, additional_features[:len(y_train)]), axis=1)
y_test_with_features = np.concatenate((y_test, additional_features[len(y_train):]), axis=1)

In [32]:
y_train_with_features[:,0]

array([192.75    , 186.190002, 171.559998, 167.449997, 174.789993,
       185.559998, 173.559998, 163.759995, 173.      , 192.580002,
       176.300003, 193.419998, 163.759995, 170.690002, 175.009995,
       190.690002, 175.839996, 174.490005, 189.429993, 176.570007,
       189.369995, 174.      , 172.570007, 165.229996, 175.050003,
       179.070007, 177.300003, 193.580002, 179.800003, 180.710007,
       183.789993, 171.210007, 189.690002, 175.429993, 177.970001,
       179.360001, 173.570007, 177.149994, 177.229996, 177.559998,
       164.899994, 193.130005, 179.580002, 193.050003, 185.639999,
       195.610001, 189.710007, 175.160004, 192.529999, 188.059998,
       169.589996, 175.460007, 181.119995, 172.070007, 165.210007,
       181.179993, 187.440002, 164.660004, 173.5     , 175.490005,
       165.630005, 172.070007, 172.880005, 171.100006, 189.699997,
       173.440002, 165.559998, 189.25    , 178.179993, 166.889999,
       189.970001, 160.770004, 185.589996, 179.229996, 188.080

In [39]:
# Select only the first column (y_train) from y_train_with_features
endog_variable = y_train_with_features
# Create the ARIMA model using the univariate endogenous variable
arima_model_feature = ARIMA(endog_variable ,order=(5,1,0))
# Fit the ARIMA model
arima_model_fit_feature = arima_model_feature.fit()

ValueError: SARIMAX models require univariate `endog`. Got shape (160, 4).

In [40]:
# Forecasting
forecast_steps = len(y_test_with_features)  # Forecast for the length of the test set
forecast_feature = arima_model_fit_feature.forecast(steps=forecast_steps)

In [41]:
forecast_feature

array([ -162.95449705,  -110.05516624,  -200.932584  ,  -288.99702457,
        -261.0201476 ,  -378.47416069,  -409.42046932,  -431.77540424,
        -531.79127994,  -542.4946542 ,  -604.02317124,  -669.96075434,
        -691.59976754,  -765.6076946 ,  -806.25424276,  -849.43603008,
        -915.04814726,  -948.70080886, -1006.74539441, -1057.81047518,
       -1097.80436406, -1158.40499115, -1200.15958676, -1249.60561527,
       -1304.47836436, -1345.2463357 , -1399.95731156, -1447.82872958,
       -1492.7162826 , -1547.00259901, -1591.19184282, -1640.50859062,
       -1691.22223173, -1735.63765775, -1786.83253137, -1834.10668064,
       -1880.68973747, -1931.09970086, -1976.83602994, -2025.29203572])

In [42]:
forecast_feature.shape

(40,)

In [43]:
# Forecast T+1
forecast_steps = 1
forecast_T1_feature = arima_model_fit_feature.forecast(steps=forecast_steps)
print("Forecasts for the T+1:")
print(forecast_T1_feature)

Forecasts for the T+1:
[-162.95449705]


In [44]:
# Forecast T+7
forecast_steps = 7
forecast_T1_feature = arima_model_fit_feature.forecast(steps=forecast_steps)
print("Forecasts for the T+7:")
print(forecast_T1_feature)

Forecasts for the T+7:
[-162.95449705 -110.05516624 -200.932584   -288.99702457 -261.0201476
 -378.47416069 -409.42046932]


In [45]:
# Calculate evaluation metrics for test set
rmse_test = np.sqrt(mean_squared_error(y_test, forecast_feature))
mape_test = mean_absolute_percentage_error(y_test, forecast_feature)
mae_test = np.mean(np.abs(y_test - forecast_feature))

# Print metrics for test set
print("Test Set Metrics:")
print(f'Root Mean Squared Error (RMSE): {rmse_test}')
print(f'Mean Absolute Percentage Error (MAPE): {mape_test}')
print(f'Mean Absolute Error (MAE): {mae_test}')

Test Set Metrics:
Root Mean Squared Error (RMSE): 1379.4601011531843
Mean Absolute Percentage Error (MAPE): 690.0325898232641
Mean Absolute Error (MAE): 1256.875327527775
