In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from transformers import TFBertForSequenceClassification, BertTokenizerFast
from gensim.models import Word2Vec

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Function to calculate mean absolute percentage error
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
df_stock = pd.read_csv(r"C:\Users\ASatl\OneDrive\Desktop\let\AAPL.csv")
df_stock = df_stock[-200:]
df_stock

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
5846,2023-03-29,159.369995,161.050003,159.350006,160.770004,160.120499,51305700
5847,2023-03-30,161.529999,162.470001,161.270004,162.360001,161.704056,49501700
5848,2023-03-31,162.440002,165.000000,161.910004,164.899994,164.233795,68749800
5849,2023-04-03,164.270004,166.289993,164.220001,166.169998,165.498672,56976200
5850,2023-04-04,166.600006,166.839996,165.110001,165.630005,164.960846,46278300
...,...,...,...,...,...,...,...
6041,2024-01-08,182.089996,185.600006,181.500000,185.559998,185.559998,59144500
6042,2024-01-09,183.919998,185.149994,182.729996,185.139999,185.139999,42841800
6043,2024-01-10,184.350006,186.399994,183.919998,186.190002,186.190002,46792900
6044,2024-01-11,186.539993,187.050003,183.619995,185.589996,185.589996,49128400


In [4]:
# Convert the 'Date' column to datetime format
df_stock['Date'] = pd.to_datetime(df_stock['Date'])
df_stock['Date']

5846   2023-03-29
5847   2023-03-30
5848   2023-03-31
5849   2023-04-03
5850   2023-04-04
          ...    
6041   2024-01-08
6042   2024-01-09
6043   2024-01-10
6044   2024-01-11
6045   2024-01-12
Name: Date, Length: 200, dtype: datetime64[ns]

In [5]:
# Extracting the target variable 'Close' (dependent variable) from the DataFrame
y = df_stock['Close']

# Extracting the feature 'Open' (independent variable) from the DataFrame
X = df_stock['Open']


In [6]:
# Reshape the data as MLPRegressor expects a 2D array
X = X.values.reshape(-1, 1)
y = y.values

In [36]:
X

array([[159.369995],
       [161.529999],
       [162.440002],
       [164.270004],
       [166.600006],
       [164.740005],
       [162.429993],
       [161.419998],
       [162.350006],
       [161.220001],
       [161.630005],
       [164.589996],
       [165.089996],
       [166.100006],
       [165.800003],
       [166.089996],
       [165.050003],
       [165.      ],
       [165.190002],
       [163.059998],
       [165.190002],
       [168.490005],
       [169.279999],
       [170.089996],
       [169.5     ],
       [164.889999],
       [170.979996],
       [172.479996],
       [173.050003],
       [173.020004],
       [173.850006],
       [173.619995],
       [173.160004],
       [171.990005],
       [171.710007],
       [173.      ],
       [176.389999],
       [173.979996],
       [173.130005],
       [171.089996],
       [172.410004],
       [173.320007],
       [176.960007],
       [177.330002],
       [177.699997],
       [181.029999],
       [182.630005],
       [179.9

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
scaler_x = StandardScaler()
scaler_y = StandardScaler()

scaler_x_feature = StandardScaler()
scaler_y_feature = StandardScaler()

In [9]:
df_Tweets = pd.read_csv(r"C:\Users\ASatl\OneDrive\Desktop\let\tweets.csv")
df_Tweets

Unnamed: 0,No.,Message
0,1.0,Excited to see what's next for $AAPL in the te...
1,2.0,$AAPL's stock performance reflects its solid f...
2,3.0,Wondering how $AAPL will capitalize on the lat...
3,4.0,$AAPL's innovation engine never ceases to amaze.
4,5.0,Anticipating the impact of $AAPL's upcoming pr...
...,...,...
201,,
202,,
203,,
204,,


In [10]:
# Remove rows with missing values in the 'Message' column
df_Tweets = df_Tweets.dropna(subset=['Message'])

In [11]:
# Sentiment analysis using Word2Vec
tweets_texts = df_Tweets['Message'].tolist()
tweets_tokens = [tweet_text.split() for tweet_text in tweets_texts]
Word2Vec_model = Word2Vec(tweets_tokens, vector_size=100, window=5, min_count=1, workers=4)

In [12]:
# Load pre-trained FinBERT model and tokenizer
finbert_model = TFBertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
finbert_tokenizer = BertTokenizerFast.from_pretrained('yiyanghkust/finbert-tone')




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at yiyanghkust/finbert-tone.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [13]:
# Tokenize tweet texts using FinBERT tokenizer
tokenized_tweets = finbert_tokenizer(tweets_texts, padding=True, truncation=True, return_tensors="tf")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
# Obtain predictions from FinBERT model
finbert_predictions = finbert_model.predict(tokenized_tweets)




In [42]:
finbert_predictions

TFSequenceClassifierOutput(loss=None, logits=array([[ -1.6455983 ,   7.7009273 ,  -9.36854   ],
       [ -7.143957  ,  12.671352  ,  -5.8195057 ],
       [  3.1764252 ,  -3.6213303 ,  -2.2580187 ],
       [  1.3676811 ,  -3.2863476 ,   3.278767  ],
       [  7.140092  ,  -5.9732103 ,  -4.247523  ],
       [  0.49003828,   0.5343213 ,  -4.0019054 ],
       [ -7.5047336 ,  13.220085  ,  -6.9881773 ],
       [ -6.605027  ,  11.46611   ,  -6.449324  ],
       [ -1.6974301 ,  -3.8269472 ,   9.344115  ],
       [ -0.16802241,   1.476324  ,  -4.0699983 ],
       [ -6.0637956 ,  11.975257  ,  -7.9659758 ],
       [  1.2053092 ,  -2.6536217 ,  -2.489853  ],
       [  5.6234546 ,  -4.3435616 ,  -2.3808713 ],
       [ -2.496433  ,   6.7538376 ,  -7.145686  ],
       [ -2.9782917 ,   7.927046  ,  -7.0531363 ],
       [  3.5192962 ,  -6.099468  ,   1.0944744 ],
       [  6.611659  ,  -5.7259917 ,  -3.5743484 ],
       [ -4.2252584 ,   7.5822816 ,  -6.297868  ],
       [  5.1665053 ,  -6.228522  ,  

In [38]:
# Extract sentiment scores from predictions
positive_scores = finbert_predictions[0][:, 0]  # Positive sentiment score
negative_scores = finbert_predictions[0][:, 1]  # Negative sentiment score
neutral_scores = finbert_predictions[0][:, 2]   # Neutral sentiment score

print(f"Postive scores: {positive_scores[-10:]}")
print(f"Negative scores: {negative_scores[-10:]}")
print(f"Neutral scores: {neutral_scores[-10:]}")

Postive scores: [ 1.2708416 -0.6676626  3.7765489 -4.290741  -5.382528  -0.8940015
 -3.5379117  2.3110476 -5.475395  -5.305478 ]
Negative scores: [ 4.1959977   4.6228714  -0.14867342  9.493037   10.247296    5.7470655
  6.7671456   1.7550515  10.338042    9.982049  ]
Neutral scores: [-6.850038  -5.282077  -5.386553  -6.3651724 -5.2787013 -6.41382
 -5.9507504 -5.6000347 -5.509652  -5.1557927]


In [43]:
# Concatenate sentiment scores as additional features
additional_features = np.column_stack((positive_scores, negative_scores, neutral_scores))
print(f"Additional Features: {additional_features[-10:]}")

Additional Features: [[ 1.2708416   4.1959977  -6.850038  ]
 [-0.6676626   4.6228714  -5.282077  ]
 [ 3.7765489  -0.14867342 -5.386553  ]
 [-4.290741    9.493037   -6.3651724 ]
 [-5.382528   10.247296   -5.2787013 ]
 [-0.8940015   5.7470655  -6.41382   ]
 [-3.5379117   6.7671456  -5.9507504 ]
 [ 2.3110476   1.7550515  -5.6000347 ]
 [-5.475395   10.338042   -5.509652  ]
 [-5.305478    9.982049   -5.1557927 ]]


In [17]:
#1
# Create and train the SVR model
svr_model = SVR(kernel="rbf")
X_train_scaled = scaler_x.fit_transform(X_train)
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()
svr_model.fit(X_train_scaled, y_train_scaled)

In [18]:
# Make predictions on the test set
X_test_scaled = scaler_x.transform(X_test.reshape(-1,1))
y_pred_scaled = svr_model.predict(X_test_scaled)

In [19]:
# Inverse transform the predictions to get them back to the original scale
y_pred_scaled = y_pred_scaled.reshape(-1, 1)  # Reshape to 2D array
y_pred = scaler_y.inverse_transform(y_pred_scaled)

In [20]:
# Make predictions on the test set
X_test_scaled = scaler_x.transform(X_test.reshape(-1,1))
y_pred_scaled = svr_model.predict(X_test_scaled)

In [21]:
# Inverse transform the predictions to get them back to the original scale
y_pred_scaled = y_pred_scaled.reshape(-1, 1)  # Reshape to 2D array
y_pred = scaler_y.inverse_transform(y_pred_scaled)

In [22]:
# Forecast for the next day
# Assuming the last row of the dataset represents the latest data
last_day_open = df_stock['Close'].iloc[-1].reshape(-1, 1)
next_day_forecast_scaled = svr_model.predict(scaler_x.transform(last_day_open))
next_day_forecast_unscaled = scaler_y.inverse_transform(next_day_forecast_scaled.reshape(-1, 1))

print("The next day prediction is: ", next_day_forecast_unscaled[0, 0])

The next day prediction is:  186.1867657489219


In [23]:
# Initialize an array to store forecasts for the next 6 days
week_forecasts = []

# Forecast for the next 6 days
for i in range(1, 7):
    # Use the predicted value of the previous day as the 'Open' value for forecasting the next day
    next_day_open = next_day_forecast_unscaled.reshape(-1, 1)
    next_day_forecast_scaled = svr_model.predict(scaler_x.transform(next_day_open))
    next_day_forecast_unscaled = scaler_y.inverse_transform(next_day_forecast_scaled.reshape(-1, 1))

    # Append the forecast to the array
    week_forecasts.append(next_day_forecast_unscaled[0, 0])

# The 'week_forecasts' array now contains the forecasts for the entire week
print("7th Day Forecasts:", week_forecasts[len(week_forecasts)-1])

7th Day Forecasts: 187.84655348190023


In [24]:
# Calculate evaluation metrics
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
mae = np.mean(np.abs(y_test - y_pred))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Mean Absolute Percentage Error (MAPE): {mape}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Percentage Error (MAPE): 6.330850927809298
Mean Absolute Error (MAE): 11.406097673363343
Root Mean Squared Error (RMSE): 1.759733384531049


In [25]:
# Concatenate additional features with existing features (X_train, X_test)
X_train_with_features = np.concatenate((X_train, additional_features[:len(X_train)]), axis=1)
X_test_with_features = np.concatenate((X_test, additional_features[len(X_train):]), axis=1)

In [26]:
X_train_with_features.shape

(160, 4)

In [27]:
X_train.shape

(160, 1)

In [28]:
X_train_with_features.shape

(160, 4)

In [29]:
# Create and train the SVR model with features
svr_model_feature = SVR(kernel="rbf")
X_train_scaled_feature = scaler_x_feature.fit_transform(X_train_with_features)
y_train_scaled_feature = scaler_y_feature.fit_transform(y_train.reshape(-1, 1)).flatten()
svr_model_feature.fit(X_train_scaled_feature, y_train_scaled_feature)

In [30]:
# Make predictions on the test set
X_test_scaled_feature = scaler_x_feature.transform(X_test_with_features) 
y_pred_scaled_feature = svr_model_feature.predict(X_test_scaled_feature)

In [31]:
# Inverse transform the predictions to get them back to the original scale
y_pred_scaled_feature = y_pred_scaled_feature.reshape(-1, 1)  # Reshape to 2D array
y_pred_with_feature = scaler_y_feature.inverse_transform(y_pred_scaled_feature)

In [32]:
# Reshape df_stock['Close'] to match the dimensions of additional_features
close_column = df_stock['Close'].values.reshape(-1, 1)

# Concatenate close_column with additional_features
data_stock_feature = np.concatenate((close_column, additional_features[:len(close_column)]), axis=1)
data_stock_feature.shape

(200, 4)

In [33]:
# Forecast for the next day
# Assuming the last row of the dataset represents the latest data
last_day_open = data_stock_feature
last_day_open_reshaped = np.array(last_day_open)
next_day_forecast_scaled = svr_model_feature.predict(scaler_x_feature.transform(last_day_open_reshaped))
next_day_forecast_unscaled = scaler_y_feature.inverse_transform(next_day_forecast_scaled.reshape(-1, 1))

print("The next day prediction is: ", next_day_forecast_unscaled[0, 0])

The next day prediction is:  166.58907115382306


In [34]:
# Initialize an array to store forecasts for the next 6 days
week_forecasts = []

# Forecast for the next 6 days
for i in range(1, 7):
    # Use the predicted value of the previous day as the 'Open' value for forecasting the next day
    next_day_open = next_day_forecast_unscaled.reshape(-1, 1)
    next_day_forecast_scaled = svr_model.predict(scaler_x.transform(next_day_open))
    next_day_forecast_unscaled = scaler_y.inverse_transform(next_day_forecast_scaled.reshape(-1, 1))

    # Append the forecast to the array
    week_forecasts.append(next_day_forecast_unscaled[0, 0])

# The 'week_forecasts' array now contains the forecasts for the entire week
print("7th Day Forecasts:", week_forecasts[len(week_forecasts)-1])

7th Day Forecasts: 169.15804394286536


In [35]:
# Calculate evaluation metrics
mape = np.mean(np.abs((y_test - y_pred_with_feature) / y_test)) * 100
mae = np.mean(np.abs(y_test - y_pred_with_feature))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_with_feature))

print(f'Mean Absolute Percentage Error (MAPE): {mape}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Mean Absolute Percentage Error (MAPE): 6.1487159624976595
Mean Absolute Error (MAE): 11.08256463204729
Root Mean Squared Error (RMSE): 2.0084400704628296
