In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor  # Import Random Forest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from transformers import TFBertForSequenceClassification, BertTokenizerFast
from gensim.models import Word2Vec

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Reads data
df_stock = pd.read_csv(r"C:\Users\ASatl\OneDrive\Desktop\let\tweets.csv")
df_stock = df_stock[-200:]
df_stock

Unnamed: 0,Date,Open,High,Low,Close
0,05/03/2024,2313.50,2330.70,2285.20,2308.60
1,05/02/2024,2329.90,2336.10,2294.30,2309.60
2,05/01/2024,2298.20,2339.50,2291.70,2311.00
3,04/30/2024,2347.00,2347.60,2296.20,2302.90
4,04/29/2024,2347.60,2358.90,2331.00,2357.70
...,...,...,...,...,...
187,08/07/2023,1977.60,1981.70,1966.10,1970.00
188,08/04/2023,1969.60,1984.20,1954.50,1976.10
189,08/03/2023,1970.80,1974.50,1964.50,1968.80
190,08/02/2023,1988.90,1992.20,1969.10,1975.00


In [3]:
# Convert the 'Date' column to datetime format
df_stock['Date'] = pd.to_datetime(df_stock['Date'])

In [4]:
# Extracting the target variable 'Close' (dependent variable) from the DataFrame
y = df_stock['Close']

# Extracting the feature 'Open' (independent variable) from the DataFrame
X = df_stock['Open']

In [5]:
# Reshape the data as RandomForestRegressor expects a 2D array
X = X.values.reshape(-1, 1)
y = y.values

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
scaler_x = StandardScaler()
scaler_y = StandardScaler()

scaler_x_feature = StandardScaler()
scaler_y_feature = StandardScaler()

In [8]:
# Create and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42) 
X_train_scaled = scaler_x.fit_transform(X_train)
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()
rf_model.fit(X_train_scaled, y_train_scaled)

ValueError: could not convert string to float: '2,051.40'

In [None]:
# Forecast for the next day
# Assuming the last row of the dataset represents the latest data
last_day_open = df_stock['Open'].iloc[-1].reshape(-1, 1)
next_day_forecast_scaled = rf_model.predict(last_day_open)
next_day_forecast_unscaled = scaler_y.inverse_transform(next_day_forecast_scaled.reshape(-1,1)).flatten()

print("The next day prediction is: ", next_day_forecast_unscaled[0])

In [None]:
# Initialize an array to store forecasts for the next 6 days
week_forecasts = []

In [None]:
# Forecast for the next 6 days
for i in range(1, 7):
    # Use the predicted value of the previous day as the 'Open' value for forecasting the next day
    next_day_open = next_day_forecast_scaled.reshape(-1, 1)
    next_day_forecast_scaled = rf_model.predict(next_day_open)
    next_day_forecast_unscaled = scaler_y.inverse_transform(next_day_forecast_scaled.reshape(-1,1)).flatten()
    # Append the forecast to the array
    week_forecasts.append(next_day_forecast_unscaled[0])

In [None]:
# The 'week_forecasts' array now contains the forecasts for the entire week
print("7th Day Forecasts:", week_forecasts[-1])

In [None]:
# Make predictions on the test set
X_test_scaled = scaler_x.transform(X_test) 
y_pred_scaled = rf_model.predict(X_test_scaled)

In [None]:
# Inverse transform the predictions to get them back to the original scale
y_pred_scaled = y_pred_scaled.reshape(-1,1)
y_pred = scaler_y.inverse_transform(y_pred_scaled)

In [None]:
# Calculate evaluation metrics
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
mae = np.mean(np.abs(y_test - y_pred))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Accuracy without NLP")
print(f'Mean Absolute Percentage Error (MAPE): {mape}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

In [None]:
df_Tweets = pd.read_csv(r"C:\Users\ASatl\OneDrive\Desktop\let\tweets.csv")
df_Tweets

In [None]:
# Remove rows with missing values in the 'Message' column
df_Tweets = df_Tweets.dropna(subset=['Message'])

In [None]:
# Sentiment analysis using Word2Vec
tweets_texts = df_Tweets['Message'].tolist()
tweets_tokens = [tweet_text.split() for tweet_text in tweets_texts]
Word2Vec_model = Word2Vec(tweets_tokens, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# Load pre-trained FinBERT model and tokenizer
finbert_model = TFBertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
finbert_tokenizer = BertTokenizerFast.from_pretrained('yiyanghkust/finbert-tone')

In [None]:
# Tokenize tweet texts using FinBERT tokenizer
tokenized_tweets = finbert_tokenizer(tweets_texts, padding=True, truncation=True, return_tensors="tf")

In [None]:
# Obtain predictions from FinBERT model
finbert_predictions = finbert_model.predict(tokenized_tweets)

In [None]:
# Extract sentiment scores from predictions
positive_scores = finbert_predictions[0][:, 0]  # Positive sentiment score
negative_scores = finbert_predictions[0][:, 1]  # Negative sentiment score
neutral_scores = finbert_predictions[0][:, 2]   # Neutral sentiment score

In [None]:
# Concatenate sentiment scores as additional features
additional_features = np.column_stack((positive_scores, negative_scores, neutral_scores))

In [None]:
# Concatenate additional features with existing features (X_train, X_test)
X_train_with_features = np.concatenate((X_train, additional_features[:len(X_train)]), axis=1)
X_test_with_features = np.concatenate((X_test, additional_features[len(X_train):]), axis=1)

In [None]:
# Create and train the Random Forest model
rf_model_feature = RandomForestRegressor(n_estimators=100, random_state=42)  
X_train_scaled_feature = scaler_x_feature.fit_transform(X_train_with_features)
y_train_scaled_feature = scaler_y_feature.fit_transform(y_train.reshape(-1, 1)).flatten()
rf_model_feature.fit(X_train_scaled_feature, y_train_scaled_feature)

In [None]:
# Reshape df_stock['Close'] to match the dimensions of additional_features
close_column = df_stock['Close'].values.reshape(-1, 1)

# Concatenate close_column with additional_features
data_stock_feature = np.concatenate((close_column, additional_features[:len(close_column)]), axis=1)
data_stock_feature.shape

In [None]:
# Forecast for the next day
# Assuming the last row of the dataset represents the latest data
last_day_open = data_stock_feature
next_day_forecast_feature_scaled = rf_model_feature.predict(last_day_open)
next_day_forecast_feature_unscaled = scaler_y_feature.inverse_transform(next_day_forecast_feature_scaled.reshape(-1,1)).flatten()

print("The next day prediction is: ", next_day_forecast_feature_unscaled[0])

In [None]:
# Initialize an array to store forecasts for the next 6 days
week_forecasts = []

# Forecast for the next 6 days
for i in range(1, 7):
    # Use the predicted value of the previous day as the 'Open' value for forecasting the next day
    next_day_open = next_day_forecast_feature_scaled.reshape(-1, 1)
    next_day_forecast_feature_scaled = rf_model.predict(next_day_open)
    next_day_forecast_feature_unscaled = scaler_y_feature.inverse_transform(next_day_forecast_feature_scaled.reshape(-1,1)).flatten()
     # Append the forecast to the array
    week_forecasts.append(next_day_forecast_feature_unscaled[0])

    # The 'week_forecasts' array now contains the forecasts for the entire week
print("7th Day Forecasts:", week_forecasts[-1])

In [None]:
# Make predictions on the test set
X_test_scaled_feature = scaler_x_feature.transform(X_test_with_features) 
y_pred_scaled_feature = rf_model_feature.predict(X_test_scaled_feature)

In [None]:
# Inverse transform the predictions to get them back to the original scale
y_pred_scaled_feature = y_pred_scaled_feature.reshape(-1,1)
y_pred_with_feature = scaler_y_feature.inverse_transform(y_pred_scaled_feature)

In [None]:
y_pred_with_feature.shape

In [None]:
# Calculate evaluation metrics
mape = np.mean(np.abs((y_test - y_pred_with_feature) / y_test)) * 100
mae = np.mean(np.abs(y_test - y_pred_with_feature))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_with_feature))

print("Accuracy with NLP")
print(f'Mean Absolute Percentage Error (MAPE): {mape}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')