In [None]:
#Import required Python libraries
import pandas as pd
import os
import numpy as np


In [None]:
# Read both source CSV files fully into memory as DataFrames:
# dfNews holds the news articles, dfStock holds the VTI price rows.
news_csv_path = '/home/nickregas/Desktop/IndependentCompsciProject/Data/All_external.csv'
stock_csv_path = '/home/nickregas/Desktop/IndependentCompsciProject/Data/VTI.csv'
dfNews = pd.read_csv(news_csv_path)
dfStock = pd.read_csv(stock_csv_path)

In [None]:
# Compute each day's open-to-close move, then discard the columns that the
# rest of the notebook never reads (unnecessary price fields and article metadata).
stock_columns_to_drop = ['open', 'close', 'volume', 'high', 'low', 'adj close']
news_columns_to_drop = [
    'Stock_symbol', 'Url', 'Publisher', 'Author', 'Article',
    'Lsa_summary', 'Luhn_summary', 'Textrank_summary', 'Lexrank_summary',
]

# DailyChange must be derived before 'open' and 'close' are dropped below.
dfStock['DailyChange'] = dfStock['close'].sub(dfStock['open'])
dfStock = dfStock.drop(columns=stock_columns_to_drop)
dfNews = dfNews.drop(columns=news_columns_to_drop)


In [None]:
# Put both datasets on the same date key so they can be joined:
# keep only the text before the first space of the news 'Date' value,
# rename that column to 'date', then inner-join headlines with the daily change.
news_day = dfNews['Date'].str.split(' ').str.get(0)
dfNews['Date'] = news_day
dfNews = dfNews.rename(columns={'Date': 'date'})

news_titles = dfNews[['date', 'Article_title']]
stock_changes = dfStock[['date', 'DailyChange']]
# Inner join: only dates present in BOTH frames survive.
merged_df = news_titles.merge(stock_changes, on='date', how='inner')

In [None]:
# Persist the merged dataset as a CSV file (without the DataFrame index).
output_dir = "/media/nickregas/USB OG/CompSciData"
output_filename = os.path.join(output_dir, "DataFile1.csv")

# Create the target directory first; exist_ok avoids an error on re-runs.
os.makedirs(output_dir, exist_ok=True)
merged_df.to_csv(output_filename, index=False)

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from tqdm.auto import tqdm
import os

# Run on the GPU when CUDA is available, otherwise fall back to the CPU,
# and report which one was chosen.
if torch.cuda.is_available():
    device = 'cuda'
    print("Using device:", torch.cuda.get_device_name(0))
else:
    device = 'cpu'
    print("Using device: CPU")

# Sentence-embedding model used to turn headlines into vectors, placed on the chosen device.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

def balance_sample(df, sample_frac=0.3, random_state=42):
    """Down-sample ``df`` to an equal number of "up" and "down" rows.

    Rows are split on the ``DailyChange`` column: strictly positive values
    form the positive group, values ``<= 0`` form the negative group (rows
    whose ``DailyChange`` is missing match neither comparison and are left
    out). The same number of rows is drawn from each group so that a model
    trained on the result cannot score well by always guessing one class.

    Args:
        df: DataFrame containing a numeric ``DailyChange`` column.
        sample_frac: Fraction of the *smaller* group to keep per class.
        random_state: Seed used for both per-class draws and the final
            shuffle, so that a given input always yields the same sample.

    Returns:
        A shuffled DataFrame with ``int(min(n_pos, n_neg) * sample_frac)``
        rows from each group. The original index labels are preserved.

    Raises:
        ValueError: If ``sample_frac`` asks for more rows per class than the
            smaller group contains, or for a negative number of rows.
    """
    with tqdm(total=4, desc="Balancing Data") as pbar:
        pos = df[df['DailyChange'] > 0]
        neg = df[df['DailyChange'] <= 0]
        pbar.update(1)

        smaller_group = min(len(pos), len(neg))
        n_samples = int(smaller_group * sample_frac)
        # Fail here with a readable message instead of letting pandas raise
        # from inside .sample() with no mention of sample_frac.
        if n_samples < 0 or n_samples > smaller_group:
            raise ValueError(
                "sample_frac={!r} requires {} rows per class but the smaller class has only {}".format(
                    sample_frac, n_samples, smaller_group
                )
            )
        pos_sampled = pos.sample(n_samples, random_state=random_state)
        neg_sampled = neg.sample(n_samples, random_state=random_state)
        pbar.update(1)

        # frac=1 reorders every row, so the two classes end up interleaved.
        balanced_df = pd.concat([pos_sampled, neg_sampled]).sample(frac=1, random_state=random_state)
        pbar.update(1)

        pbar.set_postfix({"Samples": len(balanced_df), "Pos/Neg Ratio": "{}/{}".format(n_samples, n_samples)})
        pbar.update(1)

    return balanced_df

# Balance the merged dataset, then embed every headline with the
# sentence-transformer model created earlier in this cell.
balanced_df = balance_sample(merged_df)

# A missing title would reach the encoder as a float NaN rather than text.
# Coerce every entry to str (missing -> '') instead of dropping rows, so the
# embeddings stay row-aligned with balanced_df['DailyChange'].
article_titles = balanced_df['Article_title'].fillna('').astype(str).tolist()
if not article_titles:
    # np.vstack([]) below would otherwise fail with an unrelated-looking error.
    raise ValueError("No rows left after balancing; cannot generate embeddings.")

embeddings = []

batch_size = 128
# Encode the titles chunk by chunk so the progress bar advances as work completes.
with tqdm(total=len(article_titles), desc="Generating Embeddings") as progress_bar:
    for start in range(0, len(article_titles), batch_size):
        end = start + batch_size
        batch = article_titles[start:end]
        # Forward batch_size: without it encode() splits each chunk again
        # using its own default batch size, so the value above had no effect
        # on how many titles are processed per forward pass.
        encoded = model.encode(batch, batch_size=batch_size, show_progress_bar=False)
        embeddings.append(encoded)
        progress_bar.update(len(batch))

# Stack the per-chunk arrays into one (n_titles, embedding_dim) matrix.
embeddings = np.vstack(embeddings)

# Write the embeddings plus their DailyChange labels to a single parquet file.
# NOTE(review): the training cell globs 'ParquetFiles/embeddings_*.parquet',
# which this path does not match — confirm how those files are produced.
output_path = '/media/nickregas/USB OG/CompSciData/sbert_embeddings.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with tqdm(total=3, desc="Saving Parquet File") as pbar:
    # Step 1: one "vec_<i>" column per embedding dimension.
    vec_columns = ["vec_{}".format(i) for i in range(embeddings.shape[1])]
    df_output = pd.DataFrame(embeddings, columns=vec_columns)
    pbar.update(1)

    # Step 2: attach labels by position (plain array), because balanced_df
    # still carries its shuffled original index while df_output starts at 0.
    df_output['DailyChange'] = balanced_df['DailyChange'].to_numpy()
    pbar.update(1)

    # Step 3: write to disk without the index column.
    df_output.to_parquet(output_path, index=False)
    pbar.update(1)
    pbar.set_postfix({"Path": output_path})

print()
summary = "Saved " + str(len(df_output)) + " vectors (dim=" + str(embeddings.shape[1]) + ") to " + output_path
print(summary)

In [None]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost
from sklearn.metrics import confusion_matrix
import os
from tqdm.auto import tqdm


# Restrict CUDA to the first GPU. This only sets the environment variable;
# it does not verify that a GPU is actually present.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print("GPU setup complete")

# Load every embeddings parquet file that matches the pattern and stack them.
print("\nLoading data...")
parquet_pattern = '/media/nickregas/USB OG/CompSciData/ParquetFiles/embeddings_*.parquet'
parquet_files = sorted(glob.glob(parquet_pattern))
if not parquet_files:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which hides the real cause (wrong path / drive not mounted).
    raise FileNotFoundError("No parquet files matched " + parquet_pattern)

df_list = []
for file in tqdm(parquet_files, desc='Loading Files'):
    df_list.append(pd.read_parquet(file))
# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(df_list, ignore_index=True)

# Features are all "vec_*" columns; the label is 1 when DailyChange is
# strictly positive (increase) and 0 otherwise.
X = df.filter(like='vec_')
y = (df['DailyChange'] > 0).astype(int)

# 80/20 train/test split, stratified so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


print("\nTraining model using GPU...")


class ProgressCallback(xgboost.callback.TrainingCallback):
    """XGBoost training callback that shows a tqdm bar, one tick per boosting round.

    Args:
        total: Expected number of boosting rounds, used as the bar's length.
            When omitted, the bar falls back to the number of rounds the
            booster already holds; if that is 0 the bar is shown without a
            fixed length and simply counts completed rounds.
    """

    def __init__(self, total=None):
        super().__init__()
        self.total = total
        self.pbar = None

    def before_training(self, model):
        """Create the progress bar; returns the model unchanged."""
        total = self.total
        if total is None:
            # num_boosted_rounds() reports rounds already in the booster, so
            # before a fresh fit it is 0. Map 0 to None so tqdm shows an
            # open-ended counter rather than a bar with a total of zero.
            total = model.num_boosted_rounds() or None
        self.pbar = tqdm(total=total, desc='Training')
        return model

    def after_iteration(self, model, epoch, evals_log):
        """Advance the bar by one round; returning False lets training continue."""
        if self.pbar is not None:
            self.pbar.update(1)
        return False

    def after_training(self, model):
        """Close the progress bar; returns the model unchanged."""
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None
        return model

# Early stopping needs an evaluation set to decide when to stop. Using the
# test split for that would let the test data influence the model that is
# later scored on it, so carve a separate validation split out of the
# training data and keep X_test / y_test untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

# Gradient-boosted tree classifier with hand-tuned hyperparameters.
# NOTE(review): 'gpu_hist' and 'predictor' are the pre-2.0 XGBoost way of
# selecting the GPU; newer releases use device='cuda' with tree_method='hist'.
# Left as-is because the installed XGBoost version is not visible here — confirm.
model = XGBClassifier(
    n_estimators=1500,
    max_depth=10,
    learning_rate=0.08,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    subsample=0.85,
    colsample_bytree=0.85,
    gamma=1,
    reg_alpha=0,
    reg_lambda=0.3,
    min_child_weight=3,
    random_state=42,
    eval_metric='logloss',
    # Stop once validation logloss has not improved for 40 rounds.
    early_stopping_rounds=40,
    callbacks=[ProgressCallback()]
)

# Fit on the reduced training split; the validation split drives early stopping.
model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=0
)

# Save the trained model as JSON so it can be reloaded without re-training.
output_dir = '/media/nickregas/USB OG/CompSciData/'
# Create the directory if needed, as the earlier cells do before writing.
os.makedirs(output_dir, exist_ok=True)
model_path = os.path.join(output_dir, 'gpu_stock_model2.json')
model.save_model(model_path)
print("\nModel saved to " + model_path)


# Score the held-out test split and summarise the result as a confusion matrix.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Down', 'Predicted Up'],
    yticklabels=['Actual Down', 'Actual Up'],
    ax=ax,
)
ax.set_title('GPU Stock Prediction Performance')
fig.tight_layout()

# Save the figure next to the trained model.
conf_matrix_path = os.path.join(output_dir, 'gpu_confusion_matrix2.png')
fig.savefig(conf_matrix_path, dpi=120)
print("Confusion matrix image saved to " + conf_matrix_path)

print("\nPipeline finished successfully.")


