In [None]:
import pandas as pd
import duckdb
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import os 
from dotenv import load_dotenv
from pathlib import Path
load_dotenv()

In [None]:
DB_PATH = Path(os.getenv("DB_PATH"))
DB_FILE = os.getenv("DB_FILE")
duckdb_path = DB_PATH / DB_FILE

In [None]:
con = duckdb.connect(duckdb_path, read_only=True)
df = con.execute("SELECT * FROM Headlines.weekly_training_data").fetchdf()
con.close()

In [3]:
THRESHOLD = 2.5

df["Target_t_7"] = 0  # Default to 'No Movement'
df.loc[df["price_change_t_7"] >= THRESHOLD, "Target_t_7"] = 1  # Surge
df.loc[df["price_change_t_7"] <= -THRESHOLD, "Target_t_7"] = -1  # Drop

df["Target_t_7"].value_counts() # honestly this is pretty balanced

Target_t_7
 0    169018
 1     16569
-1     15480
Name: count, dtype: int64

In [None]:
# drop rows that have -1 for price_t, price_t_7_past, volume_t, volume_t_7_past
print(f"Before removing -1s {df.shape}")
filtered_df = df[(df["price_t"] != -1) & (df["price_t_7_past"] != -1) & (df["volume_t"] != -1) & (df["volume_t_7_past"] != -1)]
print(f"After removing -1s {filtered_df.shape}")

# remove everything with subindustry as null 
# print(f"Before removing nulls {filtered_df.shape}")
# filtered_df = filtered_df[filtered_df["subindustry"].notnull()]
# print(f"After removing nulls {filtered_df.shape}")

Before removing -1s (201067, 18)
After removing -1s (198113, 18)
Before removing nulls (198113, 18)
After removing nulls (198113, 18)


In [5]:
filtered_df["Target_t_7"].value_counts() # honestly this is pretty balanced

Target_t_7
 0    166074
 1     16565
-1     15474
Name: count, dtype: int64

In [6]:
# see what rows have NaN values 
filtered_df[filtered_df.isna().any(axis=1)]
# okay looks like we have one weird case where no volume data for BIIB @ 2023-06-12 so its NaN??? lets make it 0 I guess 
filtered_df["volume_change_t_7"] = filtered_df["volume_change_t_7"].fillna(0)
# filtered_df[filtered_df.isna().any(axis=1)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["volume_change_t_7"] = filtered_df["volume_change_t_7"].fillna(0)


handle categorical

In [7]:
categorical_features = ["ticker", "subindustry"]

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(filtered_df[categorical_features])
encoded_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_features))

In [8]:
processed_df = filtered_df.drop(columns=categorical_features)
processed_df = processed_df.drop(columns=["guid", "sentiment_label_t"])
# processed_df = pd.concat([processed_df, encoded_df], axis=1)
processed_df.columns

Index(['date_t', 'vix_t', 'vix_t_7_past', 'vix_t_7_future', 'price_t',
       'price_t_7_past', 'price_change_t_7', 'volume_t', 'volume_t_7_past',
       'volume_change_t_7', 'sentiment_positive_t', 'sentiment_neutral_t',
       'sentiment_negative_t', 'Target_t_7'],
      dtype='object')

In [9]:
scaler = StandardScaler()

date_col = processed_df["date_t"].reset_index(drop=True)  # Reset index
target_col = processed_df["Target_t_7"].reset_index(drop=True)  # Reset index
processed_df_no_date = processed_df.drop(columns=["date_t", "Target_t_7"]).reset_index(drop=True)

scaled_features = scaler.fit_transform(processed_df_no_date)
scaled_df = pd.DataFrame(scaled_features, columns=processed_df_no_date.columns)
print(scaled_df[scaled_df.isna().any(axis=1)].size)
if date_col.index.equals(scaled_df.index) and target_col.index.equals(scaled_df.index):
    scaled_df = pd.concat([date_col, scaled_df, target_col], axis=1)    
    print(scaled_df.columns)
else:
    raise ValueError("Index mismatch")

0
Index(['date_t', 'vix_t', 'vix_t_7_past', 'vix_t_7_future', 'price_t',
       'price_t_7_past', 'price_change_t_7', 'volume_t', 'volume_t_7_past',
       'volume_change_t_7', 'sentiment_positive_t', 'sentiment_neutral_t',
       'sentiment_negative_t', 'Target_t_7'],
      dtype='object')


In [10]:
final_df = pd.concat([scaled_df, encoded_df], axis=1)
final_df.head()

Unnamed: 0,date_t,vix_t,vix_t_7_past,vix_t_7_future,price_t,price_t_7_past,price_change_t_7,volume_t,volume_t_7_past,volume_change_t_7,...,subindustry_60106010,subindustry_60106020,subindustry_60107010,subindustry_60108010,subindustry_60108020,subindustry_60108030,subindustry_60108040,subindustry_60108050,subindustry_60201040,subindustry_None
0,2024-07-22,-0.440606,-0.419025,0.742287,0.045056,0.044638,0.190594,-0.186053,-0.175984,-0.044958,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-07-22,-0.440606,-0.419025,0.742287,-0.026319,-0.026421,0.474186,-0.021495,-0.004783,-0.027092,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-07-22,-0.440606,-0.419025,0.742287,-0.021417,-0.021351,-0.20059,-0.017216,-0.02453,-0.014323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2024-07-22,-0.440606,-0.419025,0.742287,-0.033713,-0.03372,0.355548,1.980879,2.274551,-0.032749,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-07-22,-0.440606,-0.419025,0.742287,-0.030759,-0.030782,0.235853,-0.113726,-0.13452,0.022861,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# find unique dates 
final_df = final_df.sort_values(by="date_t", ascending=True)
dates = final_df["date_t"].unique()
print(len(dates) * 0.8)
print(dates[108])

# so the middle is at index 108 
train_dates = dates[:108]
test_dates = dates[108:]

train_df = final_df[final_df["date_t"].isin(train_dates)]
test_df = final_df[final_df["date_t"].isin(test_dates)]

X_train = train_df.drop(columns=["Target_t_7", "date_t"])
y_train = train_df["Target_t_7"]

X_test = test_df.drop(columns=["Target_t_7", "date_t"])
y_test = test_df["Target_t_7"]

107.2
2024-01-29 00:00:00


In [12]:
model = LogisticRegression(class_weight="balanced", max_iter=1000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.94      1.00      0.97      2784
           0       1.00      0.99      0.99     41234
           1       0.94      1.00      0.97      3861

    accuracy                           0.99     47879
   macro avg       0.96      1.00      0.98     47879
weighted avg       0.99      0.99      0.99     47879



In [13]:
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 2784,     0,     0],
       [  189, 40789,   256],
       [    0,    11,  3850]])

In [14]:

y_train_fixed = y_train + 1
y_test_fixed = y_test + 1


# Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",  # Multi-class classification
    num_class=3,  # Three classes (-1, 0, 1)
    eval_metric="mlogloss",  # Multi-class log loss
    use_label_encoder=False,
    seed=42
)

# Train the model
xgb_model.fit(X_train, y_train_fixed)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate model
classification_report(y_test_fixed, y_pred_xgb)

Parameters: { "use_label_encoder" } are not used.



'              precision    recall  f1-score   support\n\n           0       0.98      0.99      0.98      2784\n           1       1.00      1.00      1.00     41234\n           2       0.98      0.99      0.98      3861\n\n    accuracy                           1.00     47879\n   macro avg       0.98      0.99      0.99     47879\nweighted avg       1.00      1.00      1.00     47879\n'

In [15]:
confusion_matrix(y_test_fixed, y_pred_xgb)

array([[ 2748,    36,     0],
       [   65, 41072,    97],
       [    0,    41,  3820]])

weird okay so both the models did great with training data

are we overfitting? did we memorize?

In [17]:
# okay lets load Headlines.weekly_test_data, take it through the same process and see how it does
con = duckdb.connect(r"C:/Users/jovan/Documents/CSE 6748 Practicum WITH DATA/financial_news.db", read_only=True)
df = con.execute("SELECT * FROM Headlines.weekly_test_data").fetchdf()
con.close()

df["Target_t_7"] = 0  # Default to 'No Movement'
df.loc[df["price_change_t_7"] >= THRESHOLD, "Target_t_7"] = 1  # Surge
df.loc[df["price_change_t_7"] <= -THRESHOLD, "Target_t_7"] = -1  # Drop

df["Target_t_7"].value_counts() # honestly this is pretty balanced

Target_t_7
 0    36359
-1     3705
 1     3552
Name: count, dtype: int64

In [25]:
df.head()

Unnamed: 0,guid,date_t,ticker,subindustry,vix_t,vix_t_7_past,vix_t_7_future,price_t,price_t_7_past,price_change_t_7,volume_t,volume_t_7_past,volume_change_t_7,sentiment_label_t,sentiment_positive_t,sentiment_neutral_t,sentiment_negative_t,Target_t_7
0,9b2ce942-fe3a-33f1-8d05-42731b3044cf,2024-07-29,TPH,,23.389999,16.389999,23.739477,46.029999,46.119999,-0.195143,1010214,1213797,-16.772409,NEGATIVE,6.552353e-05,0.000629,0.9993052,0
1,6b979d74-28cd-33a5-b0b8-7f50b136c299,2024-07-29,ANET,45201020.0,23.389999,16.389999,23.739477,20.034374,19.950624,0.419785,45150192,37617648,20.023963,NEGATIVE,3.016462e-06,2e-06,0.9999951,0
2,868669ed-4c3b-3ffe-b403-3fc62470cc47,2024-07-29,ALLE,20102010.0,23.389999,16.389999,23.739477,137.660004,136.770004,0.650727,757731,1089860,-30.474465,NEUTRAL,0.02495872,0.75342,0.2216212,0
3,4d013850-1182-3ae5-8404-a66875d88ded,2024-07-29,ON,45301020.0,23.389999,16.389999,23.739477,78.269997,70.169998,11.543392,22282731,7764342,186.988015,NEUTRAL,8.019854e-05,0.999838,8.158579e-05,1
4,b6266589-784a-3b21-a62f-c4842ef233f5,2024-07-29,GDDY,45102030.0,23.389999,16.389999,23.739477,143.839996,143.649994,0.132268,1307186,1395490,-6.327813,NEUTRAL,5.580548e-09,1.0,1.250599e-09,0


In [26]:
test_df = df
test_df["volume_change_t_7"] = df["volume_change_t_7"].fillna(0)
categorical_features = ["ticker", "subindustry"]

In [32]:
encoded_cats = encoder.transform(test_df[categorical_features])  # Use trained encoder
encoded_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_features))
test_df = test_df.drop(columns=categorical_features)
test_df = test_df.drop(columns=["guid", "sentiment_label_t"])
date_col = test_df["date_t"].reset_index(drop=True)  # Reset index
target_col = test_df["Target_t_7"].reset_index(drop=True)  # Reset index
test_df_no_date = test_df.drop(columns=["date_t", "Target_t_7"]).reset_index(drop=True)
test_df_no_date.columns

Index(['vix_t', 'vix_t_7_past', 'vix_t_7_future', 'price_t', 'price_t_7_past',
       'price_change_t_7', 'volume_t', 'volume_t_7_past', 'volume_change_t_7',
       'sentiment_positive_t', 'sentiment_neutral_t', 'sentiment_negative_t'],
      dtype='object')

In [34]:
scaled_features = scaler.transform(test_df_no_date)  # Use trained scaler
scaled_df = pd.DataFrame(scaled_features, columns=test_df_no_date.columns)

date_col = test_df["date_t"].reset_index(drop=True)
final_test_df = pd.concat([date_col,target_col, scaled_df, encoded_df], axis=1)
final_test_df.head()

Unnamed: 0,date_t,Target_t_7,vix_t,vix_t_7_past,vix_t_7_future,price_t,price_t_7_past,price_change_t_7,volume_t,volume_t_7_past,...,subindustry_60106010,subindustry_60106020,subindustry_60107010,subindustry_60108010,subindustry_60108020,subindustry_60108030,subindustry_60108040,subindustry_60108050,subindustry_60201040,subindustry_None
0,2024-07-29,0,0.753557,-0.441203,0.801831,-0.031332,-0.031321,-0.109811,-0.159862,-0.151242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2024-07-29,0,0.753557,-0.441203,0.801831,-0.033271,-0.033275,0.12934,1.466899,1.031421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-07-29,0,0.753557,-0.441203,0.801831,-0.024494,-0.024554,0.219156,-0.169167,-0.155268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2024-07-29,1,0.753557,-0.441203,0.801831,-0.028926,-0.029526,4.455404,0.624128,0.061568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-07-29,0,0.753557,-0.441203,0.801831,-0.024033,-0.02404,0.017522,-0.148917,-0.145339,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# fill nans for 0 for now. don't feel liek fixing it 
final_test_df = final_test_df.fillna(0)

In [None]:
X_test_new = final_test_df.drop(columns=["Target_t_7", "date_t"])
y_test_new = final_test_df["Target_t_7"]

y_pred_new = model.predict(X_test_new)

In [43]:
print(classification_report(y_test_new, y_pred_new))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96      3705
           0       1.00      0.99      0.99     36359
           1       0.94      1.00      0.97      3552

    accuracy                           0.99     43616
   macro avg       0.95      0.99      0.97     43616
weighted avg       0.99      0.99      0.99     43616



In [45]:
print(confusion_matrix(y_test_new, y_pred_new))

[[ 3705     0     0]
 [  301 35815   243]
 [    0     1  3551]]


In [44]:
# now for xgboost
y_test_new_fixed = y_test_new + 1
y_pred_xgb_new = xgb_model.predict(X_test_new)

print(classification_report(y_test_new_fixed, y_pred_xgb_new))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3705
           1       1.00      1.00      1.00     36359
           2       0.98      1.00      0.99      3552

    accuracy                           1.00     43616
   macro avg       0.99      0.99      0.99     43616
weighted avg       1.00      1.00      1.00     43616



In [46]:
print(confusion_matrix(y_test_new_fixed, y_pred_xgb_new))

[[ 3677    28     0]
 [   61 36232    66]
 [    0    16  3536]]
