In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns




In [3]:
# Load the raw price history (one row per symbol and date, per the preview below).
df = pd.read_csv(filepath_or_buffer='data/history.csv')

In [4]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Symbol
0,2008-01-29,9.5,9.99,8.57,8.75,0.702589,1489000,AACG
1,2008-01-30,8.75,9.15,8.3,8.5,0.682516,219000,AACG
2,2008-01-31,8.49,10.3,8.49,9.55,0.766826,182300,AACG
3,2008-02-01,9.93,9.94,9.5,9.51,0.763614,28200,AACG
4,2008-02-04,9.5,9.71,9.5,9.5,0.762811,8300,AACG


In [6]:
# Company reference table: one row per ticker with its GICS classification.
df_category = pd.read_csv(filepath_or_buffer='data/sp500_companies.csv')
df_category.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [None]:
# Factor table; later cells read FedFundsRate, Treasury10Y, CPI, GDP,
# Unemployment and VIX from it after joining on the date.
df_factors = pd.read_csv(filepath_or_buffer='data/fed_stock_data_preprocessed.csv')
df_factors.head()

In [None]:
# The first column is loaded under the placeholder name 'Unnamed: 0';
# call it 'Date' so it can be used as the join key against the price data.
df_factors = df_factors.rename(columns={'Unnamed: 0': 'Date'})

In [None]:
# Check the factor table after the column rename.
df_factors.head()

In [None]:
# Shorten 'GICS Sector' to 'Sector'; every later cell refers to the short name.
df_category = df_category.rename(columns={'GICS Sector': 'Sector'})
df_category.head()

In [None]:
# Attach a Sector to every price row. The left join keeps symbols that are
# absent from the company table (their Sector becomes NaN).
# The lookup is de-duplicated on Symbol first: a repeated symbol in the
# company table would otherwise silently duplicate every price row for it.
sector_lookup = df_category[['Symbol', 'Sector']].drop_duplicates(subset='Symbol')
df_new = df.merge(sector_lookup, on='Symbol', how='left')


In [None]:
# Preview the price rows with the Sector column attached.
df_new.head()

In [None]:
# Discard every row that has a missing value in any column; this includes
# rows whose symbol found no match in the company table (Sector is NaN).
df_update = df_new.dropna(axis=0, how='any')

In [None]:
# Preview the rows that survived the NaN filter.
df_update.head(5)

In [None]:
# Analysis window, inclusive on both ends.
start_date = '2010-01-01'
end_date = '2024-12-31'

# 'Date' is compared as text (the CSV was read without date parsing); this
# works because YYYY-MM-DD strings sort in chronological order.
in_window = df_update['Date'].between(start_date, end_date)
df_filtered = df_update[in_window]

In [None]:
# Show the first 501 rows of the date-filtered frame.
df_filtered.iloc[:501]

In [None]:
# Show the last five rows of the date-filtered frame.
df_filtered.tail(5)

In [None]:
# Bring the factor columns onto each price row by matching on Date.
# A left join keeps price rows even when no factor row exists for that date.
df_merged = df_filtered.merge(df_factors, how='left', on='Date')

In [None]:
# Preview the joined price + factor rows.
df_merged.head(5)

In [None]:
# Keep only the columns used by the correlation and modelling cells.
model_columns = ['Open', 'Close', 'Sector', 'FedFundsRate', 'Treasury10Y',
                 'CPI', 'GDP', 'Unemployment', 'VIX']

# .copy() makes df_clean an independent frame. Later cells add 'Return' and
# 'target' columns to it; writing into a bare column selection of df_merged
# triggers pandas' SettingWithCopyWarning.
df_clean = df_merged[model_columns].copy()
df_clean.head()

In [None]:
sectors = df_clean['Sector'].unique()

# Only numeric series go into the correlation matrix. 'Sector' is the
# grouping key (a string label): with pandas >= 2.0 DataFrame.corr() raises
# on non-numeric columns, and older versions silently dropped it anyway.
corr_columns = ['Close', 'FedFundsRate', 'Treasury10Y', 'CPI', 'GDP',
                'Unemployment', 'VIX']

# Plot one correlation heatmap per sector
for sector in sectors:
    sector_df = df_clean[df_clean['Sector'] == sector]
    corr = sector_df[corr_columns].corr()

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title(f"Correlation Heatmap - Sector: {sector}")
    fig.tight_layout()
    plt.show()
    # Release the figure so a long sector list does not accumulate open figures.
    plt.close(fig)


In [None]:
# Open-to-close return for each row: (Close - Open) / Open.
# assign() returns a new frame instead of writing a column into df_clean in
# place, so the cell does not trigger SettingWithCopyWarning when df_clean is
# a column selection of another frame, and it can be re-run safely.
df_clean = df_clean.assign(
    Return=(df_clean['Close'] - df_clean['Open']) / df_clean['Open']
)
df_clean.head()

In [None]:
from sklearn.model_selection import train_test_split


features = ['FedFundsRate', 'Treasury10Y', 'CPI', 'GDP','Unemployment','VIX']

# Drop rows missing any feature or the return. The explicit .copy() gives
# df_model its own data so adding 'target' below does not raise
# SettingWithCopyWarning.
df_model = df_clean.dropna(subset=features + ['Return']).copy()

# Binary classification: Up (1) when the return is positive, otherwise Down (0)
df_model['target'] = (df_model['Return'] > 0).astype(int)

X = df_model[features]
y = df_model['target']

# shuffle=False keeps the current row order: the first 80% of rows train the
# model and the last 20% are held out.
# NOTE(review): the rows are still in the order of the price file, which in the
# preview is grouped by Symbol rather than sorted by Date -- confirm that this
# is the intended hold-out rather than a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [None]:
from xgboost import XGBClassifier

# Baseline classifier with the library's default hyperparameters, fitted on
# the training portion of the split.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out rows and print the classification report.
y_pred = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Label every row: 1 when the open-to-close return is positive, otherwise 0.
# assign() returns a new frame rather than writing into df_clean in place,
# which avoids SettingWithCopyWarning when df_clean is a column selection.
df_clean = df_clean.assign(target=(df_clean['Return'] > 0).astype(int))

sectors = df_clean['Sector'].dropna().unique()
feature_cols = ['FedFundsRate', 'Treasury10Y', 'CPI', 'GDP','Unemployment','VIX']


for sector in sectors:
    print(f"\n📊 Sector: {sector}")

    # Filter for sector and drop rows with missing inputs.
    # 'Return' is part of the subset because `NaN > 0` evaluates to False: a
    # row without a computable return would otherwise be labelled as a "down"
    # day. This matches the NaN handling of the all-sector model cell.
    df_sector = df_clean[df_clean['Sector'] == sector].dropna(
        subset=feature_cols + ['Return', 'target']
    )

    if len(df_sector) < 100:
        print("  Skipping — not enough data.")
        continue

    # Features and target
    X = df_sector[feature_cols]
    y = df_sector['target']

    # Train/test split; shuffle=False holds out the last 20% of rows in their
    # current order.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train XGBoost model
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Feature importance plot
    importance = model.feature_importances_
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.barh(feature_cols, importance)
    ax.set_title(f"Feature Importance — {sector}")
    ax.set_xlabel("Importance Score")
    fig.tight_layout()
    plt.show()
    # One figure per sector: close it so figures do not pile up across the loop.
    plt.close(fig)
