In [290]:
import requests
import pandas as pd
import numpy as np
import json
from datetime import datetime
import time
import os
from dotenv import load_dotenv
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from functools import reduce

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")



In [366]:
df_income = pd.read_csv('../data/income_statement.csv')
df_balance = pd.read_csv('../data/balance_sheet.csv')
df_cash_flow = pd.read_csv('../data/cash_flow_statement.csv')   
df_earnings = pd.read_csv('../data/earnings.csv')   
df_overview = pd.read_csv('../data/company_overview.csv')

In [367]:
df_technical = pd.read_csv('../data/technical_data.csv')

In [368]:
df_price = pd.read_csv('../data/tech_stock_daily_price.csv')

In [369]:
df_sentiment = pd.read_csv('../data/daily_sentiment_score.csv')

In [370]:
df_income['fiscalDateEnding'] = pd.to_datetime(df_income['fiscalDateEnding'])
df_balance['fiscalDateEnding'] = pd.to_datetime(df_balance['fiscalDateEnding'])
df_cash_flow['fiscalDateEnding'] = pd.to_datetime(df_cash_flow['fiscalDateEnding'])
df_earnings['fiscalDateEnding'] = pd.to_datetime(df_earnings['fiscalDateEnding'])
df_technical['date'] = pd.to_datetime(df_technical['date'])
df_price['Date'] = pd.to_datetime(df_price['Date'])
df_sentiment['date'] = pd.to_datetime(df_sentiment['date'])


In [371]:
start_date = '2023-12-29'
end_date = '2025-06-29'

In [372]:
df_income = df_income[df_income['fiscalDateEnding'] >= start_date]
df_balance = df_balance[df_balance['fiscalDateEnding'] >= start_date]
df_cash_flow = df_cash_flow[df_cash_flow['fiscalDateEnding']>= start_date]

df_earnings = df_earnings[df_earnings['fiscalDateEnding'] >= start_date]
df_technical = df_technical[(df_technical['date']>=start_date) & (df_technical['date'] < end_date)]
df_price = df_price[(df_price['Date'] >= start_date)&(df_price['Date'] <end_date)]
df_sentiment = df_sentiment[(df_sentiment['date'] >= start_date)&(df_sentiment['date'] < end_date)]

In [373]:
print(df_technical['date'].min())
print(df_price['Date'].min())
print(df_sentiment['date'].min())
print(df_earnings['fiscalDateEnding'].min())
print(df_income['fiscalDateEnding'].min())
print(df_balance['fiscalDateEnding'].min())
print(df_cash_flow['fiscalDateEnding'].min())

2023-12-29 00:00:00
2023-12-29 00:00:00
2024-01-01 00:00:00
2023-12-31 00:00:00
2023-12-31 00:00:00
2023-12-31 00:00:00
2023-12-31 00:00:00


In [374]:
df_income = df_income.sort_values(by=['symbol', 'fiscalDateEnding'])
df_balance = df_balance.sort_values(by=['symbol', 'fiscalDateEnding'])
df_cash_flow = df_cash_flow.sort_values(by=['symbol', 'fiscalDateEnding'])
df_earnings = df_earnings.sort_values(by=['symbol', 'fiscalDateEnding'])
df_technical = df_technical.sort_values(by=['ticker', 'date'])
df_price = df_price.sort_values(by=['ticker', 'Date'])
df_sentiment = df_sentiment.sort_values(by=['ticker', 'date'])

In [375]:
symbol='AAPL'

In [477]:

income_data = df_income[df_income['symbol'] == symbol]
balance_data = df_balance[df_balance['symbol'] == symbol]
overview_data = df_overview[df_overview['symbol'] == symbol]
earnings_data = df_earnings[df_earnings['symbol'] == symbol]
cash_flow_data = df_cash_flow[df_cash_flow['symbol'] == symbol]
technical_data = df_technical[df_technical['ticker']==symbol]
price_data = df_price[df_price['ticker'] == symbol]
sentiment_data = df_sentiment[df_sentiment['ticker'] == symbol]

In [478]:
income_data = income_data.drop(columns=['symbol','reportedCurrency'])

In [479]:
all_nan_columns = income_data.columns[income_data.isna().all()].tolist()

In [480]:
income_data = income_data.drop(columns=all_nan_columns)

In [481]:
balance_data = balance_data.drop(columns=['symbol','reportedCurrency'])

In [482]:
all_nan_columns = balance_data.columns[balance_data.isna().all()].tolist()

In [483]:
balance_data = balance_data.drop(columns=all_nan_columns)

In [484]:
cash_flow_data = cash_flow_data[['fiscalDateEnding','operatingCashflow']]

In [485]:
earnings_data = earnings_data[['fiscalDateEnding','reportedEPS']]

In [486]:
merged_earnings_df = reduce(lambda left, right: pd.merge(left, right, on='fiscalDateEnding'), [income_data, balance_data, cash_flow_data, earnings_data])

In [487]:
merged_earnings_df = merged_earnings_df.sort_values(by='fiscalDateEnding')

### Combining dfs

In [488]:
all_dates = pd.date_range(start=start_date, end=end_date, freq='D')


In [489]:
df_dates = pd.DataFrame(all_dates,columns=['all_date'])

In [537]:
df = df_dates.merge(price_data[['Date','Close']], left_on='all_date',right_on='Date', how='left')

In [538]:
df = df.merge(merged_earnings_df, left_on='all_date', right_on='fiscalDateEnding', how='left')

In [539]:
df[merged_earnings_df.columns] = df[merged_earnings_df.columns].fillna(method='ffill')

In [540]:
df = df.merge(sentiment_data[['date','sentiment_score']], left_on='all_date', right_on='date', how='left')

In [541]:
df['sentiment_score'] = df['sentiment_score'].fillna(0)

In [542]:
technical_data = technical_data.drop(columns=['ticker','company_name'])

KeyError: "['ticker', 'company_name'] not found in axis"

In [543]:
df = df.merge(technical_data, left_on='all_date', right_on='date', how='left')

In [544]:
df = df[df['all_date']>='2024-01-02']

In [545]:
df = df.drop(columns=['all_date','Date','fiscalDateEnding','date_x','date_y'])

In [546]:
df = df[~(df['Close'].isna())]

In [547]:
df['Return'] = df['Close'].pct_change().fillna(0)

In [548]:
lags = [1, 2, 3, 5, 10, 20]
for lag in lags:
    df[f'return_lag{lag}'] = df['Return'].shift(lag)

In [549]:
df = df[20:]

In [550]:
df['Direction'] = df['Return'].apply(lambda x: 1 if x > 0 else 0)

In [551]:
### Exclude Outliers
df = df[(np.abs(df['Return'] - df['Return'].mean()) <= (3 * df['Return'].std()))]
# df = df[df['Date']!='2014-06-09']
# df = df[df['Date']!='2020-08-31']
# df = df[df['Date']!='2020-09-01']


In [552]:
df = df.fillna(method='ffill')

In [553]:
df_X = df.drop(columns=['Close','Return','Direction'])

In [555]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_X = scaler.fit_transform(df_X[df_X.columns])


In [556]:
df_Y = df['Direction']

In [557]:
from sklearn.ensemble import RandomForestClassifier

In [558]:
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_Y, test_size=0.2, random_state=32
)

# 5. Initialize and fit the Random Forest model
rf = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    max_depth=None,    # Let trees expand until all leaves are pure
    random_state=42
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)



In [559]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.6714285714285714
              precision    recall  f1-score   support

           0       0.69      0.63      0.66        35
           1       0.66      0.71      0.68        35

    accuracy                           0.67        70
   macro avg       0.67      0.67      0.67        70
weighted avg       0.67      0.67      0.67        70

Confusion Matrix:
 [[22 13]
 [10 25]]
