In [1]:
import requests
import pandas as pd
import numpy as np
import json
from datetime import datetime
import time
import os
from dotenv import load_dotenv
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from functools import reduce

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")



In [2]:
df_income = pd.read_csv('data/income_statement.csv')
df_balance = pd.read_csv('data/balance_sheet.csv')
df_cash_flow = pd.read_csv('data/cash_flow_statement.csv')   
df_earnings = pd.read_csv('data/earnings.csv')   
df_overview = pd.read_csv('data/company_overview.csv')

In [3]:
df_technical = pd.read_csv('data/technical_data.csv')

In [4]:
df_price = pd.read_csv('data/tech_stock_daily_price.csv')

In [5]:
df_income['fiscalDateEnding'] = pd.to_datetime(df_income['fiscalDateEnding'])
df_balance['fiscalDateEnding'] = pd.to_datetime(df_balance['fiscalDateEnding'])
df_cash_flow['fiscalDateEnding'] = pd.to_datetime(df_cash_flow['fiscalDateEnding'])
df_earnings['fiscalDateEnding'] = pd.to_datetime(df_earnings['fiscalDateEnding'])
df_technical['date'] = pd.to_datetime(df_technical['date'])
df_price['Date'] = pd.to_datetime(df_price['Date'])

In [6]:
df_earnings = df_earnings[df_earnings['fiscalDateEnding'] >= '2005-01-01']
df_technical = df_technical[df_technical['date']>='2005-01-01']
df_price = df_price[df_price['Date'] >= '2005-01-01']

In [7]:
df_income = df_income.sort_values(by=['symbol', 'fiscalDateEnding'])
df_balance = df_balance.sort_values(by=['symbol', 'fiscalDateEnding'])
df_cash_flow = df_cash_flow.sort_values(by=['symbol', 'fiscalDateEnding'])
df_earnings = df_earnings.sort_values(by=['symbol', 'fiscalDateEnding'])
df_technical = df_technical.sort_values(by=['ticker', 'date'])
df_price = df_price.sort_values(by=['ticker', 'Date'])

In [8]:
symbol= 'AAPL'  # Example symbol, replace with your desired stock symbol

In [10]:

income_data = df_income[df_income['symbol'] == symbol]
balance_data = df_balance[df_balance['symbol'] == symbol]
overview_data = df_overview[df_overview['symbol'] == symbol]
earnings_data = df_earnings[df_earnings['symbol'] == symbol]
cash_flow_data = df_cash_flow[df_cash_flow['symbol'] == symbol]
technical_data = df_technical[df_technical['ticker']==symbol]
price_data = df_price[df_price['ticker'] == symbol]

In [11]:
income_data = income_data.drop(columns=['symbol','reportedCurrency'])

In [12]:
all_nan_columns = income_data.columns[income_data.isna().all()].tolist()

In [13]:
income_data = income_data.drop(columns=all_nan_columns)

In [14]:
balance_data = balance_data.drop(columns=['symbol','reportedCurrency'])

In [15]:
all_nan_columns = balance_data.columns[balance_data.isna().all()].tolist()

In [16]:
balance_data = balance_data.drop(columns=all_nan_columns)

In [17]:
cash_flow_data = cash_flow_data[['fiscalDateEnding','operatingCashflow']]

In [18]:
earnings_data = earnings_data[['fiscalDateEnding','reportedEPS']]

In [27]:
earnings_data

Unnamed: 0,fiscalDateEnding,reportedEPS
80,2005-03-31,0.01
79,2005-06-30,0.01
78,2005-09-30,0.02
77,2005-12-31,0.02
76,2006-03-31,0.02
...,...,...
4,2024-03-31,1.53
3,2024-06-30,1.40
2,2024-09-30,0.97
1,2024-12-31,2.40


In [33]:
merged_df = reduce(lambda left, right: pd.merge(left, right, on='fiscalDateEnding'), [income_data, balance_data, cash_flow_data, earnings_data])

In [34]:
merged_df = merged_df.sort_values(by='fiscalDateEnding')

In [215]:
df = merged_df.merge(technical_data, left_on='fiscalDateEnding',right_on='date')

In [216]:
df = price_data[['Date','Close']].merge(df, right_on='fiscalDateEnding', left_on='Date', how='left')

In [217]:
df = df.fillna(method='ffill')

In [218]:
df['Return'] = df['Close'].pct_change().fillna(0)

In [219]:
df['Direction'] = df['Return'].apply(lambda x: 1 if x > 0 else 0)

In [220]:
df = df[df['Date']>='2005-04-01']

In [221]:
### Exclude Outliers
# df = df[(np.abs(df['Return'] - df['Return'].mean()) <= (3 * df['Return'].std()))]
df = df[df['Date']!='2014-06-09']
df = df[df['Date']!='2020-08-31']
df = df[df['Date']!='2020-09-01']


In [222]:
df = df.drop(columns=['Date', 'date','fiscalDateEnding','ticker','company_name'])

In [223]:
df = df.fillna(0)

In [255]:
df_X = df.drop(columns=['Close','Return','Direction'])

In [256]:
df_Y = df['Direction']

In [257]:
from sklearn.ensemble import RandomForestClassifier

In [258]:
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_Y, test_size=0.2, random_state=32
)

# 5. Initialize and fit the Random Forest model
rf = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    max_depth=None,    # Let trees expand until all leaves are pure
    random_state=42
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)



In [259]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.5274509803921569
              precision    recall  f1-score   support

           0       0.53      0.26      0.35       495
           1       0.53      0.78      0.63       525

    accuracy                           0.53      1020
   macro avg       0.53      0.52      0.49      1020
weighted avg       0.53      0.53      0.49      1020

Confusion Matrix:
 [[130 365]
 [117 408]]
