### Stock Price Forecasting
- Goal: Predict next-day stock price from past sequence data.
- Dataset: Yahoo Finance (https://pypi.org/project/yfinance/)
- Model: LSTM for regression
- Task Type: Regression
- Extension: Add multi-step forecasting; compare CNN vs. LSTM.

In [8]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt 
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.tensorboard import SummaryWriter
import yfinance as yf
from LSTM import SimpleLSTM

In [2]:
# Create the yfinance Ticker handle for GOOGL; later cells reuse `dat`.
dat = yf.Ticker("GOOGL")
# Last expression of the cell, so the notebook renders the `info` dictionary.
dat.info

{'address1': '1600 Amphitheatre Parkway',
 'city': 'Mountain View',
 'state': 'CA',
 'zip': '94043',
 'country': 'United States',
 'phone': '650-253-0000',
 'website': 'https://abc.xyz',
 'industry': 'Internet Content & Information',
 'industryKey': 'internet-content-information',
 'industryDisp': 'Internet Content & Information',
 'sector': 'Communication Services',
 'sectorKey': 'communication-services',
 'sectorDisp': 'Communication Services',
 'longBusinessSummary': 'Alphabet Inc. offers various products and platforms in the United States, Europe, the Middle East, Africa, the Asia-Pacific, Canada, and Latin America. It operates through Google Services, Google Cloud, and Other Bets segments. The Google Services segment provides products and services, including ads, Android, Chrome, devices, Gmail, Google Drive, Google Maps, Google Photos, Google Play, Search, and YouTube. It is also involved in the sale of apps and in-app purchases and digital content in the Google Play and YouTube;

In [3]:
# Print a short profile (name, market, sector) for every ticker in `symbols`.
symbols = ['GOOGL']
for symbol in symbols:
    stock = yf.Ticker(symbol)
    info = stock.info
    # Collect the report lines first, then emit them in a single write;
    # fields absent from `info` fall back to 'N/A'.
    summary_lines = [
        f"Symbol: {symbol}",
        f"Name: {info.get('shortName', 'N/A')}",
        f"Market: {info.get('market', 'N/A')}",
        f"Sector: {info.get('sector', 'N/A')}",
        "-" * 30,
    ]
    print("\n".join(summary_lines))

Symbol: GOOGL
Name: Alphabet Inc.
Market: us_market
Sector: Communication Services
------------------------------


In [4]:
# Look up fund-level data for the SPY ETF.
spy_ticker = yf.Ticker('SPY')
spy = spy_ticker.funds_data
# Evaluated but not rendered: a notebook only shows the final expression.
spy.description
# Final expression: the holdings table is what the cell displays.
spy.top_holdings

Unnamed: 0_level_0,Name,Holding Percent
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
NVDA,NVIDIA Corp,0.079597
MSFT,Microsoft Corp,0.067313
AAPL,Apple Inc,0.066069
AMZN,Amazon.com Inc,0.037257
META,Meta Platforms Inc Class A,0.027848
AVGO,Broadcom Inc,0.02713
GOOGL,Alphabet Inc Class A,0.024724
TSLA,Tesla Inc,0.021818
GOOG,Alphabet Inc Class C,0.019884
BRK-B,Berkshire Hathaway Inc Class B,0.016118


In [5]:
# Tour of the single-ticker API on the `dat` handle created earlier.
# Only the value of the final expression is rendered by the notebook; the
# accesses before it are evaluated and their results discarded.
dat.info
dat.calendar
dat.analyst_price_targets
dat.quarterly_income_stmt
dat.history(period='1mo')
# Call options for the first listed expiry, built step by step.
first_expiry = dat.options[0]
first_chain = dat.option_chain(first_expiry)
first_chain.calls

Unnamed: 0,contractSymbol,lastTradeDate,strike,lastPrice,bid,ask,change,percentChange,volume,openInterest,impliedVolatility,inTheMoney,contractSize,currency
0,GOOGL251107C00100000,2025-10-30 13:33:29+00:00,100.0,185.60,183.45,186.20,0.000000,0.000000,8.0,9,7.468751,True,REGULAR,USD
1,GOOGL251107C00120000,2025-11-05 17:47:03+00:00,120.0,164.85,162.95,166.65,0.000000,0.000000,1.0,4,5.984378,True,REGULAR,USD
2,GOOGL251107C00125000,2025-11-05 17:48:50+00:00,125.0,156.87,157.95,161.65,-3.029999,-1.894934,1.0,8,5.718753,True,REGULAR,USD
3,GOOGL251107C00130000,2025-11-06 16:10:20+00:00,130.0,152.90,153.50,156.20,-1.950012,-1.259291,17.0,1,5.875003,True,REGULAR,USD
4,GOOGL251107C00135000,2025-11-06 16:05:07+00:00,135.0,147.70,147.95,151.65,2.720001,1.876122,5.0,3,5.218753,True,REGULAR,USD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,GOOGL251107C00360000,2025-10-31 19:34:58+00:00,360.0,0.01,0.00,0.01,0.000000,0.000000,20.0,29,1.406253,False,REGULAR,USD
71,GOOGL251107C00365000,2025-11-04 14:30:02+00:00,365.0,0.02,0.00,0.01,0.000000,0.000000,1.0,215,1.500002,False,REGULAR,USD
72,GOOGL251107C00370000,2025-11-03 15:45:14+00:00,370.0,0.01,0.00,0.01,0.000000,0.000000,1.0,225,1.562502,False,REGULAR,USD
73,GOOGL251107C00390000,2025-10-30 17:21:44+00:00,390.0,0.01,0.00,0.01,0.000000,0.000000,,1,1.875001,False,REGULAR,USD


In [6]:
# Fetch daily price history for one ticker over a fixed date window and
# preview the first and last five rows as text tables.
from tabulate import tabulate

# Define the ticker in this cell. Previously the header below read `symbol`
# left over from a loop in an earlier cell while the Ticker was hard-coded
# separately, so the cell failed on its own and the two could drift apart.
symbol = 'GOOGL'
start = '2025-01-01'
end = '2025-11-06'

stock = yf.Ticker(symbol)
historical_data = stock.history(start=start, end=end)

# Global pandas display options: never truncate columns or wrap by width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print(f"Historical Data for {symbol} from {start} to {end}")

# Preview = first five rows followed by the last five rows.
formatted_data = pd.concat([historical_data.head(), historical_data.tail()])
print(tabulate(formatted_data, headers='keys', tablefmt='psql'))

# Same preview again, rendered with the 'grid' table format.
print("\nShowing only the first and last 5 rows of data:")
print(tabulate(formatted_data, headers='keys', tablefmt='grid'))

Historical Data for GOOGL from 2025-01-01 to 2025-11-06
+---------------------------+---------+---------+---------+---------+-------------+-------------+----------------+
| Date                      |    Open |    High |     Low |   Close |      Volume |   Dividends |   Stock Splits |
|---------------------------+---------+---------+---------+---------+-------------+-------------+----------------|
| 2025-01-02 00:00:00-05:00 | 190.03  | 191.376 | 186.891 | 188.814 | 2.03708e+07 |           0 |              0 |
| 2025-01-03 00:00:00-05:00 | 190.748 | 192.582 | 189.363 | 191.167 | 1.85962e+07 |           0 |              0 |
| 2025-01-06 00:00:00-05:00 | 193.35  | 197.576 | 193.22  | 196.23  | 2.95636e+07 |           0 |              0 |
| 2025-01-07 00:00:00-05:00 | 196.469 | 200.347 | 193.968 | 194.855 | 2.64872e+07 |           0 |              0 |
| 2025-01-08 00:00:00-05:00 | 191.944 | 195.652 | 191.755 | 193.32  | 2.48648e+07 |           0 |              0 |
| 2025-10-30 00:00:00-04

In [27]:
# Download the last two years of daily GOOGL prices (period='2y').
dat = yf.download('GOOGL', period='2y')
# yfinance may return two-level columns (price field, ticker) even for a single
# ticker. Written as-is, the CSV gets a multi-row header and cannot be read
# back with index_col='Date', so keep only the price-field level.
if isinstance(dat.columns, pd.MultiIndex):
    dat.columns = dat.columns.get_level_values(0)
# Save to file
dat.to_csv('GOOGL_2y.csv')



  dat = yf.download('GOOGL', period='2y')
[*********************100%***********************]  1 of 1 completed


In [7]:
# Load the saved price history, using the parsed 'Date' column as the index.
csv_file = 'GOOGL_2y.csv'
df = pd.read_csv(
    csv_file,
    parse_dates=True,
    index_col='Date',
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-11-07,130.072861,131.006424,128.990331,129.814647,29757300
2023-11-08,130.93692,131.304396,129.884183,130.072884,26425800
2023-11-09,129.347885,131.64206,129.179051,131.056105,23747800
2023-11-10,131.681763,131.890331,128.523553,129.208829,26913300
2023-11-13,131.185211,131.681786,130.350969,130.877337,18324800


# Reproducibility & Cấu hình
Tại sao phải đảm bảo Reproducibility?

Trong mô hình học máy (đặc biệt là LSTM, RNN), có nhiều thứ ngẫu nhiên:

| Thành phần | Gây biến động kết quả |
|---|---|
| Khởi tạo trọng số mô hình | trọng số random → mỗi lần học khác nhau |
| Shuffle dữ liệu trong DataLoader | thay đổi cách mô hình học |
| Các toán tử trên GPU | một số tính toán không cố định tuyệt đối |
| Dropout / data augmentation | tạo random trong quá trình training |

➡️ Nếu không cố định seed, bạn chạy hôm nay và chạy lại mai → kết quả khác nhau → không thể tin tưởng hay so sánh.

In [None]:
# Reproducibility: fix every RNG seed so repeated runs give the same results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# Seeding does not pin cuDNN's kernel choice, so also request deterministic
# kernels and disable the autotuner. Setting these is harmless on CPU-only runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Configuration
CSV_PATH = "GOOGL_2y.csv"
SEQ_LEN = 60  # window length: use the previous 60 days to predict the next day
BATCH_SIZE = 32
HIDDEN_SIZE = 64
NUM_LAYERS = 2
LR = 1e-3
EPOCHS = 50
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: ",DEVICE)

Device:  cpu


In [11]:
# Load the price data: reuse the cached CSV at CSV_PATH when it exists,
# otherwise download two years of GOOGL prices once and cache them.
if not os.path.exists(CSV_PATH):
    print("Không tìm thấy file sạch. Tải dữ liệu GOOGL 2 năm từ yfinance...")
    df_raw = yf.download("GOOGL", period="2y", progress=False)
    if df_raw.empty:
        # Fail here rather than cache an empty file that later cells would load.
        raise RuntimeError("yfinance returned no rows for GOOGL; nothing to cache")
    # Keep a copy of the untouched download.
    df_raw.to_csv("GOOGL_2y_raw.csv")
    # Normalise the layout so this branch matches what the cached branch reads
    # back: flat column names and an index called "Date". yfinance may return
    # two-level (price field, ticker) columns, which would otherwise produce a
    # multi-row CSV header that read_csv(index_col="Date") cannot parse.
    df = df_raw.copy()
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    df.columns.name = None
    df.index.name = "Date"
    df.to_csv(CSV_PATH)
    print("Đã tải và lưu:", CSV_PATH)
else:
    df = pd.read_csv(CSV_PATH, index_col="Date", parse_dates=True)
    print("Đã đọc file:", CSV_PATH)

# Ensure the expected price columns are present, whichever branch ran.
missing_cols = {"Open", "High", "Low", "Close", "Volume"} - set(df.columns)
if missing_cols:
    raise ValueError(f"{CSV_PATH} is missing expected columns: {sorted(missing_cols)}")

print("Dữ liệu tổng:", df.shape)
print(df.head())

Đã đọc file: GOOGL_2y.csv
Dữ liệu tổng: (501, 5)
                 Close        High         Low        Open    Volume
Date                                                                
2023-11-07  130.072861  131.006424  128.990331  129.814647  29757300
2023-11-08  130.936920  131.304396  129.884183  130.072884  26425800
2023-11-09  129.347885  131.642060  129.179051  131.056105  23747800
2023-11-10  131.681763  131.890331  128.523553  129.208829  26913300
2023-11-13  131.185211  131.681786  130.350969  130.877337  18324800


# Chuẩn hóa & tạo sequence

In [12]:
# Scale the Close series and cut it into sliding windows for the model.
# The scaler is fitted ONLY on the raw rows that belong to the train split.
close = df[['Close']].copy()

# Chronological 70/15/15 split, counted in sequence samples.
# Each sample needs SEQ_LEN earlier days, so there are n_total - SEQ_LEN samples.
n_total = len(close)
n_samples = n_total - SEQ_LEN
if n_samples <= 0:
    # Without this guard the failure surfaces later as an IndexError in reshape.
    raise ValueError(
        f"Need more than SEQ_LEN={SEQ_LEN} rows to build sequences, got {n_total}"
    )
train_samples = int(n_samples*0.70)
val_samples = int(n_samples * 0.15)
test_samples = n_samples - train_samples - val_samples

print(f"Total days: {n_total}, sequence samples: {n_samples}")
print(f"Train samples: {train_samples}, Val samples: {val_samples}, Test samples: {test_samples}")

# --- Fit the scaler on the train portion only (avoids data leakage) ---
# The train samples use raw rows [0, SEQ_LEN + train_samples): the first sample
# consumes the first SEQ_LEN days and the last train target sits at raw index
# SEQ_LEN + train_samples - 1.
train_raw_end = SEQ_LEN + train_samples  # exclusive end index into the raw array
print("train_raw end index (exclusive): ", train_raw_end)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(close.values[:train_raw_end])     # statistics from train rows only
scaled_all = scaler.transform(close.values)  # apply them to the whole series

# Sliding windows: sample k uses scaled days [k, k + SEQ_LEN) as input and day
# k + SEQ_LEN as the next-day target. The final window has no following day,
# so it is dropped.
series = scaled_all[:, 0]
windows = np.lib.stride_tricks.sliding_window_view(series, SEQ_LEN)
X = windows[:-1].copy()       # copy: the strided view is read-only
y = series[SEQ_LEN:].copy()

# reshape X -> (n_samples, seq_len, n_features)
X = X.reshape(X.shape[0], X.shape[1], 1)
assert X.shape[0] == y.shape[0] == n_samples

print("X shape:", X.shape, "y shape:", y.shape)

Total days: 501, sequence samples: 441
Train samples: 308, Val samples: 66, Test samples: 67
train_raw end index (exclusive):  368
X shape: (441, 60, 1) y shape: (441,)
