In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
from google.colab import files
uploaded = files.upload()

Saving stock_data.csv to stock_data.csv


In [3]:
df=pd.read_csv('stock_data.csv')

In [4]:


# Show everything
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Show your DataFrame
df  # or just df


Unnamed: 0,Close,High,Low,Open,Volume,Date
0,330.967834,333.926313,330.400002,333.258269,1478688.0,2024-01-01
1,332.165466,334.785146,330.51444,330.51444,,2024-01-02
2,340.239258,341.1793,329.655551,331.635823,10046344.0,2024-01-03
3,335.729919,341.704144,331.025005,341.188806,11098328.0,2024-01-04
4,329.932312,336.493453,328.935023,335.839738,7713688.0,2024-01-05
5,,331.635762,326.482286,330.299676,8990392.0,2024-01-06
6,329.326355,332.303916,326.181784,328.295655,12590808.0,2024-01-07
7,338.91275,341.179326,329.05911,333.06737,30258176.0,2024-01-08
8,,350.72278,341.847347,341.847347,17119912.0,2024-01-09
9,351.844177,352.631514,347.425542,347.859775,7425832.0,2024-01-10


In [5]:

close_nan_positions = df[df['Close'].isna()].index.tolist()


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   177 non-null    float64
 1   High    187 non-null    float64
 2   Low     187 non-null    float64
 3   Open    187 non-null    float64
 4   Volume  187 non-null    float64
 5   Date    196 non-null    object 
dtypes: float64(5), object(1)
memory usage: 9.3+ KB


In [7]:
df.describe()

Unnamed: 0,Close,High,Low,Open,Volume
count,177.0,187.0,187.0,187.0,187.0
mean,332.245639,334.304271,327.62439,330.909857,9329508.0
std,24.696451,24.452085,24.204427,23.850239,6462652.0
min,282.932861,286.314085,278.627643,284.432385,0.0
25%,311.066559,313.939915,308.280569,310.6699,5600116.0
50%,335.729919,332.031376,326.181784,330.299676,7548064.0
75%,353.213684,356.32422,348.968663,352.676836,10761510.0
max,381.424927,385.531779,379.078827,383.596848,45893060.0


In [8]:
df['Low'] = df['Low'].fillna(df['Low'].min())


In [9]:
df['High'] = df['High'].fillna(df['High'].max())


In [10]:
new_df =pd.DataFrame ({
    'Close_normalized' : (df['Close']-df['Low'])/(df['High']-df['Low']),
    'Open_normalized' : (df['Open']-df['Low'])/(df['High']-df['Low']),
    'Volume_normalized' : (df['Volume']-df['Volume'].min())/(df['Volume'].max()-df['Volume'].min())
    })

In [11]:
new_df

Unnamed: 0,Close_normalized,Open_normalized,Volume_normalized
0,0.161027,0.810555,0.03222
1,0.386593,0.0,
2,0.918426,0.171843,0.218908
3,0.440571,0.951743,0.24183
4,0.131944,0.913512,0.16808
5,,0.740741,0.195899
6,0.51364,0.345283,0.274351
7,0.812992,0.330709,0.659319
8,,0.0,0.373039
9,0.848763,0.083411,0.161807


In [12]:
nan_mask = new_df.isna()
df_filled = new_df.fillna(0)

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    def __init__(self, data, mask, window_size):
        self.data = data
        self.mask = mask
        self.window_size = window_size
        self.samples = []

        for i in range(len(data) - window_size):
            window = data[i:i+window_size].values
            mask_win = mask[i:i+window_size].values
            self.samples.append((window, mask_win))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x, mask = self.samples[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(mask, dtype=torch.float32)


In [14]:
# Loading Datset
dataset = TimeSeriesDataset(df_filled, nan_mask, window_size=10)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [15]:
import torch.nn as nn

class LSTMImputer(nn.Module):
    def __init__(self, input_size, hidden_size=16, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.output = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.output(out)
        return out


In [16]:
# Loss and optimizer
model = LSTMImputer(input_size=3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [17]:
for epoch in range(1000):
    for batch_x, batch_mask in loader:
        batch_x, batch_mask = batch_x.to(device), batch_mask.to(device)
        pred = model(batch_x)
        loss = criterion(pred[batch_mask == 0], batch_x[batch_mask == 0])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    if epoch % 100 == 99:
      print(f"Epoch {epoch}: Loss = {loss.item():.4f}")


Epoch 99: Loss = 0.0023
Epoch 199: Loss = 0.0004
Epoch 299: Loss = 0.0002
Epoch 399: Loss = 0.0001
Epoch 499: Loss = 0.0001
Epoch 599: Loss = 0.0000
Epoch 699: Loss = 0.0000
Epoch 799: Loss = 0.0000
Epoch 899: Loss = 0.0000
Epoch 999: Loss = 0.0000


In [18]:
import torch

# Convert the entire filled sequence to tensor
full_input = torch.tensor(df_filled.values, dtype=torch.float32).unsqueeze(0)  # shape: (1, T, features)
full_input = full_input.to(device)


In [19]:
model.eval()
with torch.no_grad():
    predicted_sequence = model(full_input)  # shape: (1, T, features)
    predicted_sequence = predicted_sequence.squeeze(0)
    predicted_sequence = predicted_sequence.cpu().numpy()


In [20]:
# Make a copy of the DataFrame
imputed_df = df_filled.copy()

# Loop over each column
for i, col in enumerate(imputed_df.columns):
    # Replace missing values (according to nan_mask) with predicted values
    imputed_df.loc[nan_mask[col], col] = predicted_sequence[nan_mask[col].values, i]


In [21]:
imputed_df

Unnamed: 0,Close_normalized,Open_normalized,Volume_normalized
0,0.161027,0.810555,0.03222
1,0.386593,0.0,-0.000427
2,0.918426,0.171843,0.218908
3,0.440571,0.951743,0.24183
4,0.131944,0.913512,0.16808
5,-0.004794,0.740741,0.195899
6,0.51364,0.345283,0.274351
7,0.812992,0.330709,0.659319
8,0.014672,0.0,0.373039
9,0.848763,0.083411,0.161807


In [22]:
predicted_df = pd.DataFrame({
    'Date' : df['Date'],
    'Close' : (imputed_df['Close_normalized']*(df['High']-df['Low'])+df['Low']),
})

In [23]:
predicted_df

Unnamed: 0,Date,Close
0,2024-01-01,330.967834
1,2024-01-02,332.165466
2,2024-01-03,340.239258
3,2024-01-04,335.729919
4,2024-01-05,329.932312
5,2024-01-06,326.457579
6,2024-01-07,329.326355
7,2024-01-08,338.91275
8,2024-01-09,341.977568
9,2024-01-10,351.844177
