In [18]:
# Author Yash Raj
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [19]:
data = pd.read_csv('stock_data.csv', parse_dates=['Date'])
data.set_index('Date', inplace=True)
data.sort_index(inplace=True)

In [20]:
data[['Open', 'High', 'Low', 'Volume']] = data[['Open', 'High', 'Low', 'Volume']].fillna(method='ffill')
data[['Open', 'High', 'Low', 'Volume']] = data[['Open', 'High', 'Low', 'Volume']].fillna(method='bfill')


  data[['Open', 'High', 'Low', 'Volume']] = data[['Open', 'High', 'Low', 'Volume']].fillna(method='ffill')
  data[['Open', 'High', 'Low', 'Volume']] = data[['Open', 'High', 'Low', 'Volume']].fillna(method='bfill')


In [21]:
missing_dates = data[data['Close'].isnull()].index.sort_values()
df = data.copy()
lookback = 5

In [22]:
scaler = MinMaxScaler()
available_indices = df.index[df['Close'].notnull()]
scaler.fit(df.loc[available_indices, ['Open', 'High', 'Low', 'Volume', 'Close']])

In [23]:
scaled_data = pd.DataFrame(scaler.transform(df[['Open', 'High', 'Low', 'Volume', 'Close']]),
                           index=df.index,
                           columns=['Open', 'High', 'Low', 'Volume', 'Close'])
scaled_data['Close'] = np.where(df['Close'].isnull(), np.nan, scaled_data['Close'])


In [24]:
class StockDataset(Dataset):
    def __init__(self, df, scaled_data, lookback):
        self.indices = []
        self.X1 = []
        self.X2 = []
        self.Y = []

        available_indices = df.index[df['Close'].notnull()]
        for i in available_indices:
            loc = df.index.get_loc(i)
            if loc >= lookback:
                past_indices = df.index[loc - lookback:loc]
                if not df.loc[past_indices, 'Close'].isnull().any():
                    self.indices.append(i)
                    past_sequence = scaled_data.loc[past_indices, ['Open', 'High', 'Low', 'Volume', 'Close']].values
                    current_features = scaled_data.loc[i, ['Open', 'High', 'Low', 'Volume']].values
                    target = scaled_data.loc[i, 'Close']
                    self.X1.append(past_sequence)
                    self.X2.append(current_features)
                    self.Y.append(target)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.X1[idx], dtype=torch.float32),
            torch.tensor(self.X2[idx], dtype=torch.float32),
            torch.tensor(self.Y[idx], dtype=torch.float32)
        )


In [25]:
dataset = StockDataset(df, scaled_data, lookback)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [26]:
class HybridLSTM(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=5, hidden_size=64, batch_first=True)
        self.dense_branch = nn.Sequential(
            nn.Linear(4, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU()
        )
        self.combined = nn.Sequential(
            nn.Linear(64 + 16, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x_seq, x_curr):
        lstm_out, _ = self.lstm(x_seq)
        lstm_out = lstm_out[:, -1, :]  # Take last time step output
        dense_out = self.dense_branch(x_curr)
        combined = torch.cat((lstm_out, dense_out), dim=1)
        return self.combined(combined)

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridLSTM().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [37]:
model.train()
for epoch in range(100):
    total_loss = 0
    for x1_batch, x2_batch, y_batch in dataloader:
        x1_batch, x2_batch, y_batch = x1_batch.to(device), x2_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(x1_batch, x2_batch)
        loss = criterion(outputs.squeeze(), y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}/100, Loss: {total_loss/len(dataloader):.6f}')


Epoch 1/100, Loss: 0.000197
Epoch 2/100, Loss: 0.000167
Epoch 3/100, Loss: 0.000210
Epoch 4/100, Loss: 0.000167
Epoch 5/100, Loss: 0.000218
Epoch 6/100, Loss: 0.000356
Epoch 7/100, Loss: 0.000152
Epoch 8/100, Loss: 0.000163
Epoch 9/100, Loss: 0.000146
Epoch 10/100, Loss: 0.000249
Epoch 11/100, Loss: 0.000210
Epoch 12/100, Loss: 0.000153
Epoch 13/100, Loss: 0.000147
Epoch 14/100, Loss: 0.000215
Epoch 15/100, Loss: 0.000140
Epoch 16/100, Loss: 0.000136
Epoch 17/100, Loss: 0.000137
Epoch 18/100, Loss: 0.000139
Epoch 19/100, Loss: 0.000225
Epoch 20/100, Loss: 0.000136
Epoch 21/100, Loss: 0.000143
Epoch 22/100, Loss: 0.000130
Epoch 23/100, Loss: 0.000129
Epoch 24/100, Loss: 0.000205
Epoch 25/100, Loss: 0.000156
Epoch 26/100, Loss: 0.000190
Epoch 27/100, Loss: 0.000178
Epoch 28/100, Loss: 0.000139
Epoch 29/100, Loss: 0.000124
Epoch 30/100, Loss: 0.000140
Epoch 31/100, Loss: 0.000161
Epoch 32/100, Loss: 0.000135
Epoch 33/100, Loss: 0.000137
Epoch 34/100, Loss: 0.000139
Epoch 35/100, Loss: 0.0

In [38]:
model.eval()
scaled_df_pred = scaled_data.copy()
predictions = []

In [39]:
with torch.no_grad():
    for date in missing_dates:
        loc = df.index.get_loc(date)
        if loc < lookback:
            continue

        past_sequence = scaled_df_pred.iloc[loc - lookback:loc][['Open', 'High', 'Low', 'Volume', 'Close']].values
        current_features = scaled_df_pred.loc[date, ['Open', 'High', 'Low', 'Volume']].values
        if np.isnan(past_sequence).any():
            past_sequence = pd.DataFrame(past_sequence).fillna(method='ffill').fillna(method='bfill').values


        x_seq = torch.tensor(past_sequence, dtype=torch.float32).unsqueeze(0).to(device)
        x_curr = torch.tensor(current_features, dtype=torch.float32).unsqueeze(0).to(device)

        scaled_close = model(x_seq, x_curr).item()


        scaled_df_pred.loc[date, 'Close'] = scaled_close


        scaled_row = np.append(current_features, scaled_close)
        actual_close = scaler.inverse_transform([scaled_row])[0][-1]
        predictions.append((date, actual_close))

In [40]:
submission = pd.DataFrame(predictions, columns=['Date', 'Close'])
print(submission)

         Date       Close
0  2024-01-06  329.652423
1  2024-01-09  349.270743
2  2024-02-21  320.952833
3  2024-03-03  311.548908
4  2024-03-06  316.360299
5  2024-03-20  303.280108
6  2024-03-29  299.238715
7  2024-04-08  294.162173
8  2024-04-09  297.827902
9  2024-04-18  291.379943
10 2024-04-26  315.549539
11 2024-05-01  309.497104
12 2024-05-03  313.482450
13 2024-05-17  313.013314
14 2024-05-19  315.202244
15 2024-05-28  342.500368
16 2024-06-16  357.833848
17 2024-06-25  353.648309
18 2024-06-26  359.771458
