# Import Libraries

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, IterableDataset
from tqdm import tqdm


# Read Data and Build Autoencoder Model

### EPOCH: 50

In [None]:
# 🔹 Load the embeddings from the .npz archive
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code; keep it only while the archive is a trusted source.
# The context manager closes the underlying archive once the array is read.
with np.load("/kaggle/input/finbert-a-embeddings-sector/finbert_embeddings_all_sectors.npz", allow_pickle=True) as data:
    embeddings = data["Embedding"]  # expected shape: (94032, 25, 768) per the original note, not verified here
print(embeddings.shape)  # sanity-check the loaded shape

# 🔹 Dataset Custom (IterableDataset)
class EmbeddingDataset(IterableDataset):
    """Stream pre-computed embeddings one sample at a time as float32 tensors.

    Args:
        embeddings: Collection of array-likes that is iterated along its
            first axis; each item becomes one sample.
    """

    def __init__(self, embeddings):
        super().__init__()
        self.embeddings = embeddings

    def __len__(self):
        """Return the number of samples so DataLoader/tqdm can report a total."""
        return len(self.embeddings)

    def __iter__(self):
        """Yield each sample converted to a ``torch.float32`` tensor."""
        # Convert lazily, one sample per step, rather than turning the whole
        # collection into tensors up front.
        for emb in self.embeddings:
            yield torch.tensor(emb, dtype=torch.float32)

# 🔹 Wrap the embeddings in a DataLoader
# An IterableDataset is consumed in iteration order, so shuffling stays off.
dataset = EmbeddingDataset(embeddings)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,
)

# 🔹 Model Autoencoder
class Seq2SeqAutoencoder(nn.Module):
    """LSTM sequence-to-sequence autoencoder.

    The encoder LSTM consumes the input sequence; its final hidden and cell
    states seed the decoder LSTM, which regenerates the sequence step by step,
    starting from an all-zero input and feeding each prediction back in.

    Args:
        input_dim: Feature size of every time step.
        hidden_dim: Hidden size of both LSTMs.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        dropout: Dropout probability passed to both LSTMs.
    """

    def __init__(self, input_dim=768, hidden_dim=768, num_layers=4, dropout=0.3):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Encoder
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Decoder
        # The decoder is fed its own projected predictions, which live in
        # input_dim space, so its input size must be input_dim. With the
        # default input_dim == hidden_dim the parameter shapes (and therefore
        # previously saved state_dicts) are unchanged.
        self.decoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, input_dim)  # project back to the original feature size

    def forward(self, x):
        """Encode ``x`` and reconstruct it autoregressively.

        Args:
            x: Tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            Tuple ``(outputs, hidden)`` where ``outputs`` has shape
            (batch_size, seq_len, input_dim) and ``hidden`` has shape
            (num_layers, batch_size, hidden_dim).
        """
        batch_size, seq_len, _ = x.shape

        # Encoder: only the final (hidden, cell) states are kept.
        _, (hidden, cell) = self.encoder(x)

        # Decoder: the first input is a zero vector on x's device and dtype.
        decoder_input = x.new_zeros(batch_size, 1, self.input_dim)
        outputs = []

        for _ in range(seq_len):
            out, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            pred = self.fc_out(out)
            outputs.append(pred)
            decoder_input = pred  # feed the prediction back as the next input

        outputs = torch.cat(outputs, dim=1)  # [batch_size, seq_len, input_dim]
        # NOTE(review): `hidden` is reassigned inside the decoding loop, so the
        # state returned as the "thought vector" is the decoder's final hidden
        # state, not the encoder's. Kept as is so existing outputs stay the same.
        return outputs, hidden

# 🔹 Initialise the model, optimizer and loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2SeqAutoencoder().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()  # reconstruction error between decoder output and input

# 🔹 Training loop
epochs = 50

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        batch = batch.to(device)  # move to the GPU when one is available
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch (as the resume cells do) so an
    # interrupted session does not lose all progress.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Update Dataset

In [None]:
# 🔹 Load the embeddings from the .npz archive
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code; keep it only while the archive is a trusted source.
# The context manager closes the underlying archive once the array is read.
with np.load("/kaggle/input/finbert-embeddings-sectoral/finbert_embeddings_all_sectors.npz", allow_pickle=True) as data:
    embeddings = data["Embedding"]  # expected shape: (94032, 25, 768) per the original note, not verified here
print(embeddings.shape)  # sanity-check the loaded shape

# 🔹 Dataset Custom (IterableDataset)
class EmbeddingDataset(IterableDataset):
    """Stream pre-computed embeddings one sample at a time as float32 tensors.

    Args:
        embeddings: Collection of array-likes that is iterated along its
            first axis; each item becomes one sample.
    """

    def __init__(self, embeddings):
        super().__init__()
        self.embeddings = embeddings

    def __len__(self):
        """Return the number of samples so DataLoader/tqdm can report a total."""
        return len(self.embeddings)

    def __iter__(self):
        """Yield each sample converted to a ``torch.float32`` tensor."""
        # Convert lazily, one sample per step, rather than turning the whole
        # collection into tensors up front.
        for emb in self.embeddings:
            yield torch.tensor(emb, dtype=torch.float32)

# 🔹 Wrap the embeddings in a DataLoader
# An IterableDataset is consumed in iteration order, so shuffling stays off.
dataset = EmbeddingDataset(embeddings)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,
)

# 🔹 Model Autoencoder
class Seq2SeqAutoencoder(nn.Module):
    """LSTM sequence-to-sequence autoencoder.

    The encoder LSTM consumes the input sequence; its final hidden and cell
    states seed the decoder LSTM, which regenerates the sequence step by step,
    starting from an all-zero input and feeding each prediction back in.

    Args:
        input_dim: Feature size of every time step.
        hidden_dim: Hidden size of both LSTMs.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        dropout: Dropout probability passed to both LSTMs.
    """

    def __init__(self, input_dim=768, hidden_dim=768, num_layers=4, dropout=0.3):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Encoder
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Decoder
        # The decoder is fed its own projected predictions, which live in
        # input_dim space, so its input size must be input_dim. With the
        # default input_dim == hidden_dim the parameter shapes (and therefore
        # previously saved state_dicts) are unchanged.
        self.decoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, input_dim)  # project back to the original feature size

    def forward(self, x):
        """Encode ``x`` and reconstruct it autoregressively.

        Args:
            x: Tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            Tuple ``(outputs, hidden)`` where ``outputs`` has shape
            (batch_size, seq_len, input_dim) and ``hidden`` has shape
            (num_layers, batch_size, hidden_dim).
        """
        batch_size, seq_len, _ = x.shape

        # Encoder: only the final (hidden, cell) states are kept.
        _, (hidden, cell) = self.encoder(x)

        # Decoder: the first input is a zero vector on x's device and dtype.
        decoder_input = x.new_zeros(batch_size, 1, self.input_dim)
        outputs = []

        for _ in range(seq_len):
            out, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            pred = self.fc_out(out)
            outputs.append(pred)
            decoder_input = pred  # feed the prediction back as the next input

        outputs = torch.cat(outputs, dim=1)  # [batch_size, seq_len, input_dim]
        # NOTE(review): `hidden` is reassigned inside the decoding loop, so the
        # state returned as the "thought vector" is the decoder's final hidden
        # state, not the encoder's. Kept as is so existing outputs stay the same.
        return outputs, hidden

# 🔹 Set up the model, optimizer and loss
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = Seq2SeqAutoencoder()
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Melanjutkan training. EPOCH: 50-60

In [None]:
# Resume training
epochs = 50
additional_epochs = 20  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/input/finbert_autoencoder_lstm.pth/pytorch/default/1/finbert_autoencoder_lstm.pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Melanjutkan training. EPOCH: 59 - 70

In [None]:
# Resume training
epochs = 58
additional_epochs = 12  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/working/finbert_autoencoder_lstm.pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Melanjutkan training. EPOCH: 71-90

In [None]:
# Resume training
epochs = 70
additional_epochs = 20  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/working/finbert_autoencoder_lstm.pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Melanjutkan training. EPOCH: 91 - 150

In [None]:
# Resume training
epochs = 90
additional_epochs = 60  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/input/finbert-seq2seq/pytorch/default/1/finbert_autoencoder.pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Melanjutkan training. EPOCH: 150 - 180

In [None]:
# Resume training
epochs = 150
additional_epochs = 30  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/input/autoencoder/pytorch/default/1/finbert_autoencoder_lstm (1).pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Melanjutkan training. EPOCH: 166 - 180

In [None]:
# Resume training
epochs = 166
additional_epochs = 14  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/working/finbert_autoencoder_lstm.pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Melanjutkan training. EPOCH: 180-200

In [None]:
# Resume training
epochs = 180
additional_epochs = 20  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/working/finbert_autoencoder_lstm.pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Melanjutkan training. EPOCH: 200 - 205

In [None]:
# Resume training
epochs = 200
additional_epochs = 5  # number of extra epochs to run
total_epochs = epochs + additional_epochs  # epoch count once this run finishes

# Load the previously saved weights.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only session.
# NOTE: the checkpoint holds only model weights; optimizer state is not saved or restored.
model.load_state_dict(torch.load("/kaggle/working/finbert_autoencoder_lstm.pth", map_location=device, weights_only=True))
print("Model loaded. Melanjutkan training...")

for epoch in range(epochs, total_epochs):
    model.train()
    epoch_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()

        output, hidden = model(batch)
        loss = criterion(output, batch)  # the input itself is the target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / max(batch_count, 1):.6f}")

    # Save the weights after every epoch so an interrupted session can resume.
    torch.save(model.state_dict(), "finbert_autoencoder_lstm.pth")

# Inference

### Load Data

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, IterableDataset
import numpy as np
from tqdm import tqdm

# 2️⃣ Load embedding dari file NPZ
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code; keep it only while the archive is a trusted source.
# The context manager closes the underlying archive once the arrays are read.
with np.load("/kaggle/input/finbert-embeddings-sectoral/finbert_embeddings_all_sectors.npz", allow_pickle=True) as df:
    # Despite its name, `df` is the opened .npz archive, not a DataFrame;
    # pull out each stored array by key.
    dates = df["Only_Date"]
    titles = df["Title_Translated"]
    sectors = df["Sector"]
    embeddings = df["Embedding"]

### Initiate autoencoder model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, IterableDataset
import numpy as np
from tqdm import tqdm

class Seq2SeqAutoencoder(nn.Module):
    """LSTM sequence-to-sequence autoencoder.

    The encoder LSTM consumes the input sequence; its final hidden and cell
    states seed the decoder LSTM, which regenerates the sequence step by step,
    starting from an all-zero input and feeding each prediction back in.

    Args:
        input_dim: Feature size of every time step.
        hidden_dim: Hidden size of both LSTMs.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        dropout: Dropout probability passed to both LSTMs.
    """

    def __init__(self, input_dim=768, hidden_dim=768, num_layers=4, dropout=0.3):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Encoder
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Decoder
        # The decoder is fed its own projected predictions, which live in
        # input_dim space, so its input size must be input_dim. With the
        # default input_dim == hidden_dim the parameter shapes (and therefore
        # previously saved state_dicts) are unchanged.
        self.decoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, input_dim)  # project back to the original feature size

    def forward(self, x):
        """Encode ``x`` and reconstruct it autoregressively.

        Args:
            x: Tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            Tuple ``(outputs, hidden)`` where ``outputs`` has shape
            (batch_size, seq_len, input_dim) and ``hidden`` has shape
            (num_layers, batch_size, hidden_dim).
        """
        batch_size, seq_len, _ = x.shape

        # Encoder: only the final (hidden, cell) states are kept.
        _, (hidden, cell) = self.encoder(x)

        # Decoder: the first input is a zero vector on x's device and dtype.
        decoder_input = x.new_zeros(batch_size, 1, self.input_dim)
        outputs = []

        for _ in range(seq_len):
            out, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            pred = self.fc_out(out)
            outputs.append(pred)
            decoder_input = pred  # feed the prediction back as the next input

        outputs = torch.cat(outputs, dim=1)  # [batch_size, seq_len, input_dim]
        # NOTE(review): `hidden` is reassigned inside the decoding loop, so the
        # state returned as the "thought vector" is the decoder's final hidden
        # state, not the encoder's. Kept as is so existing outputs stay the same.
        return outputs, hidden

# 🔹 Set up the model, optimizer and loss
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = Seq2SeqAutoencoder()
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Transform embedding to thought vector

model location: /kaggle/input/autoencoder-final/pytorch/default/1/finbert_autoencoder_lstm (3).pth

In [None]:
import torch
import numpy as np
import pandas as pd


# Load the trained model and move it to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2SeqAutoencoder()
# weights_only=True (already used by the training cells) keeps torch.load from
# unpickling arbitrary objects out of the checkpoint file.
model.load_state_dict(torch.load("/kaggle/input/autoencoder-final/pytorch/default/1/finbert_autoencoder_lstm (3).pth", map_location=device, weights_only=True))
model.to(device)
model.eval()  # inference mode: disables dropout


# Run inference in batches (avoids running out of memory)
BATCH_SIZE = 256
thought_vectors_list = []

with torch.no_grad():
    for i in range(0, len(embeddings), BATCH_SIZE):
        batch_embeddings = torch.tensor(embeddings[i:i+BATCH_SIZE], dtype=torch.float32).to(device)

        # Keep only the hidden state returned by the model
        _, batch_thought_vectors = model(batch_embeddings)
        # Move the batch axis first: (num_layers, batch, hidden) -> (batch, num_layers, hidden),
        # i.e. (batch_size, 4, 768) with the model defaults
        batch_thought_vectors = batch_thought_vectors.permute(1, 0, 2).cpu().numpy()
        thought_vectors_list.append(batch_thought_vectors)


# Concatenate all batches along the sample axis
thought_vectors = np.concatenate(thought_vectors_list, axis=0)

# Store one thought-vector array per row next to its metadata
df_thought = pd.DataFrame({
    "date": dates,
    "title": titles,
    "sector": sectors,
    "thought_vector": list(thought_vectors)
})

print("Jumlah Data:", len(df_thought))
df_thought.to_pickle("thought_vectors_finbert_sectoral.pkl")
print("✅ Inferensi selesai! Thought vectors disimpan dalam dataframe.")


In [None]:
# Inspect the shape of the thought vector stored in the row labelled 94000.
df_thought.loc[94000, 'thought_vector'].shape

# Save thought vector to Kaggle Dataset

In [None]:
import os
import json

# ✅ Define dataset variables
DATASET_FOLDER = "/kaggle/working/thought-vector-sectoral"  # local staging directory for the dataset files
DATASET_NAME = "thought-vector-sectoral"  # used as both the dataset slug and its title below
USERNAME = "anggraininovi"

# ✅ Create dataset folder
os.makedirs(DATASET_FOLDER, exist_ok=True)

# ✅ Move necessary files to the dataset folder
# (IPython shell escape; {DATASET_FOLDER} is filled in from the Python variable)
!mv /kaggle/working/thought_vectors_finbert_sectoral.pkl "{DATASET_FOLDER}/"

# ✅ Read Kaggle API credentials
with open("/kaggle/input/d/anggraininovi/kaggle-json/kaggle (2).json") as f:
    kaggle_creds = json.load(f)

# Export the credentials as environment variables, presumably so the `kaggle`
# CLI calls below can authenticate — confirm against the Kaggle API docs.
os.environ["KAGGLE_USERNAME"] = kaggle_creds["username"]
os.environ["KAGGLE_KEY"] = kaggle_creds["key"]

# ✅ Initialize Kaggle dataset
# Expected to generate the dataset-metadata.json that is read just below.
!kaggle datasets init -p "{DATASET_FOLDER}"

# ✅ Update dataset metadata
with open(f"{DATASET_FOLDER}/dataset-metadata.json") as f:
    dataset_meta = json.load(f)

# Replace the id and title with this dataset's values.
dataset_meta["id"] = f"{USERNAME}/{DATASET_NAME}"
dataset_meta["title"] = DATASET_NAME

with open(f"{DATASET_FOLDER}/dataset-metadata.json", "w") as outfile:
    json.dump(dataset_meta, outfile)

# ✅ Upload dataset to Kaggle
# NOTE(review): `create` presumably fails if the dataset already exists;
# re-uploads may need `kaggle datasets version` instead — verify.
!kaggle datasets create -p "{DATASET_FOLDER}" --dir-mode zip
