In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

# Dataset Class
class YahooAnswersDataset(Dataset):
    """Dataset that represents each text as the mean of its word vectors.

    Args:
        data: Indexable sequence of texts. Entries that are not ``str``
            (for example NaN cells coming from pandas) are treated as empty.
        labels: Indexable sequence of integer class ids, aligned with ``data``.
        word_embeddings: Object supporting ``token in obj`` and ``obj[token]``
            that maps a token to a 1-D vector (e.g. a dict of numpy arrays).
        embedding_dim: Length of the vectors stored in ``word_embeddings``.
            Only used to size the all-zero fallback vector. Defaults to 100.
    """

    def __init__(self, data, labels, word_embeddings, embedding_dim=100):
        self.data = data
        self.labels = labels
        self.word_embeddings = word_embeddings
        self.embedding_dim = embedding_dim

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` as float32 / long tensors for row ``idx``."""
        text = self.data[idx]
        label = self.labels[idx]
        vectorized_text = self.vectorize_text(text)
        return (
            torch.tensor(vectorized_text, dtype=torch.float32),
            torch.tensor(label, dtype=torch.long),
        )

    def _lookup(self, token):
        """Return the vector for ``token``, or ``None`` if it is unknown.

        The exact token is tried first. If it is missing, its lowercase form
        is tried, because uncased embedding tables (such as GloVe 6B) only
        contain lowercase keys and would otherwise miss every capitalised word.
        """
        if token in self.word_embeddings:
            return self.word_embeddings[token]
        lowered = token.lower()
        if lowered != token and lowered in self.word_embeddings:
            return self.word_embeddings[lowered]
        return None

    def vectorize_text(self, text):
        """Average the vectors of all known whitespace-separated tokens.

        Returns:
            A 1-D numpy array. When ``text`` is not a string, or none of its
            tokens are known, an all-zero float32 vector of length
            ``embedding_dim`` is returned instead.
        """
        # Non-string input (e.g. NaN) contributes no tokens.
        tokens = text.split() if isinstance(text, str) else []
        vectors = [vec for vec in map(self._lookup, tokens) if vec is not None]
        if not vectors:
            return np.zeros(self.embedding_dim, dtype=np.float32)
        return np.mean(vectors, axis=0)


# Model Class
class FastTextModel(nn.Module):
    """Linear classifier over a fixed-size feature vector.

    Args:
        embedding_dim: Size of the last dimension of the input.
        num_classes: Number of output scores produced per input row.
    """

    def __init__(self, embedding_dim, num_classes):
        super().__init__()
        # The attribute name `fc` determines the keys of state_dict()
        # ("fc.weight", "fc.bias"), so it must not be renamed.
        self.fc = nn.Linear(in_features=embedding_dim, out_features=num_classes)

    def forward(self, x):
        """Apply the linear layer and return the unnormalised class scores."""
        scores = self.fc(x)
        return scores

# Load Data
# Column names are supplied manually (passed to read_csv through `names=`).
column_names = ['label', 'question_title', 'question_content', 'best_answer']

# Read the CSV files
train_data = pd.read_csv('/content/train.csv', names=column_names)
test_data = pd.read_csv('/content/test.csv', names=column_names)

# Merge the text columns into a single one.
# Empty cells are read as NaN, and string + NaN evaluates to NaN, so a row with
# a title but no content would lose its text entirely. Fill missing cells with
# an empty string before concatenating.
train_data['text'] = (
    train_data['question_title'].fillna('') + " " + train_data['question_content'].fillna('')
)
test_data['text'] = (
    test_data['question_title'].fillna('') + " " + test_data['question_content'].fillna('')
)

# Shift the labels down by one so that they start from 0
# (assumes the raw labels start at 1 -- confirm against the CSV).
train_data['label'] = train_data['label'] - 1
test_data['label'] = test_data['label'] - 1

# Report the number of distinct classes found in the training labels
num_classes = train_data['label'].nunique()
print(f"Jumlah kelas unik: {num_classes}")


# Load Word Embeddings
# Each line of the file is split on whitespace: the first field becomes the
# dictionary key and the remaining fields are parsed into a float32 array.
embedding_path = '/content/glove.6B.100d.txt'
embeddings_index = {}
with open(embedding_path, mode='r', encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Prepare Dataset
# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['text'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
)
train_dataset = YahooAnswersDataset(
    data=X_train.tolist(),
    labels=y_train.tolist(),
    word_embeddings=embeddings_index,
)
val_dataset = YahooAnswersDataset(
    data=X_val.tolist(),
    labels=y_val.tolist(),
    word_embeddings=embeddings_index,
)
# Only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

# Model, Loss, Optimizer
model = FastTextModel(
    embedding_dim=100,
    num_classes=train_data['label'].nunique(),
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Training Loop
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    """Train ``model`` and report validation metrics after every epoch.

    Args:
        model: Module called as ``model(inputs)``; updated in place.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one dict per epoch holding ``train_loss``, ``val_loss``
        (both averaged over batches) and ``val_accuracy``. A metric is NaN
        when its loader produced no batches / samples, instead of raising
        ``ZeroDivisionError``.
    """
    nan = float('nan')
    history = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_batches = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # Count batches while iterating so loaders without __len__ work too.
            train_batches += 1

        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct = 0
        total = 0
        # Gradients are not needed for validation.
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                val_batches += 1
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        epoch_stats = {
            'train_loss': train_loss / train_batches if train_batches else nan,
            'val_loss': val_loss / val_batches if val_batches else nan,
            'val_accuracy': correct / total if total else nan,
        }
        history.append(epoch_stats)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_stats['train_loss']:.4f}, Val Loss: {epoch_stats['val_loss']:.4f}, Val Accuracy: {epoch_stats['val_accuracy']:.4f}")

    return history

# Run training for three epochs.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=3,
)

# Evaluation on Test Data
def evaluate_model(model, test_data, word_embeddings):
    """Print and return the classification accuracy of ``model`` on ``test_data``.

    Args:
        model: Module called as ``model(inputs)``; switched to eval mode.
        test_data: DataFrame providing the ``question_title``,
            ``question_content`` and ``label`` columns. It is not modified.
        word_embeddings: Token-to-vector lookup handed to
            ``YahooAnswersDataset``.

    Returns:
        Fraction of correctly classified rows, or NaN when ``test_data`` has
        no rows (instead of raising ``ZeroDivisionError``).
    """
    # Build the text locally rather than writing a 'text' column back into the
    # caller's DataFrame. Missing cells are filled with "" first, because
    # string + NaN evaluates to NaN and would discard the whole row's text.
    titles = test_data['question_title'].fillna('')
    contents = test_data['question_content'].fillna('')
    texts = (titles + " " + contents).tolist()

    test_dataset = YahooAnswersDataset(texts, test_data['label'].tolist(), word_embeddings)
    test_loader = DataLoader(test_dataset, batch_size=32)

    model.eval()
    correct = 0
    total = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for inputs, labels in test_loader:
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total if total else float('nan')
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

# Score the trained model on the held-out test file.
evaluate_model(
    model=model,
    test_data=test_data,
    word_embeddings=embeddings_index,
)

# Save the Model
def save_model(model, path):
    """Write ``model``'s state_dict to ``path`` and print a confirmation.

    Only the state_dict is serialised, not the module object itself.
    """
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")

# Persist the trained weights.
save_model(model=model, path='/content/fasttext_model.pth')


Jumlah kelas unik: 10
Epoch 1/3, Train Loss: 1.8415, Val Loss: 1.7947, Val Accuracy: 0.3713
Epoch 2/3, Train Loss: 1.7894, Val Loss: 1.7863, Val Accuracy: 0.3735
Epoch 3/3, Train Loss: 1.7842, Val Loss: 1.7833, Val Accuracy: 0.3754
Test Accuracy: 0.3775
Model saved to /content/fasttext_model.pth
