In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

%run general_preprocessing.ipynb
# Define the Dataset class
class TextDataset(Dataset):
    """Pair each discussion with its label so a DataLoader can batch them.

    Both containers are stored exactly as passed in and are indexed with the
    same position, so they are expected to be aligned and equally long.
    """

    def __init__(self, discussions, labels):
        # Keep references only; no copying or conversion happens here.
        self.labels = labels
        self.discussions = discussions

    def __len__(self):
        """Return the number of discussions held by the dataset."""
        n_items = len(self.discussions)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(discussion, label)`` pair stored at position ``idx``."""
        sample = self.discussions[idx]
        target = self.labels[idx]
        return sample, target

# Custom Transformer-based model
class TransformerClassifier(nn.Module):
    """Transformer-encoder text classifier with learned positional encodings.

    Token ids are embedded, a learned positional parameter is added, the
    sequence is run through a stack of Transformer encoder layers, the token
    representations are averaged over the sequence axis, and a linear layer
    maps the pooled vector to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding / model dimension (``d_model``).
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        num_classes: Size of the output logit vector.
        max_len: Longest sequence length the positional parameter covers.

    Note:
        No padding mask is passed to the encoder, so padded positions take
        part in attention and in the mean pooling.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, num_classes, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional encoding, initialised to zeros; shape (1, max_len, embed_size)
        # so it broadcasts over the batch axis.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_size))

        # batch_first=True is required: forward() feeds (batch, seq, embed) tensors.
        # With the default (batch_first=False) the encoder would read axis 0 as the
        # sequence axis and attend across the samples of a batch instead of across
        # the tokens of each sequence.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of logits, shape ``(batch, num_classes)``.
        """
        # Slice the positional parameter to the actual sequence length.
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        x = x.mean(dim=1)  # Global average pooling over the sequence axis
        x = self.fc(x)
        return x

# Prepare the data
def prepare_data(data):
    """Encode labels and turn whitespace-tokenised text into padded id tensors.

    Args:
        data: DataFrame with a ``'Category'`` column (class labels) and a
            ``'Discussion'`` column (strings). The ``'Category'`` column is
            overwritten in place with its integer encoding.

    Returns:
        A 4-tuple ``(sequences, labels, vocab, max_len)``:
        ``sequences`` is a ``torch.long`` tensor of token ids, right-padded
        with the ``'<PAD>'`` id (0) to ``max_len``; ``labels`` is a
        ``torch.long`` tensor of encoded categories; ``vocab`` maps each token
        to its id; ``max_len`` is the longest token count found (0 when there
        are no rows).
    """
    label_encoder = LabelEncoder()
    # NOTE: kept as an in-place overwrite of the caller's column for
    # backward compatibility with code that reads data['Category'] afterwards.
    data['Category'] = label_encoder.fit_transform(data['Category'])
    labels = data['Category'].values

    tokenized_texts = [text.split() for text in data['Discussion']]

    # Build the vocabulary from a *sorted* word list. Iterating a set of
    # strings directly yields an order that changes between interpreter runs
    # (string hash randomisation), which would make token ids -- and therefore
    # any trained embedding table -- irreproducible.
    # '<PAD>' is reserved for id 0 and excluded from the word list so the ids
    # stay contiguous (0..len(vocab)-1) even if the text contains that token.
    unique_words = sorted(
        {word for sentence in tokenized_texts for word in sentence} - {'<PAD>'}
    )
    vocab = {'<PAD>': 0}
    vocab.update((word, idx) for idx, word in enumerate(unique_words, start=1))

    # default=0 avoids a ValueError from max() on an empty frame.
    max_len = max((len(sentence) for sentence in tokenized_texts), default=0)

    # Convert text to sequences and right-pad each one to max_len
    pad_id = vocab['<PAD>']
    sequences = [
        [vocab[word] for word in sentence] + [pad_id] * (max_len - len(sentence))
        for sentence in tokenized_texts
    ]

    return torch.tensor(sequences, dtype=torch.long), torch.tensor(labels, dtype=torch.long), vocab, max_len

# Load and preprocess data
# Read the raw training CSV, run the shared preprocessing (process_data is
# brought into scope by the %run of general_preprocessing.ipynb), then encode
# the text and labels into tensors.
data = pd.read_csv("F:/MY_Projects/Deep_Learning_project/data/train.csv")
data = process_data(data)
X, y, vocab, max_len = prepare_data(data)

# Train-test split (80/20, fixed seed so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Model parameters
vocab_size = len(vocab)
embed_size = 128
num_heads = 4
num_layers = 2
# Derive the class count from the encoded labels (prepare_data encodes them
# as 0..k-1) instead of hard-coding it, so the output layer always matches
# the label range that CrossEntropyLoss will see.
num_classes = int(y.max().item()) + 1

# Initialize and train the model
model = TransformerClassifier(vocab_size, embed_size, num_heads, num_layers, num_classes, max_len)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` and report loss and validation accuracy after each epoch.

    Args:
        model: The ``nn.Module`` to optimise; batches are moved to its device.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is written to stdout with ``print``.
    """
    # Take the device from the model itself rather than from a module-level
    # global, so the function works wherever the model has been placed.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # An empty loader reports nan instead of raising ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}, Loss: {mean_loss}")

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                predictions = torch.argmax(outputs, dim=1)
                correct += (predictions == y_batch).sum().item()
                total += y_batch.size(0)

        accuracy = correct / total if total else float("nan")
        print(f"Validation Accuracy: {accuracy:.4f}")

# Run the training loop for 10 epochs; train_model prints the mean training
# loss and the validation accuracy after every epoch.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_set['Discussion'] = lst


KeyboardInterrupt: 