In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Prefer the first CUDA GPU, but fall back to the CPU instead of aborting so the
# notebook still runs (more slowly) on runtimes where no GPU is attached.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type != "cuda":
    print("GPU is not enabled; falling back to CPU")

In [3]:
# Read the extracted-features CSV into the working DataFrame
csv_path = '/content/feature_extracted_data.csv'
df = pd.read_csv(csv_path)
print("Column Names:", df.columns)

Column Names: Index(['Word', 'Initial Scopes', 'Final Scopes', 'Tag', 'POS', 'LEMMA',
       'NUMBER', 'Contains NUMBER', 'Maj NUMBER', 'text_id'],
      dtype='object')


In [4]:
# Integer-encode the categorical columns. Each column gets its own fitted
# LabelEncoder so the vocabulary sizes (``classes_``) can be read back later.
word_encoder = LabelEncoder()
pos_encoder = LabelEncoder()
lemma_encoder = LabelEncoder()
tag_encoder = LabelEncoder()

for encoder, raw_column, encoded_column in [
    (word_encoder, 'Word', 'Word_idx'),
    (pos_encoder, 'POS', 'POS_idx'),
    (lemma_encoder, 'LEMMA', 'Lemma_idx'),
    (tag_encoder, 'Tag', 'Tag_idx'),
]:
    df[encoded_column] = encoder.fit_transform(df[raw_column])

# Show each raw column next to its encoded counterpart as a sanity check
preview_columns = ['Word', 'Word_idx', 'POS', 'POS_idx', 'LEMMA', 'Lemma_idx', 'Tag', 'Tag_idx']
print("Encoded Columns:", df[preview_columns].head())

# Custom Dataset class
class NLPCustomDataset(Dataset):
    """Row-level dataset over the label-encoded feature columns.

    Each item is one DataFrame row, addressed by position, returned as a dict
    of 0-dim ``torch.long`` tensors under the keys ``'word'``, ``'pos'``,
    ``'lemma'`` and ``'tag'``.

    The four encoded columns are converted to tensors once at construction
    time, so ``__getitem__`` is a cheap tensor lookup instead of several
    ``DataFrame.iloc`` row extractions per sample.

    Args:
        dataframe: DataFrame containing the integer columns ``Word_idx``,
            ``POS_idx``, ``Lemma_idx`` and ``Tag_idx``.

    Raises:
        KeyError: if one of the required columns is missing.
    """

    # Maps each key of the returned sample to the encoded column it is read from.
    _COLUMN_BY_KEY = {
        'word': 'Word_idx',
        'pos': 'POS_idx',
        'lemma': 'Lemma_idx',
        'tag': 'Tag_idx',
    }

    def __init__(self, dataframe):
        self.dataframe = dataframe
        # torch.tensor copies the data, so the dataset does not alias the
        # DataFrame's underlying buffers.
        self._tensors = {
            key: torch.tensor(dataframe[column].to_numpy(), dtype=torch.long)
            for key, column in self._COLUMN_BY_KEY.items()
        }

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx`` as a dict of tensors."""
        # clone() hands out independent tensors, as the per-item torch.tensor()
        # calls used to, rather than views into the cached columns.
        return {key: values[idx].clone() for key, values in self._tensors.items()}

Encoded Columns:        Word  Word_idx    POS  POS_idx     LEMMA  Lemma_idx Tag  Tag_idx
0  paciente     21226   NOUN        7  paciente      10546   O        2
1       que     23115  SCONJ       14       que      11595   O        2
2   ingresa     17572   VERB       16  ingresar       8590   O        2
3        de     12698    ADP        1        de       5845   O        2
4     forma     15630   NOUN        7     forma       7489   O        2


In [5]:
class NLPModel(nn.Module):
    """Tagger built from word/POS/lemma embeddings, a dense layer and a BiLSTM.

    The three index inputs are embedded separately, concatenated along the last
    dimension, projected through a tanh-activated linear layer with dropout, fed
    to a bidirectional LSTM, and mapped to ``output_dim`` unnormalised scores.

    Args:
        num_embeddings: size of the word vocabulary.
        embedding_dim: width of the word embedding.
        num_pos_tags: size of the POS vocabulary.
        pos_embedding_dim: width of the POS embedding.
        num_embeddings_lemma: size of the lemma vocabulary.
        lemma_embedding_dim: width of the lemma embedding.
        hidden_dim: output width of the dense layer (and LSTM input width).
        lstm_out_dim: hidden size of the LSTM, per direction.
        output_dim: number of scores produced per position.
    """

    def __init__(self, num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim, num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim):
        super().__init__()

        # One embedding table per input feature; all are created with the
        # default nn.Embedding initialisation and are trainable.
        self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim)
        self.pos_embeddings = nn.Embedding(num_pos_tags, pos_embedding_dim)
        self.lemma_embedding = nn.Embedding(num_embeddings_lemma, lemma_embedding_dim)

        # Width of the concatenated feature vector, projected to hidden_dim
        # before it reaches the LSTM
        merged_width = sum((embedding_dim, pos_embedding_dim, lemma_embedding_dim))
        self.dense = nn.Linear(merged_width, hidden_dim)

        # Bidirectional recurrent encoder
        self.lstm = nn.LSTM(hidden_dim, lstm_out_dim, batch_first=True, bidirectional=True)

        # Forward and backward LSTM states are concatenated, hence 2 * lstm_out_dim
        self.output_layer = nn.Linear(2 * lstm_out_dim, output_dim)

        # Regularisation applied to the dense activations
        self.dropout = nn.Dropout(0.25)

    def forward(self, x, pos, lemma):
        """Compute per-position scores from word, POS and lemma index tensors.

        ``x``, ``pos`` and ``lemma`` must share the same shape. The result has
        that shape plus a trailing dimension of size ``output_dim``.
        """
        # NOTE(review): nn.LSTM interprets a 2-D input as one unbatched
        # sequence, so 1-D index tensors are processed as a single sequence
        # rather than as independent samples — confirm this is intended.
        features = torch.cat(
            [
                self.word_embeddings(x),
                self.pos_embeddings(pos),
                self.lemma_embedding(lemma),
            ],
            dim=-1,
        )

        # Dense projection -> tanh -> dropout
        projected = self.dropout(torch.tanh(self.dense(features)))

        # Only the per-step outputs are needed; the final states are discarded
        recurrent_out, _final_state = self.lstm(projected)

        return self.output_layer(recurrent_out)

In [6]:
# Seed PyTorch as well as the split: weight initialisation, DataLoader shuffling
# and dropout all draw from torch's RNG, so without this every run differs.
SEED = 42
torch.manual_seed(SEED)

# Splitting data into training and validation sets
# NOTE(review): train_test_split shuffles individual rows; the CSV also has a
# 'text_id' column, so rows of one text may end up in both sets — confirm that
# a row-level split is intended.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED)

# Creating Dataset and DataLoader for training and validation
train_dataset = NLPCustomDataset(train_df)
val_dataset = NLPCustomDataset(val_df)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initializing the model; vocabulary sizes come from the fitted label encoders
model = NLPModel(
    num_embeddings=len(word_encoder.classes_),
    embedding_dim=50,
    num_pos_tags=len(pos_encoder.classes_),
    pos_embedding_dim=10,
    num_embeddings_lemma=len(lemma_encoder.classes_),
    lemma_embedding_dim=10,
    hidden_dim=100,
    lstm_out_dim=50,
    output_dim=len(tag_encoder.classes_)
)

print(model)

# Defining loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of passes over the training data
num_epochs = 10


NLPModel(
  (word_embeddings): Embedding(27100, 50)
  (pos_embeddings): Embedding(18, 10)
  (lemma_embedding): Embedding(13927, 10)
  (dense): Linear(in_features=70, out_features=100, bias=True)
  (lstm): LSTM(100, 50, batch_first=True, bidirectional=True)
  (output_layer): Linear(in_features=100, out_features=5, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


In [7]:

# Run on the device chosen in the setup cell. nn.Module.to() modifies the module
# in place, so the optimizer built earlier keeps referring to the same
# Parameter objects.
model.to(device)

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # The DataLoader yields CPU tensors; move them next to the model
        words = batch['word'].to(device)
        pos = batch['pos'].to(device)
        lemma = batch['lemma'].to(device)
        tags = batch['tag'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(words, pos, lemma)

        # Compute loss: flatten to (N, num_classes) scores against (N,) targets
        loss = criterion(outputs.view(-1, outputs.shape[-1]), tags.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch mean losses
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}')

    # ---- Validation pass (dropout disabled, no gradients) ----
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in val_dataloader:
            words = batch['word'].to(device)
            pos = batch['pos'].to(device)
            lemma = batch['lemma'].to(device)
            tags = batch['tag'].to(device)

            # Forward pass
            outputs = model(words, pos, lemma)

            # Compute loss
            loss = criterion(outputs.view(-1, outputs.shape[-1]), tags.view(-1))
            val_loss += loss.item()

            # Calculate accuracy: the predicted class is the highest-scoring one
            predicted = outputs.argmax(dim=-1)
            correct_predictions += (predicted == tags).sum().item()
            total_predictions += tags.numel()

    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = correct_predictions / total_predictions
    print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')

Epoch 1/10, Loss: 0.3359713606596374
Epoch 2/10, Loss: 0.26924997499720293
Epoch 3/10, Loss: 0.2444174414001742
Epoch 4/10, Loss: 0.22818499560115704
Epoch 5/10, Loss: 0.2168491954915005
Epoch 6/10, Loss: 0.2084509050277294
Epoch 7/10, Loss: 0.20088725881273928
Epoch 8/10, Loss: 0.19471634228048013
Epoch 9/10, Loss: 0.18973186892694824
Epoch 10/10, Loss: 0.18609441715404446
