In [1]:
# Loading libraries

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
import numpy as np

## **TODO:**
- include casing features
- fix the all Os problem
- more plots -> confusion matrix


**mistakes:**
- overfitting -> regularization
- all Os problem


**things to try:**
- seaborn sns plots
- regularization
- adjust model parameters
- adjust the model architecture itself
- batch normalization
- SMOTE
- class weights

# Model

In [2]:
# Define the model
class NLPModel(nn.Module):
    """Tagger built from word/POS/lemma embeddings, a dense layer and a BiLSTM.

    Pipeline: three embedding lookups -> concatenation -> linear + tanh ->
    dropout -> bidirectional LSTM -> linear projection to class scores.

    All three embedding tables are ordinary ``nn.Embedding`` layers created
    here, i.e. randomly initialised and trainable; nothing is loaded from
    pre-trained vectors or frozen.

    Args:
        num_embeddings: Number of rows in the word embedding table.
        embedding_dim: Width of each word embedding vector.
        num_pos_tags: Number of rows in the POS embedding table.
        pos_embedding_dim: Width of each POS embedding vector.
        num_embeddings_lemma: Number of rows in the lemma embedding table.
        lemma_embedding_dim: Width of each lemma embedding vector.
        hidden_dim: Output width of the dense layer (and LSTM input width).
        lstm_out_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of scores produced per position.
    """

    def __init__(self, num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim, num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim):
        super().__init__()

        # One lookup table per input feature
        self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim)
        self.pos_embeddings = nn.Embedding(num_pos_tags, pos_embedding_dim)
        self.lemma_embedding = nn.Embedding(num_embeddings_lemma, lemma_embedding_dim)

        # The dense layer consumes the three embeddings laid side by side
        self.dense = nn.Linear(
            embedding_dim + pos_embedding_dim + lemma_embedding_dim,
            hidden_dim,
        )

        self.lstm = nn.LSTM(hidden_dim, lstm_out_dim, batch_first=True, bidirectional=True)

        # A bidirectional LSTM emits forward and backward states together,
        # hence the 2 * lstm_out_dim input width
        self.output_layer = nn.Linear(2 * lstm_out_dim, output_dim)

        self.dropout = nn.Dropout(0.25)

    def forward(self, x, pos, lemma):
        """Compute unnormalised class scores for every position.

        Args:
            x: Integer tensor of word indices.
            pos: Integer tensor of POS indices, same shape as ``x``.
            lemma: Integer tensor of lemma indices, same shape as ``x``.

        Returns:
            Tensor shaped like the inputs plus a trailing ``output_dim`` axis.

        Note:
            ``nn.LSTM`` interprets a 2-D input as a single unbatched sequence,
            so 1-D index tensors are processed as one sequence rather than as
            a batch of independent tokens.
        """
        embedded = [
            self.word_embeddings(x),
            self.pos_embeddings(pos),
            self.lemma_embedding(lemma),
        ]
        features = torch.cat(embedded, dim=-1)

        # Dense projection, tanh non-linearity, then dropout before the LSTM
        projected = self.dropout(torch.tanh(self.dense(features)))

        lstm_out, _ = self.lstm(projected)
        return self.output_layer(lstm_out)

# Training

## Dataloading

In [3]:
 # Load the CSV file
file_path = '/content/feature_extracted_data.csv'
df = pd.read_csv(file_path)

# One independent LabelEncoder per categorical column
word_encoder, pos_encoder, lemma_encoder, tag_encoder = (LabelEncoder() for _ in range(4))

# Fit each encoder on its column and store the resulting integer ids
df = df.assign(
    Word_idx=word_encoder.fit_transform(df['Word']),
    POS_idx=pos_encoder.fit_transform(df['POS']),
    Lemma_idx=lemma_encoder.fit_transform(df['LEMMA']),
    Tag_idx=tag_encoder.fit_transform(df['Tag']),
)

# Add 'unknown' index for words, lemmas, and tags
def add_unknown_to_encoder(encoder):
    """Ensure the encoder's ``classes_`` contains an ``'unknown'`` entry.

    When ``'unknown'`` is missing it is appended as the last class, so it
    receives the highest index. The encoder is modified in place.

    Args:
        encoder: Object with a ``classes_`` sequence (e.g. a fitted LabelEncoder).

    Returns:
        The same encoder object that was passed in.
    """
    known = list(encoder.classes_)
    if 'unknown' in known:
        # Already present: leave the encoder untouched
        return encoder

    encoder.classes_ = np.array(known + ['unknown'])
    return encoder

# Give the word, lemma and tag encoders an 'unknown' class
# (pos_encoder is intentionally not included here)
word_encoder, lemma_encoder, tag_encoder = map(
    add_unknown_to_encoder, (word_encoder, lemma_encoder, tag_encoder)
)

# Function to map words/lemmas/tags to their indices, using 'unknown' for unseen labels
def map_to_index(encoder, items, unknown_label='unknown'):
    """Map each item to its integer position in ``encoder.classes_``.

    Items that are not among the encoder's classes fall back to the index of
    ``unknown_label``.

    Args:
        encoder: Object exposing a ``classes_`` sequence (e.g. a fitted LabelEncoder).
        items: Iterable of labels to convert.
        unknown_label: Class whose index is used for items not in ``classes_``.

    Returns:
        list: One integer index per item, in input order.

    Raises:
        KeyError: If an item is not a known class and ``unknown_label`` is not
            one of the encoder's classes either.
    """
    label_map = {label: idx for idx, label in enumerate(encoder.classes_)}

    # Resolve the fallback once and without raising. Writing
    # `label_map.get(item, label_map[unknown_label])` evaluates the default for
    # every item, which fails for an encoder lacking `unknown_label` even when
    # every item is known.
    unknown_idx = label_map.get(unknown_label)

    indices = []
    for item in items:
        idx = label_map.get(item, unknown_idx)
        if idx is None:
            raise KeyError(
                f"{item!r} is not a known class and the encoder has no {unknown_label!r} class"
            )
        indices.append(idx)
    return indices

# Recompute the id columns from the encoders that now include 'unknown'
df = df.assign(
    Word_idx=map_to_index(word_encoder, df['Word']),
    Lemma_idx=map_to_index(lemma_encoder, df['LEMMA']),
    Tag_idx=map_to_index(tag_encoder, df['Tag']),
)

In [4]:
# Custom Dataset class
class NLPCustomDataset(Dataset):
    """Map-style dataset serving one DataFrame row per item as index tensors.

    The wrapped DataFrame must provide the columns ``Word_idx``, ``POS_idx``,
    ``Lemma_idx`` and ``Tag_idx``.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as a dict of ``torch.long`` scalars.

        Keys are ``'word'``, ``'pos'``, ``'lemma'`` and ``'tag'``.
        """
        # Positional lookup, done once and reused for all four fields
        row = self.dataframe.iloc[idx]
        columns = {
            'word': 'Word_idx',
            'pos': 'POS_idx',
            'lemma': 'Lemma_idx',
            'tag': 'Tag_idx',
        }
        return {
            key: torch.tensor(row[column], dtype=torch.long)
            for key, column in columns.items()
        }

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap each split in a Dataset
train_dataset, val_dataset = NLPCustomDataset(train_df), NLPCustomDataset(val_df)

# Only the training loader shuffles; validation keeps a fixed order
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)

## Parameter initialization

In [5]:
# Seed PyTorch's global RNG before any weights are created so that weight
# initialisation, dropout masks and DataLoader shuffling are repeatable after
# "Restart Kernel -> Run All" (same seed value as the train/validation split)
torch.manual_seed(42)

# Initializing the model: vocabulary sizes come from the fitted encoders
model = NLPModel(
    num_embeddings=len(word_encoder.classes_),
    embedding_dim=50,
    num_pos_tags=len(pos_encoder.classes_),
    pos_embedding_dim=10,
    num_embeddings_lemma=len(lemma_encoder.classes_),
    lemma_embedding_dim=10,
    hidden_dim=100,
    lstm_out_dim=50,
    output_dim=len(tag_encoder.classes_)
)

# Defining loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training-loop bookkeeping: epoch count and per-epoch loss history (used for plotting)
num_epochs = 10
train_losses = []
val_losses = []

## Training Loop

In [6]:
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        words, pos, lemma, tags = (batch[key] for key in ('word', 'pos', 'lemma', 'tag'))

        optimizer.zero_grad()
        outputs = model(words, pos, lemma)

        # Flatten scores to (N, num_classes) and targets to (N,) for the loss
        loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_dataloader)
    train_losses.append(avg_loss)
    print(f'\nEpoch {epoch+1}/{num_epochs}, Loss: {avg_loss}')

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    # Reset every epoch, so after the loop these hold the last epoch's results
    all_val_preds, all_val_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            words, pos, lemma, tags = (batch[key] for key in ('word', 'pos', 'lemma', 'tag'))

            outputs = model(words, pos, lemma)

            loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))
            val_loss += loss.item()

            # Predicted class = index of the highest score along the last axis
            predicted = torch.max(outputs, dim=-1).indices
            correct_predictions += predicted.eq(tags).sum().item()
            total_predictions += tags.numel()

            all_val_preds.extend(predicted.cpu().numpy().flatten())
            all_val_labels.extend(tags.cpu().numpy().flatten())

    avg_val_loss = val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)
    val_accuracy = correct_predictions / total_predictions

    # Support-weighted averages over classes; zero_division=0 silences warnings for unpredicted classes
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
        all_val_labels, all_val_preds, average='weighted', zero_division=0
    )
    print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print(f'Validation Precision: {val_precision}, Recall: {val_recall}, F1 Score: {val_f1}')

# TODO: more metrics - which classes am I getting wrong?

Epoch 1/10, Loss: 0.3335518239314328
Validation Loss: 0.28256620940131444, Validation Accuracy: 0.9228195867132453
Validation Precision: 0.9056098150795748, Recall: 0.9228195867132453, F1 Score: 0.901674095434056


KeyboardInterrupt: 

In [None]:
# Plotting training and validation losses.
# Each curve is plotted against its own length rather than num_epochs: when
# training is interrupted (or the training cell is re-run), the number of
# recorded losses differs from num_epochs and plotting them against
# range(1, num_epochs + 1) raises a shape-mismatch ValueError.
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss over Epochs')
plt.show()

In [None]:
# Print some example predictions.
# Cap the count at what is actually available so a validation set with fewer
# than 100 predictions does not raise IndexError, and decode each list with a
# single inverse_transform call instead of one call per element.
num_examples = min(100, len(all_val_labels), len(all_val_preds))

print("\nSample Predictions:")
if num_examples:
    true_labels = tag_encoder.inverse_transform(all_val_labels[:num_examples])
    predicted_labels = tag_encoder.inverse_transform(all_val_preds[:num_examples])
    for true_label, predicted_label in zip(true_labels, predicted_labels):
        print(f"True Label: {true_label}, Predicted Label: {predicted_label}")

# Testing

## Dataloading Test Set

In [None]:
# Load the test CSV file
test_file_path = '/content/feature_extracted_dataTEST.csv'
df_test = pd.read_csv(test_file_path)

# Encode the test set with the encoders that were fitted on the TRAINING data.
# Fitting fresh LabelEncoders on the test file would assign different integer
# ids to the same word/lemma/POS/tag, so the ids would no longer match the
# embedding rows and output classes the model was trained with (and could
# exceed the embedding table sizes). It would also overwrite the training
# encoders for every later cell.

# Words, lemmas and tags not seen in training fall back to the 'unknown' class
df_test['Word_idx'] = map_to_index(word_encoder, df_test['Word'])

# pos_encoder has no 'unknown' class, so there is no valid id for an unseen
# POS tag: transform() raises ValueError rather than silently mis-indexing
df_test['POS_idx'] = pos_encoder.transform(df_test['POS'])

df_test['Lemma_idx'] = map_to_index(lemma_encoder, df_test['LEMMA'])
df_test['Tag_idx'] = map_to_index(tag_encoder, df_test['Tag'])


# Custom Dataset class for test data
class NLPCustomTestDataset(Dataset):
    """Map-style dataset over the test DataFrame, one row per item.

    The wrapped DataFrame must provide the columns ``Word_idx``, ``POS_idx``,
    ``Lemma_idx`` and ``Tag_idx``.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as a dict of ``torch.long`` scalars.

        Keys are ``'word'``, ``'pos'``, ``'lemma'`` and ``'tag'``.
        """
        # Single positional lookup shared by all four fields
        record = self.dataframe.iloc[idx]
        field_to_column = (
            ('word', 'Word_idx'),
            ('pos', 'POS_idx'),
            ('lemma', 'Lemma_idx'),
            ('tag', 'Tag_idx'),
        )
        sample = {}
        for field, column in field_to_column:
            sample[field] = torch.tensor(record[column], dtype=torch.long)
        return sample

# Wrap the test frame in a Dataset and iterate it in a fixed order (no shuffling)
test_dataset = NLPCustomTestDataset(dataframe=df_test)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

## Model Evaluation

In [None]:
# Evaluate the model on the test data (eval mode disables dropout)
model.eval()
test_loss = 0
correct_predictions = 0
total_predictions = 0
all_test_preds, all_test_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        words, pos, lemma, tags = (batch[key] for key in ('word', 'pos', 'lemma', 'tag'))

        outputs = model(words, pos, lemma)

        # Flatten scores to (N, num_classes) and targets to (N,) for the loss
        loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))
        test_loss += loss.item()

        # Predicted class = index of the highest score along the last axis
        predicted = torch.max(outputs, dim=-1).indices
        correct_predictions += predicted.eq(tags).sum().item()
        total_predictions += tags.numel()

        all_test_preds.extend(predicted.cpu().numpy().flatten())
        all_test_labels.extend(tags.cpu().numpy().flatten())

# Mean of the per-batch losses, and overall token accuracy
avg_test_loss = test_loss / len(test_dataloader)
test_accuracy = correct_predictions / total_predictions

# Support-weighted averages over classes
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    all_test_labels, all_test_preds, average='weighted', zero_division=0
)

print(f'Test Loss: {avg_test_loss}, Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}')

In [None]:
# Print some example predictions.
# Cap the count at what is actually available so a test set with fewer than
# 1000 predictions does not raise IndexError, and decode each list with a
# single inverse_transform call instead of one call per element.
num_examples = min(1000, len(all_test_labels), len(all_test_preds))

print("\nSample Predictions:")
if num_examples:
    true_labels = tag_encoder.inverse_transform(all_test_labels[:num_examples])
    predicted_labels = tag_encoder.inverse_transform(all_test_preds[:num_examples])
    for true_label, predicted_label in zip(true_labels, predicted_labels):
        print(f"True Label: {true_label}, Predicted Label: {predicted_label}")