In [None]:
# Loading libraries

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

## **TODO:**
- include casing features
- fix the all Os problem
- more plots -> conf matrix
- look for new metrics


**mistakes:**
- overfitting -> regularization
- all Os problem


**things to try:**
- seaborn sns plots
- regularization
- adjust model parameters
- adjust model in itself
- batch normalization
- SMOTE (Synthetic Minority Over-sampling Technique)
- class weights

# Model

In [None]:
# Define the model
class NLPModel(nn.Module):
    """BiLSTM tagger over concatenated word, POS and lemma embeddings.

    Pipeline: three embedding lookups -> concatenation on the last axis ->
    tanh(dense) -> dropout -> bidirectional LSTM -> linear projection to
    ``output_dim`` scores.

    All three embedding tables are created here with PyTorch's default
    initialisation and stay trainable; nothing in this class loads or freezes
    pre-trained vectors.
    """

    def __init__(self, num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim, num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim):
        super().__init__()

        # One lookup table per categorical input.
        self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim)
        self.pos_embeddings = nn.Embedding(num_pos_tags, pos_embedding_dim)
        self.lemma_embedding = nn.Embedding(num_embeddings_lemma, lemma_embedding_dim)

        # Projects the concatenated embeddings down/up to the LSTM input size.
        self.dense = nn.Linear(embedding_dim + pos_embedding_dim + lemma_embedding_dim, hidden_dim)

        self.lstm = nn.LSTM(hidden_dim, lstm_out_dim, batch_first=True, bidirectional=True)

        # Bidirectional: forward and backward states are concatenated, hence 2x.
        self.output_layer = nn.Linear(2 * lstm_out_dim, output_dim)

        self.dropout = nn.Dropout(0.25)

    def forward(self, x, pos, lemma):
        """Return unnormalised class scores for the given index tensors."""
        features = torch.cat(
            (self.word_embeddings(x), self.pos_embeddings(pos), self.lemma_embedding(lemma)),
            dim=-1,
        )
        projected = self.dropout(torch.tanh(self.dense(features)))
        recurrent_out, _ = self.lstm(projected)
        return self.output_layer(recurrent_out)

## Dataloading

In [None]:
 # Load the CSV file
file_path = '/content/feature_extracted_data.csv'
df = pd.read_csv(file_path)

# One encoder per categorical column; each is fitted on its source column and
# the resulting integer codes are stored in a new "*_idx" column.
word_encoder, pos_encoder, lemma_encoder, tag_encoder = (LabelEncoder() for _ in range(4))

for _encoder, _source, _target in (
    (word_encoder, 'Word', 'Word_idx'),
    (pos_encoder, 'POS', 'POS_idx'),
    (lemma_encoder, 'LEMMA', 'Lemma_idx'),
    (tag_encoder, 'Tag', 'Tag_idx'),
):
    df[_target] = _encoder.fit_transform(df[_source])

# Add 'unknown' index for words, lemmas, and tags
def add_unknown_to_encoder(encoder, unknown_label='unknown'):
    """Append an out-of-vocabulary class to a fitted encoder, in place.

    Args:
        encoder: Object exposing a ``classes_`` sequence (for example a fitted
            ``LabelEncoder``).
        unknown_label: Label reserved for values not present in ``classes_``.
            Defaults to ``'unknown'``, matching ``map_to_index``.

    Returns:
        The same ``encoder`` object. ``classes_`` is left untouched when
        ``unknown_label`` is already one of the classes.
    """
    classes = list(encoder.classes_)
    if unknown_label not in classes:
        classes.append(unknown_label)
        # dtype=object keeps every existing label exactly as it was; a plain
        # np.array() would silently turn numeric labels into strings once the
        # string label is appended, breaking later dictionary lookups.
        encoder.classes_ = np.array(classes, dtype=object)
    return encoder

# Reserve an 'unknown' class on the word, lemma and tag encoders
# (the POS encoder is intentionally left as fitted).
word_encoder, lemma_encoder, tag_encoder = [
    add_unknown_to_encoder(_enc) for _enc in (word_encoder, lemma_encoder, tag_encoder)
]

# Function to map words/lemmas/tags to their indices, using 'unknown' for unseen labels
def map_to_index(encoder, items, unknown_label='unknown'):
    """Map labels to their positions in ``encoder.classes_``.

    Args:
        encoder: Object exposing a ``classes_`` sequence.
        items: Iterable of labels to convert.
        unknown_label: Class used for labels that are not in ``classes_``.

    Returns:
        List of integer indices, one per item, in input order.

    Raises:
        KeyError: If an item is not in ``classes_`` and ``unknown_label`` is
            not one of the classes either.
    """
    label_map = {label: idx for idx, label in enumerate(encoder.classes_)}
    # Resolved once, and tolerantly: an encoder without an unknown class is
    # fine as long as every item is actually known.
    unknown_idx = label_map.get(unknown_label)

    indices = []
    for item in items:
        idx = label_map.get(item, unknown_idx)
        if idx is None:
            raise KeyError(
                f"label {item!r} is not in the encoder and there is no {unknown_label!r} class to fall back to"
            )
        indices.append(idx)
    return indices

# Recompute the index columns through map_to_index so they use the encoders'
# current class lists (including the appended 'unknown' class).
for _target, _encoder, _source in (
    ('Word_idx', word_encoder, 'Word'),
    ('Lemma_idx', lemma_encoder, 'LEMMA'),
    ('Tag_idx', tag_encoder, 'Tag'),
):
    df[_target] = map_to_index(_encoder, df[_source])

In [None]:
# Custom Dataset class
class NLPCustomDataset(Dataset):
    """Row-wise dataset over a dataframe holding pre-computed index columns.

    Each item is a dict with the keys ``'word'``, ``'pos'``, ``'lemma'`` and
    ``'tag'``, read from the ``Word_idx``, ``POS_idx``, ``Lemma_idx`` and
    ``Tag_idx`` columns of one row and returned as ``torch.long`` tensors.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Positional lookup: idx counts rows, independent of the frame's index labels.
        row = self.dataframe.iloc[idx]
        key_to_column = (
            ('word', 'Word_idx'),
            ('pos', 'POS_idx'),
            ('lemma', 'Lemma_idx'),
            ('tag', 'Tag_idx'),
        )
        return {key: torch.tensor(row[column], dtype=torch.long) for key, column in key_to_column}

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): each row appears to be a single token, so this separates tokens
# rather than whole sentences — confirm that is intended for the LSTM.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap both splits; only the training loader is shuffled.
train_dataset, val_dataset = NLPCustomDataset(train_df), NLPCustomDataset(val_df)

train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)

## Parameter initialization

In [None]:
# Model hyper-parameters; vocabulary and output sizes follow the fitted encoders.
_model_kwargs = dict(
    num_embeddings=len(word_encoder.classes_),
    embedding_dim=50,
    num_pos_tags=len(pos_encoder.classes_),
    pos_embedding_dim=10,
    num_embeddings_lemma=len(lemma_encoder.classes_),
    lemma_embedding_dim=10,
    hidden_dim=100,
    lstm_out_dim=50,
    output_dim=len(tag_encoder.classes_),
)
model = NLPModel(**_model_kwargs)

# Unweighted cross-entropy, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Training schedule and per-epoch loss history.
num_epochs = 10
train_losses, val_losses = [], []

## Training Loop

In [None]:
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        words, pos, lemma, tags = (batch[key] for key in ('word', 'pos', 'lemma', 'tag'))

        optimizer.zero_grad()
        outputs = model(words, pos, lemma)

        # Flatten scores to (N, num_classes) and targets to (N,) for the loss.
        loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    train_losses.append(avg_loss)
    print(f'\nEpoch {epoch+1}/{num_epochs}, Loss: {avg_loss}')

    # ---- validation pass (no gradient tracking, dropout disabled) ----
    model.eval()
    val_loss = 0
    correct_predictions = total_predictions = 0
    all_val_preds, all_val_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            words, pos, lemma, tags = (batch[key] for key in ('word', 'pos', 'lemma', 'tag'))

            outputs = model(words, pos, lemma)

            loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))
            val_loss += loss.item()

            # Predicted class = position of the highest score on the last axis.
            predicted = torch.max(outputs.data, -1).indices
            correct_predictions += (predicted == tags).sum().item()
            total_predictions += tags.numel()

            # Collected across batches; the lists are rebuilt every epoch.
            all_val_preds.extend(predicted.cpu().numpy().ravel())
            all_val_labels.extend(tags.cpu().numpy().ravel())

    avg_val_loss = val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)
    val_accuracy = correct_predictions / total_predictions

    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
        all_val_labels, all_val_preds, average='weighted', zero_division=0
    )
    print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')
    print(f'Validation Precision: {val_precision}, Recall: {val_recall}, F1 Score: {val_f1}')


Epoch 1/10, Loss: 0.33961817703871205
Validation Loss: 0.2830888888283605, Validation Accuracy: 0.9219610397300018
Validation Precision: 0.9058248618049888, Recall: 0.9219610397300018, F1 Score: 0.8987847086965588

Epoch 2/10, Loss: 0.26978799859434544
Validation Loss: 0.26071298103709, Validation Accuracy: 0.9267866658771982
Validation Precision: 0.9101412085555345, Recall: 0.9267866658771982, F1 Score: 0.9087818498924212

Epoch 3/10, Loss: 0.243847495203398
Validation Loss: 0.2537777282784439, Validation Accuracy: 0.9293327017585411
Validation Precision: 0.913810326619613, Recall: 0.9293327017585411, F1 Score: 0.9129347938201985

Epoch 4/10, Loss: 0.2288687646026447
Validation Loss: 0.25366043979231495, Validation Accuracy: 0.9277636331339926
Validation Precision: 0.9149763806017059, Recall: 0.9277636331339926, F1 Score: 0.9149609615679594

Epoch 5/10, Loss: 0.2169833548314977
Validation Loss: 0.2581630850944555, Validation Accuracy: 0.9244478654745693
Validation Precision: 0.911949

In [None]:
# Confusion matrix over the validation predictions collected in the training cell.
conf_matrix = confusion_matrix(all_val_labels, all_val_preds)

# Annotated heatmap with integer counts per cell.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm', ax=ax)
ax.set_xlabel('Val Predicted Labels')
ax.set_ylabel('Val True Labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Plotting training and validation losses
# The x-axis of each curve is derived from the number of losses actually
# recorded instead of num_epochs: if training was interrupted (or num_epochs
# was changed afterwards) the lengths differ and plt.plot would raise.
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss over Epochs')
plt.show()

In [None]:
# Print some example predictions
print("\nSample Predictions:")
# Show at most 100 rows; a smaller validation set no longer raises IndexError.
_n_shown = min(100, len(all_val_labels))
# Decode each list in one call instead of one inverse_transform per label.
_true_names = tag_encoder.inverse_transform(all_val_labels[:_n_shown])
_pred_names = tag_encoder.inverse_transform(all_val_preds[:_n_shown])
for true_label, predicted_label in zip(_true_names, _pred_names):
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")

## Testing

## Dataloading Test Set

In [None]:
# Load the test CSV file
test_file_path = '/content/feature_extracted_dataTEST.csv'
df_test = pd.read_csv(test_file_path)

# Encode the test columns with the encoders fitted on the TRAINING data.
# Fitting fresh encoders on the test file would assign different integers to
# the same word/lemma/tag than the ones the model was trained with, and could
# produce indices beyond the size of the model's embedding tables.
# Words, lemmas and tags that never occurred in training fall back to the
# 'unknown' class that was appended to these three encoders after fitting.
df_test['Word_idx'] = map_to_index(word_encoder, df_test['Word'])
# The POS encoder has no 'unknown' class, so transform() raises ValueError if
# the test file contains a POS value that was absent from the training file.
df_test['POS_idx'] = pos_encoder.transform(df_test['POS'])
df_test['Lemma_idx'] = map_to_index(lemma_encoder, df_test['LEMMA'])
df_test['Tag_idx'] = map_to_index(tag_encoder, df_test['Tag'])


# Custom Dataset class for test data
class NLPCustomTestDataset(Dataset):
    """Row-wise dataset over the encoded test dataframe.

    Items are dicts with ``'word'``, ``'pos'``, ``'lemma'`` and ``'tag'``
    entries, each a ``torch.long`` tensor taken from the corresponding
    ``*_idx`` column of one row.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]

        def as_long(column):
            """Convert one cell of the selected row to a long tensor."""
            return torch.tensor(record[column], dtype=torch.long)

        return {
            'word': as_long('Word_idx'),
            'pos': as_long('POS_idx'),
            'lemma': as_long('Lemma_idx'),
            'tag': as_long('Tag_idx'),
        }

# Wrap the encoded test frame; evaluation keeps the original row order.
test_dataset = NLPCustomTestDataset(dataframe=df_test)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

## Model Evaluation

In [None]:
# Score the trained model on the test loader: loss, accuracy and weighted P/R/F1.
model.eval()
test_loss = 0
correct_predictions = total_predictions = 0
all_test_preds, all_test_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        words, pos, lemma, tags = (batch[key] for key in ('word', 'pos', 'lemma', 'tag'))

        outputs = model(words, pos, lemma)

        # Flatten scores to (N, num_classes) and targets to (N,) for the loss.
        loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))
        test_loss += loss.item()

        # Predicted class = position of the highest score on the last axis.
        predicted = torch.max(outputs.data, -1).indices
        correct_predictions += (predicted == tags).sum().item()
        total_predictions += tags.numel()

        all_test_preds.extend(predicted.cpu().numpy().ravel())
        all_test_labels.extend(tags.cpu().numpy().ravel())

# Loss is averaged over batches, accuracy over individual predictions.
avg_test_loss = test_loss / len(test_dataloader)
test_accuracy = correct_predictions / total_predictions

test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    all_test_labels, all_test_preds, average='weighted', zero_division=0
)

print(f'Test Loss: {avg_test_loss}, Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}')

In [None]:
# Confusion matrix over the test predictions collected by the evaluation cell.
conf_matrix = confusion_matrix(all_test_labels, all_test_preds)

# Annotated heatmap with integer counts per cell.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu', ax=ax)
ax.set_xlabel('Test Predicted Labels')
ax.set_ylabel('Test True Labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Print some example predictions
print("\nSample Predictions:")
# Show at most 100 rows; a smaller test set no longer raises IndexError.
_n_shown = min(100, len(all_test_labels))
# Decode each list in one call instead of one inverse_transform per label.
_true_names = tag_encoder.inverse_transform(all_test_labels[:_n_shown])
_pred_names = tag_encoder.inverse_transform(all_test_preds[:_n_shown])
for true_label, predicted_label in zip(_true_names, _pred_names):
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")

# Revised Model

Including
- Regularization
- Batch Normalization
- Class weights



## Model - Revised

In [None]:
class NLPModel(nn.Module):
    """BiLSTM tagger with batch normalisation and heavier dropout.

    Pipeline: word/POS/lemma embedding lookups -> concatenation -> batch norm
    -> tanh(dense) -> dropout(0.5) -> bidirectional LSTM -> batch norm ->
    linear projection to ``output_dim`` scores.

    The embedding tables are created with PyTorch's default initialisation and
    are trainable; no pre-trained vectors are loaded or frozen here.

    Index tensors may be 1-D (one entry per token, giving 2-D features) or
    2-D ``(batch, seq)`` (giving 3-D features); batch normalisation is applied
    over the feature axis in both cases.
    """

    def __init__(self, num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim, num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim):
        super(NLPModel, self).__init__()

        # Embedding layers
        self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim)
        self.pos_embeddings = nn.Embedding(num_pos_tags, pos_embedding_dim)
        self.lemma_embedding = nn.Embedding(num_embeddings_lemma, lemma_embedding_dim)

        # Concatenation dimension
        concat_dim = embedding_dim + pos_embedding_dim + lemma_embedding_dim

        # Dense layer prior to LSTM, preceded by batch norm over the concatenated features
        self.dense = nn.Linear(concat_dim, hidden_dim)
        self.batch_norm1 = nn.BatchNorm1d(concat_dim)

        # LSTM layer; bidirectional, so its output has 2 * lstm_out_dim features
        self.lstm = nn.LSTM(hidden_dim, lstm_out_dim, batch_first=True, bidirectional=True)
        self.batch_norm2 = nn.BatchNorm1d(2 * lstm_out_dim)

        # Output layer
        self.output_layer = nn.Linear(2 * lstm_out_dim, output_dim)

        # Dropout layer
        self.dropout = nn.Dropout(0.5)  # Increased dropout for regularization

    @staticmethod
    def _normalize_features(batch_norm, x):
        """Apply ``batch_norm`` over the last (feature) axis of ``x``.

        BatchNorm1d expects features on axis 1: ``(N, C)`` or ``(N, C, L)``.
        Tensors in this model carry features on the LAST axis, so a 3-D
        ``(batch, seq, features)`` input is transposed around the call.
        """
        if x.dim() == 3:
            return batch_norm(x.transpose(1, 2)).transpose(1, 2).contiguous()
        return batch_norm(x)

    def forward(self, x, pos, lemma):
        """Return unnormalised class scores for the given index tensors."""
        # Embed each input type
        x = self.word_embeddings(x)
        pos = self.pos_embeddings(pos)
        lemma = self.lemma_embedding(lemma)

        # Concatenate embeddings along the feature axis
        x = torch.cat((x, pos, lemma), dim=-1)

        # Apply batch normalization and dense layer
        x = self._normalize_features(self.batch_norm1, x)
        x = torch.tanh(self.dense(x))

        # Apply dropout
        x = self.dropout(x)

        # LSTM layer
        lstm_out, _ = self.lstm(x)

        # Apply batch normalization after LSTM
        lstm_out = self._normalize_features(self.batch_norm2, lstm_out)

        # Output layer
        output = self.output_layer(lstm_out)

        return output

## Revised Data Loading

In [None]:
# Load data and preprocess
train_data = pd.read_csv('/content/feature_extracted_data.csv')
test_data = pd.read_csv('/content/feature_extracted_dataTEST.csv')

# Encode labels (transform() raises ValueError if the test file contains a tag
# that does not occur in the training file).
le = LabelEncoder()
train_data['Tag'] = le.fit_transform(train_data['Tag'])
test_data['Tag'] = le.transform(test_data['Tag'])

# Encode the categorical input columns in place. The dataset class below
# converts 'Word', 'POS' and 'LEMMA' straight into torch.long tensors, which
# fails on raw strings. Each vocabulary is fitted on the training file only;
# one extra trailing index is reserved for values first seen in the test file.
for _column in ('Word', 'POS', 'LEMMA'):
    _feature_le = LabelEncoder().fit(train_data[_column])
    _index_of = {label: idx for idx, label in enumerate(_feature_le.classes_)}
    _unknown_idx = len(_index_of)
    train_data[_column] = train_data[_column].map(_index_of).astype('int64')
    test_data[_column] = test_data[_column].map(_index_of).fillna(_unknown_idx).astype('int64')

# Split training data into train and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Create datasets and dataloaders
class CustomDataset(Dataset):
    """Row-wise dataset reading 'Word', 'POS', 'LEMMA' and 'Tag' from a dataframe.

    Each item is a dict of ``torch.long`` tensors under the keys ``'word'``,
    ``'pos'``, ``'lemma'`` and ``'label'``; the four columns must therefore
    already hold integer values.
    """

    # (output key, source column) pairs, in the order they appear in each item.
    _FIELDS = (('word', 'Word'), ('pos', 'POS'), ('lemma', 'LEMMA'), ('label', 'Tag'))

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        sample = {}
        for key, column in self._FIELDS:
            sample[key] = torch.tensor(row[column], dtype=torch.long)
        return sample

# One dataset per split.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(_frame) for _frame in (train_data, val_data, test_data)
)

# Only the training loader is shuffled; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader, test_loader = (
    DataLoader(_dataset, batch_size=64, shuffle=False) for _dataset in (val_dataset, test_dataset)
)

In [None]:
# Bar chart of how many training rows carry each encoded tag value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Tag', data=train_data, ax=ax)
ax.set_title('Class Distribution in Training Data')
plt.show()

In [None]:
# Initialize model, criterion, and optimizer
def _embedding_rows(column):
    """Return the number of embedding rows needed for ``column``.

    When the column holds integer indices in every split, the table has to
    reach the largest index that will ever be looked up (max + 1); counting
    distinct values of the training split alone can be too small. For any
    other dtype, fall back to the number of distinct training values.
    """
    frames = (train_data, val_data, test_data)
    if all(pd.api.types.is_integer_dtype(frame[column]) for frame in frames):
        return int(max(frame[column].max() for frame in frames)) + 1
    return train_data[column].nunique()


num_embeddings = _embedding_rows('Word')
embedding_dim = 100
num_pos_tags = _embedding_rows('POS')
pos_embedding_dim = 25
num_embeddings_lemma = _embedding_rows('LEMMA')
lemma_embedding_dim = 25
hidden_dim = 128
lstm_out_dim = 64
output_dim = len(le.classes_)


model = NLPModel(num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim, num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim)

# Class weights derived from the training distribution: inverse frequency,
# n_samples / (n_classes * count), so rare tags weigh more. CrossEntropyLoss
# requires exactly one weight per class, which a fixed two-element tensor only
# satisfies for a two-class problem.
_tag_counts = np.bincount(train_data['Tag'].to_numpy(), minlength=output_dim)
# A class that is absent from the training split gets count 1 to avoid dividing by zero.
_safe_counts = np.maximum(_tag_counts, 1)
class_weights = torch.tensor(_tag_counts.sum() / (output_dim * _safe_counts), dtype=torch.float)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Revised Training Loop

In [None]:
# Training schedule and per-epoch loss history.
num_epochs = 10
train_losses, val_losses = [], []


def _batch_loss(batch):
    """Run one batch through the global model and return its loss tensor."""
    outputs = model(batch['word'], batch['pos'], batch['lemma'])
    return criterion(outputs.view(-1, output_dim), batch['label'].view(-1))


for epoch in range(num_epochs):
    # ---- optimisation pass over the training loader ----
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    train_loss = running_loss / len(train_loader)
    train_losses.append(train_loss)

    # ---- validation loss (no gradients, eval-mode dropout / batch norm) ----
    model.eval()
    with torch.no_grad():
        val_running_loss = sum((_batch_loss(batch).item() for batch in val_loader), 0.0)

    val_loss = val_running_loss / len(val_loader)
    val_losses.append(val_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss}, Validation Loss: {val_loss}')

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score

# Evaluation function
def evaluate(loader):
    """Run the global ``model`` over ``loader`` and compute classification metrics.

    Args:
        loader: Iterable of batches; each batch is a dict with ``'word'``,
            ``'pos'``, ``'lemma'`` and ``'label'`` tensors.

    Returns:
        Tuple ``(precision, recall, f1, accuracy, conf_matrix, all_labels,
        all_preds)``. Precision, recall and F1 are weighted averages;
        ``all_labels`` and ``all_preds`` are flat lists with one entry per
        evaluated element, in loader order.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in loader:
            outputs = model(batch['word'], batch['pos'], batch['lemma'])
            # Class scores sit on the last axis for both (batch, classes) and
            # (batch, seq, classes) outputs; a fixed dim=2 fails on the former.
            preds = torch.argmax(outputs, dim=-1)
            all_preds.extend(preds.view(-1).cpu().numpy())
            all_labels.extend(batch['label'].view(-1).cpu().numpy())

    # zero_division=0 matches the other metric calls in this notebook: classes
    # that are never predicted score 0 without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted', zero_division=0)
    accuracy = accuracy_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)

    # Callers unpack seven values from this function.
    return precision, recall, f1, accuracy, conf_matrix, all_labels, all_preds

In [None]:
# Metrics, confusion matrix and raw label/prediction lists for the validation split.
(val_precision, val_recall, val_f1, val_accuracy,
 val_conf_matrix, val_labels, val_preds) = evaluate(val_loader)
print(f'Validation Precision: {val_precision}, Recall: {val_recall}, F1-Score: {val_f1}, Accuracy: {val_accuracy}')

Plotting

In [None]:
# Annotated heatmap of the validation confusion matrix (integer counts per cell).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(val_conf_matrix, annot=True, fmt='d', cmap='coolwarm', ax=ax)
ax.set_xlabel('Validation Predicted Labels')
ax.set_ylabel('Validation True Labels')
ax.set_title('Validation Confusion Matrix')
plt.show()

In [None]:
# Plotting training and validation losses
# The x-axis of each curve is derived from the number of losses actually
# recorded instead of num_epochs: if training was interrupted (or num_epochs
# was changed afterwards) the lengths differ and plt.plot would raise.
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss over Epochs')
plt.show()

In [None]:
# Print some example predictions for validation set
print("\nSample Predictions for Validation Set:")
# Show at most 100 rows; a smaller validation set no longer raises IndexError.
_n_shown = min(100, len(val_labels))
# Decode each list in one call instead of one inverse_transform per label.
_true_names = le.inverse_transform(val_labels[:_n_shown])
_pred_names = le.inverse_transform(val_preds[:_n_shown])
for true_label, predicted_label in zip(_true_names, _pred_names):
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")

### Revised Testing

In [None]:
# Metrics, confusion matrix and raw label/prediction lists for the test split.
(test_precision, test_recall, test_f1, test_accuracy,
 test_conf_matrix, test_labels, test_preds) = evaluate(test_loader)
print(f'Test Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}, Accuracy: {test_accuracy}')

Plotting

In [None]:
# Annotated heatmap of the test confusion matrix (integer counts per cell).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(test_conf_matrix, annot=True, fmt='d', cmap='coolwarm', ax=ax)
ax.set_xlabel('Test Predicted Labels')
ax.set_ylabel('Test True Labels')
ax.set_title('Test Confusion Matrix')
plt.show()

In [None]:

# Print some example predictions for test set
print("\nSample Predictions for Test Set:")
# Show at most 100 rows; a smaller test set no longer raises IndexError.
_n_shown = min(100, len(test_labels))
# Decode each list in one call instead of one inverse_transform per label.
_true_names = le.inverse_transform(test_labels[:_n_shown])
_pred_names = le.inverse_transform(test_preds[:_n_shown])
for true_label, predicted_label in zip(_true_names, _pred_names):
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")

# Training with Case Tagging Features

In [None]:
# Locations of the feature-extracted training and test tables.
train_data_path = '/content/feature_extracted_data.csv'
test_data_path = '/content/feature_extracted_dataTEST.csv'

# Read both files fresh, replacing any frames left over from earlier cells.
train_data, test_data = (pd.read_csv(_path) for _path in (train_data_path, test_data_path))

# Define the dataset class
class NLPDataset(Dataset):
    """Row-wise dataset with categorical indices and three numeric features.

    Every column is encoded once in ``__init__``; ``__getitem__`` only indexes
    the pre-computed tensors. Calling ``LabelEncoder.transform`` for each field
    of each item instead makes iteration orders of magnitude slower.

    Args:
        data: Dataframe with the columns 'Word', 'POS', 'LEMMA', 'Tag',
            'NUMBER', 'Contains NUMBER' and 'Maj NUMBER'.
        encoders_from: Optional ``NLPDataset`` whose fitted encoders are reused.
            Pass the training dataset when building validation/test datasets so
            all of them share one index space. When omitted, new encoders are
            fitted on ``data``.

    Attributes:
        word_encoder, pos_encoder, lemma_encoder, tag_encoder: Fitted
            ``LabelEncoder`` objects whose ``classes_`` end with an extra
            ``'unknown'`` class (unless that label already occurred in the
            data). Values missing from an encoder map to that class.
    """

    UNKNOWN_LABEL = 'unknown'

    def __init__(self, data, encoders_from=None):
        self.data = data

        if encoders_from is None:
            self.word_encoder = self._fit_with_unknown(data['Word'])
            self.pos_encoder = self._fit_with_unknown(data['POS'])
            self.lemma_encoder = self._fit_with_unknown(data['LEMMA'])
            self.tag_encoder = self._fit_with_unknown(data['Tag'])
        else:
            self.word_encoder = encoders_from.word_encoder
            self.pos_encoder = encoders_from.pos_encoder
            self.lemma_encoder = encoders_from.lemma_encoder
            self.tag_encoder = encoders_from.tag_encoder

        # Categorical columns -> long index tensors, encoded once up front.
        self._words = self._encode(self.word_encoder, data['Word'])
        self._pos = self._encode(self.pos_encoder, data['POS'])
        self._lemmas = self._encode(self.lemma_encoder, data['LEMMA'])
        self._tags = self._encode(self.tag_encoder, data['Tag'])

        # Numeric feature columns -> float tensors.
        self._number = self._as_float(data['NUMBER'])
        self._contains_number = self._as_float(data['Contains NUMBER'])
        self._maj_number = self._as_float(data['Maj NUMBER'])

    @classmethod
    def _fit_with_unknown(cls, values):
        """Fit a LabelEncoder on ``values`` and append the unknown class."""
        encoder = LabelEncoder().fit(values)
        classes = list(encoder.classes_)
        if cls.UNKNOWN_LABEL not in classes:
            classes.append(cls.UNKNOWN_LABEL)
            # dtype=object keeps the fitted labels unchanged.
            encoder.classes_ = np.array(classes, dtype=object)
        return encoder

    @classmethod
    def _encode(cls, encoder, values):
        """Map ``values`` to class indices, using the unknown index for misses."""
        index_of = {label: idx for idx, label in enumerate(encoder.classes_)}
        unknown_idx = index_of[cls.UNKNOWN_LABEL]
        return torch.tensor([index_of.get(value, unknown_idx) for value in values], dtype=torch.long)

    @staticmethod
    def _as_float(column):
        """Convert a dataframe column to a float32 tensor."""
        return torch.tensor(column.to_numpy(dtype='float32'), dtype=torch.float)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'word': self._words[idx],
            'pos': self._pos[idx],
            'lemma': self._lemmas[idx],
            'number': self._number[idx],
            'contains_number': self._contains_number[idx],
            'maj_number': self._maj_number[idx],
            'tag': self._tags[idx]
        }

# Create dataset and dataloader
train_dataset = NLPDataset(train_data)
# The test set reuses the training encoders: fitting separate encoders on the
# test file would give the same word/tag a different index than the one the
# model is trained with, and could index past the embedding tables.
test_dataset = NLPDataset(test_data, encoders_from=train_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Define the model
class NLPModel(nn.Module):
    """BiLSTM tagger over word/POS/lemma embeddings plus three numeric features.

    The numeric inputs (``number``, ``contains_number``, ``maj_number``) are
    scaled by ``feature_weight`` and appended to the concatenated embeddings
    before a linear layer (no activation), a bidirectional LSTM and the final
    linear projection to ``output_dim`` scores.
    """

    def __init__(self, num_embeddings, embedding_dim, num_pos_tags, pos_embedding_dim, num_embeddings_lemma, lemma_embedding_dim, hidden_dim, lstm_out_dim, output_dim):
        super().__init__()

        # One lookup table per categorical input.
        self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim)
        self.pos_embeddings = nn.Embedding(num_pos_tags, pos_embedding_dim)
        self.lemma_embeddings = nn.Embedding(num_embeddings_lemma, lemma_embedding_dim)

        # Constant scale applied to the numeric features to reduce their impact.
        self.feature_weight = 0.1

        # Embedding widths plus one slot for each of the three numeric features.
        embedding_width = embedding_dim + pos_embedding_dim + lemma_embedding_dim
        self.dense = nn.Linear(embedding_width + 3, hidden_dim)

        self.lstm = nn.LSTM(hidden_dim, lstm_out_dim, batch_first=True, bidirectional=True)

        # Bidirectional: forward and backward states are concatenated, hence 2x.
        self.output_layer = nn.Linear(2 * lstm_out_dim, output_dim)

    def forward(self, word_input, pos_input, lemma_input, number, contains_number, maj_number):
        """Return unnormalised class scores for the given inputs."""
        embedded = [
            self.word_embeddings(word_input),
            self.pos_embeddings(pos_input),
            self.lemma_embeddings(lemma_input),
        ]
        # Scale each numeric feature, then add a trailing axis so it can be
        # concatenated with the embeddings.
        scaled = [
            (feature * self.feature_weight).unsqueeze(-1)
            for feature in (number, contains_number, maj_number)
        ]
        combined = torch.cat(embedded + scaled, dim=-1)

        recurrent_out, _ = self.lstm(self.dense(combined))
        return self.output_layer(recurrent_out)

In [None]:
# Model hyper-parameters; vocabulary and output sizes follow the training dataset's encoders.
_model_kwargs = dict(
    num_embeddings=len(train_dataset.word_encoder.classes_),
    embedding_dim=100,
    num_pos_tags=len(train_dataset.pos_encoder.classes_),
    pos_embedding_dim=25,
    num_embeddings_lemma=len(train_dataset.lemma_encoder.classes_),
    lemma_embedding_dim=25,
    hidden_dim=128,
    lstm_out_dim=64,
    output_dim=len(train_dataset.tag_encoder.classes_),
)
model = NLPModel(**_model_kwargs)

# Unweighted cross-entropy, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Training loop with progress bar
from tqdm import tqdm  # Progress bar library

num_epochs = 10
# NOTE(review): batch_size is not passed to any DataLoader in this cell, and
# train_dataloader was built with batch_size=32 — confirm which value is intended.
batch_size = 16  # Reduced batch size for more frequent updates

_INPUT_KEYS = ('word', 'pos', 'lemma', 'number', 'contains_number', 'maj_number')

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    num_batches = len(train_dataloader)

    # One progress bar per epoch, advanced manually after every batch.
    with tqdm(total=num_batches, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for step, batch in enumerate(train_dataloader, start=1):
            words, pos, lemma, number, contains_number, maj_number = (batch[key] for key in _INPUT_KEYS)
            tags = batch['tag']

            optimizer.zero_grad()
            outputs = model(words, pos, lemma, number, contains_number, maj_number)
            # Flatten scores to (N, num_classes) and targets to (N,) for the loss.
            loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            # Running mean of the batch losses seen so far in this epoch.
            pbar.set_postfix({'Loss': epoch_loss / step})
            pbar.update(1)

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / num_batches}')

# Score the trained model on the test loader: loss, accuracy and weighted P/R/F1.
model.eval()
test_loss = 0
correct_predictions = total_predictions = 0
all_test_preds, all_test_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        words, pos, lemma = batch['word'], batch['pos'], batch['lemma']
        number, contains_number, maj_number = (
            batch['number'], batch['contains_number'], batch['maj_number']
        )
        tags = batch['tag']

        outputs = model(words, pos, lemma, number, contains_number, maj_number)
        # Flatten scores to (N, num_classes) and targets to (N,) for the loss.
        loss = criterion(outputs.view(-1, outputs.size(-1)), tags.view(-1))
        test_loss += loss.item()

        # Predicted class = position of the highest score on the last axis.
        predicted = torch.max(outputs.data, -1).indices
        correct_predictions += (predicted == tags).sum().item()
        total_predictions += tags.numel()

        all_test_preds.extend(predicted.cpu().numpy().ravel())
        all_test_labels.extend(tags.cpu().numpy().ravel())

# Loss is averaged over batches, accuracy over individual predictions.
avg_test_loss = test_loss / len(test_dataloader)
test_accuracy = correct_predictions / total_predictions

test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    all_test_labels, all_test_preds, average='weighted', zero_division=0
)

print(f'Test Loss: {avg_test_loss}, Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}')

Epoch 1/10:  11%|█▏        | 599/5278 [12:55<1:35:48,  1.23s/batch, Loss=0.394]