In [66]:
# # List of libraries to check
# libraries = [
#     "collections", "nltk", "json", "numpy", "pandas", 
#     "sklearn.model_selection", "torch", "torch.nn", 
#     "torch.nn.functional", "torch.utils.data", "torchmetrics"
# ]

# # Function to check the availability of each library
# def check_libraries(libraries):
#     for lib in libraries:
#         try:
#             __import__(lib)
#             print(f"{lib} is installed.")
#         except ImportError:
#             print(f"{lib} is NOT installed.")

# check_libraries(libraries)


In [67]:
# import nltk
# from nltk.data import find

# try:
#     find('tokenizers/punkt')
#     print("Punkt tokenizer is already installed.")
# except LookupError:
#     print("Punkt tokenizer is not installed.")


In [68]:
# import json

# # Load the list from words.json
# with open('words.json') as f:
#     word_list = json.load(f)

# # Convert list to dictionary
# vocabulary = {word: idx for idx, word in enumerate(word_list)}

# # Save the dictionary back to words.json
# with open('words.json', 'w') as f:
#     json.dump(vocabulary, f, indent=4)

# print(f"Vocabulary type: {type(vocabulary)}")  # Should be <class 'dict'>


In [69]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import json
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_score, recall_score, accuracy_score

## Define the dataset class

In [70]:
class TicketDataset(Dataset):
    def __init__(self, text_file, label_file, word_file):
        with open(text_file, 'r') as f:
            self.texts = json.load(f)
        self.labels = np.load(label_file)
        with open(word_file, 'r') as f:
            self.word_to_idx = json.load(f)
        self.vocab_size = len(self.word_to_idx)
        self.max_length = max(len(text) for text in self.texts)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Convert words to indices
        text_indices = [self.word_to_idx.get(word, 0) for word in text]
        # Pad or truncate text
        text_indices = text_indices[:self.max_length] + [0] * (self.max_length - len(text_indices))
        return torch.tensor(text_indices, dtype=torch.long), torch.tensor(label, dtype=torch.long)

## Load datasets

In [71]:
train_dataset = TicketDataset('text.json', 'labels.npy', 'words.json')
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TicketDataset('text.json', 'labels.npy', 'words.json')  
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Define the CNN model

In [72]:
class CNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=50, num_filters=64, filter_size=3, num_classes=5):
        super(CNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_size)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(num_filters, num_classes)

    def forward(self, x):
        x = self.embedding(x)  # (batch_size, max_length) -> (batch_size, max_length, embedding_dim)
        x = x.permute(0, 2, 1)  # (batch_size, max_length, embedding_dim) -> (batch_size, embedding_dim, max_length)
        x = self.conv1(x)  # (batch_size, embedding_dim, max_length) -> (batch_size, num_filters, new_length)
        x = self.pool(x)  # (batch_size, num_filters, new_length) -> (batch_size, num_filters, 1)
        x = x.squeeze(-1)  # (batch_size, num_filters, 1) -> (batch_size, num_filters)
        x = self.fc(x)  # (batch_size, num_filters) -> (batch_size, num_classes)
        return x

## Instantiate the model, criterion, and optimizer

In [73]:
vocab_size = train_dataset.vocab_size
num_classes = len(set(train_dataset.labels))  # Determine number of classes
model = CNNClassifier(vocab_size=vocab_size, num_classes=num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Train the model

In [74]:
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs} completed')

Epoch 1/3 completed
Epoch 2/3 completed
Epoch 3/3 completed


## Test the model

In [75]:
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

## Calculate metrics

In [76]:
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average=None)
recall = recall_score(true_labels, predictions, average=None)

In [77]:
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

# Save metrics
torch.save({'accuracy': accuracy, 'precision': precision.tolist(), 'recall': recall.tolist()}, 'metrics01.pth')

Accuracy: 0.8412
Precision: [0.82829504 0.74210077 0.8430622  0.88367347 0.92630502]
Recall: [0.685 0.869 0.881 0.866 0.905]
