In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [2]:
# Sample dataset with descriptions and issues (expand this with more data as needed)
data = {
    'issue': [
        'printer error', 'printer error', 'printer error',
        'network failure', 'network failure', 'network failure',
        'authentication issue', 'authentication issue', 'authentication issue',
        'software crash', 'software crash', 'software crash',
        'hardware failure', 'hardware failure', 'hardware failure'
    ],
    'description': [
        'Printer is not responding and unable to print',
        'The printer is printing incorrectly and skipping pages',
        'Printer shows error message and stops working',
        
        'Unable to connect to the network, no internet access',
        'Internet connection is very slow and dropping frequently',
        'Cannot connect to Wi-Fi, keeps disconnecting',

        'Cannot log in to the account, shows invalid credentials',
        'Authentication fails and password is incorrect',
        'Login issues with system, authentication error',

        'The software crashes and exits unexpectedly',
        'Application becomes unresponsive and shuts down',
        'Software closes suddenly with error message',

        'The computer hardware is malfunctioning and not responding',
        'The device hardware stops working, unresponsive keys',
        'Hardware failure detected, system is not working'
    ]
}

In [3]:
# Convert the data to a DataFrame
df = pd.DataFrame(data)

In [4]:
# Step 1: Encode the labels (issue categories)
label_encoder = LabelEncoder()
df['issue_encoded'] = label_encoder.fit_transform(df['issue'])

In [5]:
# Step 2: Train/Test Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['description'].tolist(), df['issue_encoded'].tolist(), test_size=0.2, random_state=42
)

In [6]:
# Step 3: Load Pre-trained BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")

In [8]:
# Step 4: Convert the tokenized data into a Dataset object
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': torch.tensor(train_labels)
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': torch.tensor(val_labels)
})

In [9]:
# Step 5: Load Pre-trained BERT Model for Sequence Classification
num_classes = len(df['issue_encoded'].unique())  # Number of unique labels
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Step 6: Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluate every epoch
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)



In [11]:
# Step 7: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [12]:
# Step 8: Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.63804
2,No log,1.609373
3,No log,1.625507


TrainOutput(global_step=6, training_loss=1.5876228014628093, metrics={'train_runtime': 19.0239, 'train_samples_per_second': 1.892, 'train_steps_per_second': 0.315, 'total_flos': 240506427096.0, 'train_loss': 1.5876228014628093, 'epoch': 3.0})

In [13]:
# Step 9: Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 1.6255072355270386, 'eval_runtime': 0.4044, 'eval_samples_per_second': 7.418, 'eval_steps_per_second': 2.473, 'epoch': 3.0}


In [14]:
# Step 10: Inference - Test the model with new inputs
def predict_issue(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")
    
    # Get the model predictions
    outputs = model(**inputs)
    
    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    # Get the predicted class
    predicted_class = torch.argmax(probs, dim=1).item()
    
    # Get the predicted issue label
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]
    
    # Print the predicted issue and probabilities
    print(f"Predicted issue: {predicted_label}")
    print(f"Probabilities: {probs.detach().numpy()}")

In [15]:
# Test the model with a sample input
sample_text = "The printer is not working and showing an error"
predict_issue(sample_text)

Predicted issue: printer error
Probabilities: [[0.20034431 0.18646201 0.21011232 0.22858734 0.17449398]]


In [16]:
# Test the model with a sample input
sample_text = "User is unable to submit the print to printer"
predict_issue(sample_text)

Predicted issue: printer error
Probabilities: [[0.21846445 0.17536743 0.206685   0.22315246 0.1763306 ]]


In [17]:
# Test the model with a sample input
sample_text = "User is unable to submit the print from computer"
predict_issue(sample_text)

Predicted issue: printer error
Probabilities: [[0.22299306 0.17284948 0.20833552 0.22482087 0.17100108]]


In [18]:
# Test the model with a sample input
sample_text = "unable to access print services"
predict_issue(sample_text)

Predicted issue: network failure
Probabilities: [[0.22994022 0.17139538 0.23454618 0.20097023 0.16314799]]
