In [1]:
!pip install transformers datasets scikit-learn



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


2024-07-05 09:51:14.140838: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 09:51:14.141006: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 09:51:14.280892: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load and preprocess the dataset
df = pd.read_csv("/kaggle/input/ticket/data.csv")
df = df.drop(columns=['Unnamed: 0'])
df['sentence'] = df['clean_subject'] + ' ' + df['clean_description']
df = df[df['ticket_type'] != 'Problem']  # Drop Class Problem
df.dropna(subset=['sentence'], inplace=True)
df = df[['sentence', 'ticket_type']]

In [4]:
# Encode the labels
label2id = {label: i for i, label in enumerate(df['ticket_type'].unique())}
id2label = {i: label for label, i in label2id.items()}
df['label'] = df['ticket_type'].map(label2id)

In [5]:
# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a DatasetDict
datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [10]:
# Load the model and tokenizer
model_name = "papluca/xlm-roberta-base-language-detection"
num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at papluca/xlm-roberta-base-language-detection and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([20]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([20, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/176680 [00:00<?, ? examples/s]

Map:   0%|          | 0/44171 [00:00<?, ? examples/s]



In [11]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

03e121b5249ddc41bae22e9ba18fbb65bed41b1e

In [None]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


In [None]:
# Save the model
model.save_pretrained("./xlm-model")
tokenizer.save_pretrained("./xlm-model")

## **model performance**

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Function to compute metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Evaluate the model
results = trainer.evaluate()

print("Evaluation results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

# Compute detailed metrics
test_preds = trainer.predict(tokenized_datasets["test"])
metrics = compute_metrics(test_preds)

print("\nDetailed metrics:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")


## **model interpretability**

In [None]:
from transformers import pipeline
from captum.attr import IntegratedGradients
import torch

# Load the fine-tuned model and tokenizer
model_path = "./xlm-model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a pipeline for text classification
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to interpret model predictions
def interpret_text(text):
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    def forward_pass(inputs):
        outputs = model(**inputs)
        return torch.softmax(outputs.logits, dim=1)

    integrated_gradients = IntegratedGradients(forward_pass)
    attributions, delta = integrated_gradients.attribute(inputs['input_ids'], return_convergence_delta=True)
    attributions_sum = attributions.sum(dim=2).squeeze(0)
    
    # Tokenize input text
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().tolist())
    
    # Get prediction
    prediction = nlp(text)
    
    return tokens, attributions_sum, prediction

# Example usage
example_text = "Your example sentence here"
tokens, attributions, prediction = interpret_text(example_text)

print(f"Text: {example_text}")
print(f"Prediction: {prediction}")
print("\nToken attributions:")
for token, attribution in zip(tokens, attributions):
    print(f"{token}: {attribution.item():.4f}")
