# TXAI - Final Project

**Students:** Victor Barreiro - Maximiliano Hormazábal

## Install dependencies

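*Install the packages listed in `requirements.txt` (for example, `pip install -r requirements.txt`) before running this notebook.*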

## Import Dependencies

In [1]:
#Import own functions
from txai_utils import *

# Libraries
import time
import pandas as pd
import numpy as np
import transformers
import shap
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_dataset, Value, Features, ClassLabel, load_metric
import torch
import torch.nn as nn

# To train an unbalanced model we have to overwrite the trainer class and create
# a custom trainer that takes into account the unbalance (ratio between the classes)

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the notebook-level global ``cw`` every
    time the loss is computed, so ``cw`` must be assigned (one weight per label,
    indexed by label id) before ``train()`` or ``evaluate()`` is called.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained/evaluated; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry plus the
                model's forward arguments.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Ignored. Accepted so that Trainer versions which pass
                extra keywords (e.g. ``num_items_in_batch``) keep working.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple if
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass (labels stay in `inputs`, exactly as before)
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Create the weights on the same device as the logits: a hard-coded
        # 'cpu:0' raises a device-mismatch error whenever the Trainer has moved
        # the model to an accelerator.
        weights = torch.tensor(cw, dtype=torch.float32, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

## False Claims Model

### Training a DistilBERT (`distilbert-base-uncased`) model

In [3]:
csv_path = './data/fact_check_bool.csv'
df = pd.read_csv(csv_path, sep=",")

# Set the name of the file containing the dataset to be used
data = csv_path
text_column = 'claims'
target_column = 'veracity'
pretrained_model = 'distilbert-base-uncased'

# Balanced class weights: n_samples / (n_classes * n_samples_j), listed in label
# order (0, 1). CustomTrainer.compute_loss reads them from the global `cw`.
# The counts are computed once and looked up by label, as before.
label_counts = dict(df[target_column].value_counts())
counts = [label_counts[0], label_counts[1]]
n_samples = counts[0] + counts[1]
n_classes = len(counts)
class_weights = [n_samples / (n_classes * n_samplesj) for n_samplesj in counts]
cw = class_weights

# Load the tokenizer to be used for processing the text data
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Define the features of the dataset
features = Features({text_column: Value('string'),
                     target_column: ClassLabel(names=['0','1'])})

# Load the dataset from the CSV file using the Hugging Face library
dataset_raw = load_dataset('csv', data_files=data, delimiter=',', features=features)["train"]

# Shuffle the dataset and split it into training and testing sets.
# train_test_split shuffles again internally, so it needs its own seed for the
# split (and therefore the reported metrics) to be reproducible across runs.
dataset = dataset_raw.shuffle(seed=42).train_test_split(test_size=0.15, seed=42)

# Transform the dataset by mapping the 'veracity' column to a new column called 'labels'
# The 'veracity' column is then removed from the dataset
dataset = dataset.map(lambda examples: {'labels': examples[target_column]}, remove_columns=[target_column])

# Tokenize the text data in the dataset using the tokenizer loaded earlier
# `batched=True` makes `map` pass batches of examples to the tokenize function
tokenized_dataset = dataset.map(lambda x: tokenize_function(x,tokenizer=tokenizer,colname=text_column), batched=True)

# Create a data collator, which pads each batch using the tokenizer loaded earlier
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pre-trained DistilBERT model for sequence classification
# `num_labels=2`: the model classifies the text into two labels, true (label '1') or false (label '0')
model_bool = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=2)

# Set the training arguments for the model
training_args = TrainingArguments(
    output_dir="./models/claims",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5
)

start_time = time.time()
trainer_bool = CustomTrainer(
    model_bool,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Evaluate once before training to get a baseline for the untrained classifier head.
# Stored as `eval_results` so the builtin `eval` is not shadowed in the kernel.
eval_results = trainer_bool.evaluate()
train = trainer_bool.train()
end_time = time.time()
print('----------------------------------------------------------------------')
print('Training process suceddfully ended in',end_time - start_time,'seconds')
print('----------------------------------------------------------------------')

# Save the model as file.
# torch.save pickles the whole model object (not only its state_dict); this is
# the file whose path is later handed to generateExplainer.
torch.save(model_bool,'models/claims/model_claims.pt')

print('----------------------------------------------------------------------')
print('Model Saved')
print('----------------------------------------------------------------------')

Found cached dataset csv (/Users/maxhormazabal/.cache/huggingface/datasets/csv/default-15a757d4b437eefc/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /Users/maxhormazabal/.cache/huggingface/datasets/csv/default-15a757d4b437eefc/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-3ebdbf1548ea0df4.arrow


Map:   0%|          | 0/748 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

  0%|          | 0/17 [00:00<?, ?it/s]

load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
Trainer is attempting to log a value of "[0.94023904 0.        ]" of type <class 'numpy.ndarray'> for key "eval/f1c" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: claims. If claims are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 748
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 470
  Number of trainable parameters = 66955010


  0%|          | 0/470 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: claims. If claims are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 133
  Batch size = 8


  0%|          | 0/17 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.93061224 0.19047619]" of type <class 'numpy.ndarray'> for key "eval/f1c" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.6656067371368408, 'eval_accuracy': 0.8721804511278195, 'eval_f1c': array([0.93061224, 0.19047619]), 'eval_runtime': 2.901, 'eval_samples_per_second': 45.846, 'eval_steps_per_second': 5.86, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: claims. If claims are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 133
  Batch size = 8


  0%|          | 0/17 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.92050209 0.2962963 ]" of type <class 'numpy.ndarray'> for key "eval/f1c" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.754928708076477, 'eval_accuracy': 0.8571428571428571, 'eval_f1c': array([0.92050209, 0.2962963 ]), 'eval_runtime': 2.7543, 'eval_samples_per_second': 48.287, 'eval_steps_per_second': 6.172, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: claims. If claims are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 133
  Batch size = 8


  0%|          | 0/17 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.925      0.30769231]" of type <class 'numpy.ndarray'> for key "eval/f1c" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 1.0959995985031128, 'eval_accuracy': 0.8646616541353384, 'eval_f1c': array([0.925     , 0.30769231]), 'eval_runtime': 2.6999, 'eval_samples_per_second': 49.261, 'eval_steps_per_second': 6.297, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: claims. If claims are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 133
  Batch size = 8


  0%|          | 0/17 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.93004115 0.26086957]" of type <class 'numpy.ndarray'> for key "eval/f1c" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 1.2874237298965454, 'eval_accuracy': 0.8721804511278195, 'eval_f1c': array([0.93004115, 0.26086957]), 'eval_runtime': 2.6826, 'eval_samples_per_second': 49.58, 'eval_steps_per_second': 6.337, 'epoch': 4.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: claims. If claims are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 133
  Batch size = 8


  0%|          | 0/17 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.92561983 0.25      ]" of type <class 'numpy.ndarray'> for key "eval/f1c" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 1.3117274045944214, 'eval_accuracy': 0.8646616541353384, 'eval_f1c': array([0.92561983, 0.25      ]), 'eval_runtime': 3.0409, 'eval_samples_per_second': 43.737, 'eval_steps_per_second': 5.59, 'epoch': 5.0}
{'train_runtime': 179.4259, 'train_samples_per_second': 20.844, 'train_steps_per_second': 2.619, 'train_loss': 0.4367903689120678, 'epoch': 5.0}
----------------------------------------------------------------------
Training process suceddfully ended in 182.7359311580658 seconds
----------------------------------------------------------------------
----------------------------------------------------------------------
Model Saved
----------------------------------------------------------------------


In [6]:
# Generate Explainer from model
# `model_path` points at the file written by torch.save in the training cell;
# the second argument is the name of the pretrained checkpoint used for training.
# NOTE(review): generateExplainer is not defined in this notebook - presumably it
# is provided by the `txai_utils` star import; confirm how it uses both arguments.
model_path = './models/claims/model_claims.pt'
explainer = generateExplainer(model_path,'distilbert-base-uncased')

loading configuration file config.json from cache at /Users/maxhormazabal/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/maxhormazabal/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /Users/maxhormazabal/.cache/huggingfa

In [7]:
# Create the visualization of the explainer for one particular instance.
# `text` is a single English claim passed to the explainer built in the previous cell.
# NOTE(review): the quoted passage inside the string opens with a curly quote that is
# never closed - confirm whether that is intended before changing the input.
# NOTE(review): visualizeExplanation is not defined in this notebook - presumably it
# comes from the `txai_utils` star import.
text = "Gov. Kim Reynolds, touting $210 million for Iowa broadband, “failed to mention these are actually federal funds approved by Rep. Cindy Axne and signed into law by President Joe Biden."
visualizeExplanation(explainer,text)

Disabling tokenizer parallelism, we're using DataLoader multithreading already


  0%|          | 0/498 [00:00<?, ?it/s]

Partition explainer: 2it [00:11, 11.36s/it]               


## London Hotels Feedback

In [None]:
csv_path = './data/trip_advisor_data.csv'
df = pd.read_csv(csv_path, sep=",")

# Set the name of the file containing the dataset to be used
data = csv_path
text_column = 'review_coment'
target_column = 'user_rating'
pretrained_model = 'dccuchile/bert-base-spanish-wwm-uncased'

# Balanced class weights: n_samples / (n_classes * n_samples_j), listed in label
# order (0, 1). CustomTrainer.compute_loss reads them from the global `cw`.
# The counts are computed once and looked up by label, as before.
label_counts = dict(df[target_column].value_counts())
counts = [label_counts[0], label_counts[1]]
n_samples = counts[0] + counts[1]
n_classes = len(counts)
class_weights = [n_samples / (n_classes * n_samplesj) for n_samplesj in counts]
cw = class_weights

# Load the tokenizer to be used for processing the text data
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Define the features of the dataset
features = Features({text_column: Value('string'),
                     target_column: ClassLabel(names=['0','1'])})

# Load the dataset from the CSV file using the Hugging Face library
dataset_raw = load_dataset('csv', data_files=data, delimiter=',', features=features)["train"]

# Shuffle the dataset and split it into training and testing sets.
# train_test_split shuffles again internally, so it needs its own seed for the
# split (and therefore the reported metrics) to be reproducible across runs.
dataset = dataset_raw.shuffle(seed=42).train_test_split(test_size=0.15, seed=42)

# Transform the dataset by mapping the 'user_rating' column to a new column called 'labels'
# The 'user_rating' column is then removed from the dataset
dataset = dataset.map(lambda examples: {'labels': examples[target_column]}, remove_columns=[target_column])

# Tokenize the text data in the dataset using the tokenizer loaded earlier
# `batched=True` makes `map` pass batches of examples to the tokenize function
tokenized_dataset = dataset.map(lambda x: tokenize_function(x,tokenizer=tokenizer,colname=text_column), batched=True)

# Create a data collator, which pads each batch using the tokenizer loaded earlier
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pre-trained Spanish BERT checkpoint named in `pretrained_model` for sequence classification
# `num_labels=2`: the model classifies each review into one of two classes ('0' or '1')
model_bool = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=2)

# Set the training arguments for the model
training_args = TrainingArguments(
    output_dir="./models/tripadvisor",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=5
)

start_time = time.time()
trainer_bool = CustomTrainer(
    model_bool,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Evaluate once before training to get a baseline for the untrained classifier head.
# Stored as `eval_results` so the builtin `eval` is not shadowed in the kernel.
eval_results = trainer_bool.evaluate()
train = trainer_bool.train()
end_time = time.time()
print('----------------------------------------------------------------------')
print('Training process suceddfully ended in',end_time - start_time,'seconds')
print('----------------------------------------------------------------------')

# Save the model as file.
# torch.save pickles the whole model object (not only its state_dict); this is
# the file whose path is later handed to generateExplainer.
torch.save(model_bool,'models/tripadvisor/model_trip.pt')

print('----------------------------------------------------------------------')
print('Model Saved')
print('----------------------------------------------------------------------')

In [None]:
# Generate Explainer from model
# `model_path` points at the file written by torch.save in the TripAdvisor training
# cell; the second argument is the name of the pretrained checkpoint used there.
# NOTE(review): generateExplainer is not defined in this notebook - presumably it
# is provided by the `txai_utils` star import; confirm how it uses both arguments.
model_path = './models/tripadvisor/model_trip.pt'
explainer = generateExplainer(model_path,'dccuchile/bert-base-spanish-wwm-uncased')

In [None]:
# Create the visualization of the explainer for one particular instance.
# `text` is a single Spanish-language review passed to the explainer built in the
# previous cell.
# NOTE(review): visualizeExplanation is not defined in this notebook - presumably it
# comes from the `txai_utils` star import.
text = "Horrible experencia en este hotel, basicamente el peor verano de mi vida. Definitivamente no volveré jamás"
visualizeExplanation(explainer,text)