* This notebook shows how to use the trained model to check which descriptions are accessible and which are not.

* At the end, it computes the overall percentage of descriptions that are accessible and the percentage that are not.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Before running this cell in Google Colab, upload the files that were saved
# when the model was trained and put them together in a new folder; the path
# of that folder is the value to assign to model_dir.

model_dir = "path to the folder"

# The tokenizer and the classifier are both read from that folder.
# local_files_only=True restricts loading to files already on disk.
load_options = {"local_files_only": True}
tokenizer = AutoTokenizer.from_pretrained(model_dir, **load_options)
model = AutoModelForSequenceClassification.from_pretrained(model_dir, **load_options)

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Place the model on the selected device and put it in evaluation mode
# so that later cells can use it for inference.
model.to(device)
model.eval()

print(f"Model loaded and using device: {device}")

In [None]:
# Prediction of some examples for testing
#
# Classifies a few hand-written sentences with the `tokenizer`, `model` and
# `device` created in the model-loading cell, then prints a table with one row
# per sentence: the text, the predicted label and the probability of each class.

import pandas as pd

text_list = [
    "This is an example.",
    "Another input sentence goes here.",
    "Example of something else."
]
max_length = 64  # token limit passed to the tokenizer; longer inputs are truncated
batch_size = 32  # number of texts sent through the model per forward pass

results = []
for i in range(0, len(text_list), batch_size):
    batch_texts = text_list[i:i+batch_size]
    encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # The input tensors have to be on the same device as the model.
    encoded = {k: v.to(device) for k, v in encoded.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**encoded)
        probs = torch.softmax(outputs.logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)

    # Bring the whole batch back to the CPU once instead of calling .item()
    # on individual elements.
    batch_preds = preds.cpu().tolist()
    batch_probs = probs.cpu().tolist()

    for text, label, class_probs in zip(batch_texts, batch_preds, batch_probs):
        row = {"text": text, "predicted_label": label}
        # One "prob_class_<k>" column per class the model outputs, so the cell
        # works for any number of labels (for a two-class model this yields
        # exactly prob_class_0 and prob_class_1).
        for class_idx, class_prob in enumerate(class_probs):
            row[f"prob_class_{class_idx}"] = round(class_prob, 4)
        results.append(row)

result_df = pd.DataFrame(results)
print(result_df)


In [None]:
# Prediction of a CSV file with data
#
# Reads the CSV at csv_path, classifies every value of its "description"
# column with the `tokenizer`, `model` and `device` created in the
# model-loading cell, and prints how many descriptions received label 1
# (accessible) and label 0 (inaccessible), both as counts and percentages.
#
# Raises ValueError when the CSV has no "description" column or no rows.

import pandas as pd

csv_path = "/content/14. Personal-Accounts-Results.csv"
max_length = 64  # token limit passed to the tokenizer; longer inputs are truncated
batch_size = 32  # number of descriptions sent through the model per forward pass

df = pd.read_csv(csv_path)
if "description" not in df.columns:
    raise ValueError("CSV file must contain a 'description' column.")
# NOTE(review): with an object-dtype column, astype(str) presumably turns empty
# cells (NaN) into the literal text "nan", which is then classified like any
# other description - confirm whether missing descriptions should be handled
# separately.
texts = df["description"].astype(str).tolist()
if not texts:
    # Without this check the percentage computation below would fail with a
    # ZeroDivisionError on an empty file.
    raise ValueError("CSV file contains no descriptions to classify.")

results = []
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # The input tensors have to be on the same device as the model.
    encoded = {k: v.to(device) for k, v in encoded.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**encoded)
        # Only the predicted label is used here, so take the argmax of the
        # logits directly; softmax is monotonic and would not change it.
        preds = torch.argmax(outputs.logits, dim=-1)

    results.extend(preds.cpu().tolist())

# Compute statistics

total = len(results)
accessible = results.count(1)
inaccessible = results.count(0)

print(f"Total descriptions: {total}")
print(f"Accessible (label 1): {accessible} ({accessible / total:.2%})")
print(f"Inaccessible (label 0): {inaccessible} ({inaccessible / total:.2%})")
