**Caso de uso: Como mejorar con embeddings - PARTE 4**

In [1]:
import torch
import pandas as pd
from torch import nn
from transformers import BartTokenizer, BartModel
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
bart_model = BartModel.from_pretrained("facebook/bart-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [3]:
df = pd.read_csv("/content/drive/My Drive/spam/data/SMSSpamCollection",
                 sep="\t",
                 names=["type", "message"])
df["spam"] = df["type"] == "spam"
df.drop("type", axis=1, inplace=True)
print(df.head())
print(df.shape)

                                             message   spam
0  Go until jurong point, crazy.. Available only ...  False
1                      Ok lar... Joking wif u oni...  False
2  Free entry in 2 a wkly comp to win FA Cup fina...   True
3  U dun say so early hor... U c already then say...  False
4  Nah I don't think he goes to usf, he lives aro...  False
(5572, 2)


In [4]:
df_train = df.sample(frac=0.8, random_state=0)
print(df_train.shape)
df_val = df.drop(index=df_train.index)
print(df_val.shape)

def convert_to_embeddings(messages):
  embeddings_list = []
  for message in tqdm(messages):
    out = tokenizer([message],
                    padding=True,
                    max_length=512,
                    truncation=True,
                    return_tensors="pt"
                    )
    with torch.no_grad():
      bart_model.eval()

      pred = bart_model(
          input_ids=out["input_ids"],
          attention_mask=out["attention_mask"]
          )
      embeddings = pred.last_hidden_state.mean(dim=1).reshape((-1))
      embeddings_list.append(embeddings)

  return torch.stack(embeddings_list)


(4458, 2)
(1114, 2)


In [5]:

X_train = convert_to_embeddings(df_train["message"].tolist())
X_val = convert_to_embeddings(df_val["message"].tolist())



100%|██████████| 4458/4458 [13:50<00:00,  5.36it/s]
100%|██████████| 1114/1114 [03:37<00:00,  5.13it/s]


In [6]:
y_train = torch.tensor(df_train["spam"].values, dtype=torch.float32).reshape((-1, 1))
y_val = torch.tensor(df_val["spam"].values, dtype=torch.float32).reshape((-1, 1))

In [None]:
model_hidden = nn.Linear(768, 2)
model_output = nn.Linear(2, 1)
loss_fn = torch.nn.BCEWithLogitsLoss()
parameters = list(model_hidden.parameters()) + list(model_output.parameters())
optimizer = torch.optim.SGD(parameters, lr=0.02)

In [None]:
for i in range(0, 10000):
    # Training pass
    optimizer.zero_grad()
    outputs = model_hidden(X_train)
    outputs = nn.functional.sigmoid(outputs)
    outputs = model_output(outputs)
    print(outputs.shape)
    print(y_train.shape)
    loss = loss_fn(outputs, y_train)
    loss.backward()
    optimizer.step()

    if i % 1000 == 0:
        print(loss)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size([4458, 1])
torch.Size(

In [None]:
def evaluate_model(X, y):
    model_hidden.eval()
    model_output.eval()
    with torch.no_grad():
        outputs = model_hidden(X)
        outputs = nn.functional.sigmoid(outputs)
        outputs = model_output(outputs)

        y_pred = nn.functional.sigmoid(outputs) > 0.25

        print("accuracy:", (y_pred == y)\
            .type(torch.float32).mean())

        print("sensitivity:", (y_pred[y == 1] == y[y == 1])\
            .type(torch.float32).mean())

        print("specificity:", (y_pred[y == 0] == y[y == 0])\
            .type(torch.float32).mean())

        print("precision:", (y_pred[y_pred == 1] == y[y_pred == 1])\
            .type(torch.float32).mean())

print("Evaluating on the training data")
evaluate_model(X_train, y_train)

print("Evaluating on the validation data")
evaluate_model(X_val, y_val)

Evaluating on the training data
accuracy: tensor(0.9973)
sensitivity: tensor(0.9951)
specificity: tensor(0.9977)
precision: tensor(0.9853)
Evaluating on the validation data
accuracy: tensor(0.9946)
sensitivity: tensor(0.9856)
specificity: tensor(0.9959)
precision: tensor(0.9716)


In [None]:
X_custom = convert_to_embeddings([
    "We have released a new feature for your product and you have been selected to try it!",
    "We have released a new product to improve your sales, do you want to try it",
    "Winner! Great deal, call us to get this product for free",
    "Tomorrow is my birthday, do you come to the party?"
])
model_hidden.eval()
model_output.eval()
with torch.no_grad():
    o = model_hidden(X_custom)
    o = nn.functional.sigmoid(o)
    o = model_output(o)
    pred = nn.functional.sigmoid(o)
    print(pred)

100%|██████████| 4/4 [00:00<00:00,  5.69it/s]

tensor([[0.8258],
        [0.4447],
        [0.8802],
        [0.0062]])



