In [1]:
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

fb_bert_large_mnli = pipeline("zero-shot-classification",
                              model="facebook/bart-large-mnli")

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

deberta_tokenizer = AutoTokenizer.from_pretrained('s-nlp/deberta-large-formality-ranker')
deberta_model = AutoModelForSequenceClassification.from_pretrained('s-nlp/deberta-large-formality-ranker')
deberta_model.to(device)
deberta_model.eval()

roberta_tokenizer = AutoTokenizer.from_pretrained('s-nlp/roberta-base-formality-ranker')
roberta_model = AutoModelForSequenceClassification.from_pretrained('s-nlp/roberta-base-formality-ranker')
roberta_model.to(device)
roberta_model.eval()



config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use mps:0


tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [2]:
from datasets import load_dataset

data = load_dataset("oishooo/formality_classification", split="train").to_pandas()

README.md:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


formality_dataset.csv:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2523 [00:00<?, ? examples/s]

In [3]:
from tqdm import tqdm

tqdm.pandas()

data["fb_class"] = data["text"].progress_apply(
    lambda x: fb_bert_large_mnli(x, ["informal", "neutral", "formal"])["labels"][0])

  1%|          | 21/2523 [00:14<29:09,  1.43it/s] 


KeyboardInterrupt: 

In [14]:
def process_in_batches(model, tokenizer, dataframe, batch_size=32):
    all_scores = []

    for start_idx in tqdm(range(0, len(dataframe), batch_size)):
        end_idx = min(start_idx + batch_size, len(dataframe))
        batch = dataframe.iloc[start_idx:end_idx]

        texts = batch["text"].tolist()
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        batch_scores = outputs["logits"].softmax(dim=-1)[:, 0].cpu().numpy()
        all_scores.extend(batch_scores.tolist())

    return all_scores

In [31]:
import pandas as pd

def classify_formality_score_deberta(score):
    """An observation: while looking at the classification scores, I noticed that the "formal" label is very prevalent for this model (more than 70%), while in reality formal samples make up less 40% in the dataset. All the samples that were assigned "formal" by the model did so with 0.98-0.99 certainty. 
    """
    if score >= 0.7:
        return "formal"
    elif score < 0.3:
        return "informal"
    else:
        return "neutral"
deberta_scores = process_in_batches(deberta_model, deberta_tokenizer, data, 32)
data["deberta_class"] = pd.Series(deberta_scores).apply(classify_formality_score_deberta)

In [15]:
def classify_formality_score_roberta(score):
    """This may seem weird, since the if-else in this case is the opposite of deberta, but only this setting makes sense in the context of the results. If set up as in deberta, the results are unbelievably low and confusion matrix shows the mirror of the
    current confusion matrix (so it consistently classifies formal as informal and vice versa), meaning that even though both models are from the same publisher, the meaning of logits is switched, and it's not specified anywhere in the documentation.
    """
    if score < 0.3:
        return "formal"
    elif score >= 0.7:
        return "informal"
    else:
        return "neutral"

roberta_scores = process_in_batches(roberta_model, roberta_tokenizer, data, 32)
data["roberta_class"] = pd.Series(roberta_scores).apply(classify_formality_score_roberta)

In [9]:
data.to_csv("with_predictions.csv", index=False)