In [2]:
import pandas as pd

In [6]:
from datasets import load_dataset

# Fetch the formality-score dataset from the Hugging Face Hub; the download
# log shows it arrives as a train split and a test split.
ds = load_dataset(path="osyvokon/pavlick-formality-scores")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.90k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/937k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/202k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9274 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
# Inspect the raw per-split tables behind the loaded dataset to see which
# columns (and column types) are available before converting to pandas.
ds.data

{'train': MemoryMappedTable
 domain: string
 avg_score: double
 sentence: string
 ----
 domain: [["news","news","answers","news","news",...,"answers","answers","news","answers","answers"],["answers","answers","news","blog","answers",...,"answers","blog","answers","blog","answers"],...,["news","blog","answers","blog","answers",...,"answers","answers","answers","news","news"],["answers","answers","news","email","answers",...,"answers","answers","answers","answers","news"]]
 avg_score: [[-0.6,1,-2.8,0,1.8,...,-2.8,-1.8,1.2,0.2,-0.6],[-2,0.2,1.2,-1.4,-1.75,...,-2.4,1.2,-0.4,1.4,-2],...,[0.6,-0.4,-0.6,0.6,0.8,...,-2,-0.6,0.8,1,1.4],[0.4,-0.5,0.8,1.6,-1.6,...,-2.2,-1,-1.8,-3,2]]
 sentence: [["Tang was employed at private-equity firm Friedman Fleischer & Lowe.","San Francisco Mayor Gavin Newsom's withdrawal from the governor's race followed a meeting with top advisers where he was told that, unless he raised $5 million quickly and appeared to be viable, some of the state's biggest unions woul

Since we are not going to train anything, we don't need the train/test split, so let's merge the two splits and convert the result into a single DataFrame.

In [8]:
# Turn every split into its own DataFrame first, then stack them into one
# frame with a fresh 0..n-1 index so rows from different splits don't clash.
split_frames = [pd.DataFrame(ds[split_name]) for split_name in ds]
df = pd.concat(split_frames, ignore_index=True)

In [9]:
# Preview the first rows of the merged frame to confirm the expected columns
# (domain, avg_score, sentence) came through.
df.head()

Unnamed: 0,domain,avg_score,sentence
0,news,-0.6,Tang was employed at private-equity firm Fried...
1,news,1.0,San Francisco Mayor Gavin Newsom's withdrawal ...
2,answers,-2.8,lol nothing worrying about that.
3,news,0.0,She told Price she wanted to join the Police E...
4,news,1.8,The prime minister is keen to use the autumn p...


Let's convert the avg_score column to binary labels so that we can use classification models. It makes sense to drop values that are close to zero, since they are closer to neutral than to belonging to either class.

In [10]:
# Inclusive [low, high] band of scores considered too close to zero to be
# assigned to either class.
threshold = [-0.5, 0.5]

def change_to_binary(x):
    """Map a continuous score to a class label.

    Parameters
    ----------
    x : float
        The averaged score of one sentence.

    Returns
    -------
    int
        ``1``  if ``x`` is strictly above the upper threshold,
        ``0``  if ``x`` is strictly below the lower threshold,
        ``-1`` if ``x`` lies inside the inclusive neutral band or is missing
        (``NaN``/``None``); rows labelled ``-1`` are meant to be discarded.
    """
    # A NaN compares False against everything, so without this guard a missing
    # score would fall through to the final branch and be labelled 0.
    if pd.isna(x):
        return -1

    low, high = threshold
    if low <= x <= high:
        return -1
    return 1 if x > high else 0

In [11]:
# Attach a class label to every row based on its averaged score.
df['label'] = df['avg_score'].apply(change_to_binary)

# Rows marked -1 fall inside the neutral band; keep only the rest.
# The index is left as-is, so it retains gaps where rows were removed.
keep_mask = df['label'].ne(-1)
df = df.loc[keep_mask]

df.head()

Unnamed: 0,domain,avg_score,sentence,label
0,news,-0.6,Tang was employed at private-equity firm Fried...,0
1,news,1.0,San Francisco Mayor Gavin Newsom's withdrawal ...,1
2,answers,-2.8,lol nothing worrying about that.,0
4,news,1.8,The prime minister is keen to use the autumn p...,1
5,blog,1.0,Those competencies include mastering fundament...,1


In [50]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Tokenizer and classifier are loaded from the same checkpoint.
roberta_checkpoint = "s-nlp/roberta-base-formality-ranker"
tokenizer_roberta = AutoTokenizer.from_pretrained(roberta_checkpoint)
model_roberta = AutoModelForSequenceClassification.from_pretrained(roberta_checkpoint)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_roberta.to(device)

size = 128  # number of sentences sent through the model per forward pass

sentences = df["sentence"].tolist()
predictions_roberta = []
for start in range(0, len(sentences), size):
    # List slicing clamps at the end, so the final batch is simply shorter.
    batch = sentences[start:start + size]
    encoded = tokenizer_roberta(batch, padding=True, truncation=True,
                                return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model_roberta(**encoded).logits

    # Predicted class = index of the highest class probability.
    class_probs = torch.softmax(logits, dim=-1)
    predictions_roberta.extend(class_probs.argmax(dim=1).tolist())

In [61]:
# Tokenizer and classifier are loaded from the same checkpoint.
deberta_checkpoint = "s-nlp/deberta-large-formality-ranker"
tokenizer_deberta = AutoTokenizer.from_pretrained(deberta_checkpoint)
model_deberta = AutoModelForSequenceClassification.from_pretrained(deberta_checkpoint)
model_deberta.to(device)

predictions_deberta = []
for start in range(0, len(sentences), size):
    # List slicing clamps at the end, so the final batch is simply shorter.
    batch = sentences[start:start + size]
    encoded = tokenizer_deberta(batch, padding=True, truncation=True,
                                return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model_deberta(**encoded).logits

    # Predicted class = index of the highest class probability.
    class_probs = torch.softmax(logits, dim=-1)
    predictions_deberta.extend(class_probs.argmax(dim=1).tolist())

In [77]:
# Eyeball the first ten predicted class ids of each model, RoBERTa first,
# to compare them against the gold labels shown earlier.
for model_predictions in (predictions_roberta, predictions_deberta):
    print(model_predictions[:10])

[0, 1, 0, 1, 1, 1, 0, 0, 1, 1]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


It seems the labels for DeBERTa are inverted relative to ours, so let's flip them to match our convention.

In [78]:
# Swap class ids 0 <-> 1 so the DeBERTa predictions follow the same label
# convention as df['label'] (the inversion was judged from the sample above).
# NOTE: this overwrites its own input, so the cell is not idempotent - running
# it a second time flips the predictions back. Re-run the inference cell first.
predictions_deberta = [1 - class_id for class_id in predictions_deberta]

In [82]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Gold labels, in the same row order the sentences were fed to the models.
results = df["label"].tolist()


def _binary_scores(y_true, y_pred):
    """Return (precision, recall, F1) for predictions against gold labels."""
    return (
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


precision_roberta, recall_roberta, f1_roberta = _binary_scores(results, predictions_roberta)

print("RoBERTa base scores:")
print(f"Precision: {precision_roberta:.4f}")
print(f"Recall: {recall_roberta:.4f}")
print(f"F1 Score: {f1_roberta:.4f}")

precision_deberta, recall_deberta, f1_deberta = _binary_scores(results, predictions_deberta)

print("\nDeBERTa large scores:")
print(f"Precision: {precision_deberta:.4f}")
print(f"Recall: {recall_deberta:.4f}")
print(f"F1 Score: {f1_deberta:.4f}")

RoBERTa base scores:
Precision: 0.8342
Recall: 0.9699
F1 Score: 0.8969

DeBERTa large scores:
Precision: 0.7125
Recall: 0.9719
F1 Score: 0.8223


In [29]:
# Class balance of the remaining rows: how many were labelled 1 (score above
# the upper threshold) versus 0 (score below the lower threshold).
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,4379
0,3994
