# RoBERTa for sentence classification

### download pretrained files:

In [None]:
import json
import numpy as np

import mindspore as ms
import mindspore.ops as ops

from mindnlp.models.roberta import RobertaConfig
from mindnlp.transforms import RobertaTokenizer
from mindnlp.models import RobertaForSequenceClassification

from corpus_cleaning_kit import en_cleaning

In [None]:
# Read the detector's hyper-parameters from JSON, then build the model config
# from them once the file has been closed.
with open('roberta_detector.json', 'r') as config_file:
    raw_config = json.load(config_file)
config = RobertaConfig(**raw_config)

In [None]:
# Build the classifier from the config, then fill in its weights from the
# checkpoint file. strict_load=True is passed through to load_checkpoint.
model = RobertaForSequenceClassification(config)
ms.load_checkpoint('roberta_detector.ckpt', net=model, strict_load=True)

In [None]:
# Build the tokenizer from the local vocabulary file.
tokenizer = RobertaTokenizer('roberta_vocab.json')
# NOTE(review): this writes a private attribute; presumably it makes
# `tokenizer.pad_token_id` (used when padding below) resolve to id 1 --
# confirm against the mindnlp tokenizer implementation.
tokenizer._pad_token = 1

In [None]:
max_sequence_length = 512  # every encoded input is cropped/padded to this length
seed = 0
rng = np.random.RandomState(seed)  # seeded so the random crop is reproducible

def tokenize_truncate_pad(text: str) -> tuple:
    """Encode ``text`` into fixed-length token ids plus an attention mask.

    The text is tokenized with the module-level ``tokenizer``. If it yields
    more than ``max_sequence_length`` ids, a random contiguous window of that
    length is kept (start offset drawn from the module-level ``rng``);
    otherwise the ids are right-padded with ``tokenizer.pad_token_id``.

    Args:
        text: The raw string to encode.

    Returns:
        A ``(tokens, mask)`` pair of ``ms.Tensor`` objects, each with shape
        ``(1, max_sequence_length)``. ``mask`` is int32, with 1 at positions
        holding real tokens and 0 at padded positions.
    """
    tokens = tokenizer.encode(text).ids
    output_length = min(len(tokens), max_sequence_length)

    if len(tokens) > output_length:
        # RandomState is not callable; randint draws from [0, high), so the
        # last valid start offset (len - output_length) is included.
        start = int(rng.randint(0, len(tokens) - output_length + 1))
    else:
        start = 0
    tokens = tokens[start:start + output_length]

    padding = [tokenizer.pad_token_id] * (max_sequence_length - len(tokens))
    # Build the mask from the two lengths. Zeroing a `[-len(padding):]` slice
    # would wipe the entire mask when there is no padding, since `-0` == `0`.
    mask = ms.Tensor([1] * len(tokens) + [0] * len(padding), ms.int32)
    tokens = ms.Tensor(tokens + padding)

    # Add a leading batch axis of size 1 to both tensors.
    return tokens[None, ...], mask[None, ...]

In [None]:
# Sample inputs for the detector. Each sentence must be its own list element:
# without the separating comma, adjacent string literals are concatenated
# into a single string.
test_sentences = [
    "They are not colored . Just as white paint is usually made from minerals found in clay . The crystals in white paint reflects all light equally making it appear white . Just liek snow . Primarily the eye color is based on the density and distribution of melanin in the eye . It just looks a certain color when light illuminates the eye . It reflects light unqually .",
    "Piracy and copyright law can be contentious issues on the internet because they involve complex questions about how to balance the rights of creators and the interests of consumers. Some people argue that artists should have the right to control how their works are distributed and to charge what they feel is appropriate, while others believe that the free exchange of information is important and that artists should not be able to control how their works are used. It's important to remember that copyright law exists to protect the rights of creators and to encourage the creation of new works by ensuring that artists can earn a fair income from their creations. When someone pirates (unauthorized copying) or uses a copyrighted work without permission, they are taking something that belongs to someone else and using it for their own benefit, without paying the person who created it. This can be seen as unfair to the creator and can discourage them from creating new works in the future. At the same time, it's also important to recognize that not everyone has the same access to information and that copyright laws can sometimes make it difficult or impossible for people to access the works they want to use. This is why it's important to have a balance between protecting the rights of creators and ensuring that everyone has access to the information and works they need.",
]
# One label per sentence above (presumably indices into label_to_meaning).
test_labels = [0, 1]
# Printable name for each class index.
label_to_meaning = ['human', 'machine']

In [None]:
# Classify each sample sentence and print the predicted label together with
# the probability assigned to that label.
for sentence in test_sentences:
    cleaned = en_cleaning(sentence)  # presumably normalizes the English text
    input_ids, attention_mask = tokenize_truncate_pad(cleaned)
    outputs = model(input_ids, attention_mask=attention_mask, labels=None)
    # The model returns a tuple whose first item holds the logits; softmax
    # over the last axis, then drop the batch axis.
    probs = ops.softmax(outputs[0], axis=-1)[0]
    # 1 when the second class is strictly more probable than the first, else 0.
    result = (probs[1] > probs[0]).astype(ms.int32)
    print('result:', label_to_meaning[result])
    print('prob: %.2f' % probs[result])