# Reuse trained models

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSeq2SeqLM, AutoConfig, pipeline
import pandas as pd
import re
import string
import torch
from sklearn import set_config

## RoBERTa Model

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

Documentation RoBERTa m: https://huggingface.co/docs/transformers/main/en/model_doc/roberta

In [3]:
roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
roberta_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [4]:
roberta_model.config

RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment-latest",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Negative",
    "1": "Neutral",
    "2": "Positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Negative": 0,
    "Neutral": 1,
    "Positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

**Try the model with some data, to sentiment prediction:**

In [8]:
X_pred = pd.read_csv("../raw_data/tweets.csv")
X_pred.rename(columns = {'2022-05-31 15:38:40.948508':'date', "@Apple Pricey. That's iPhone.": "text" }, inplace = True)
X_pred.head()

Unnamed: 0,date,text
0,2022-05-31 15:38:40.948508,No there is nothing you can immutably claim at...
1,2022-05-31 15:38:40.948508,@Brueck1988 @Apple Might need to call for a tu...
2,2022-05-31 15:38:40.948508,Guys I have a question. If @Apple states that ...
3,2022-05-31 15:38:40.948508,@Apple is also happy to give access to your ac...
4,2022-05-31 15:38:40.948508,"//End of Thread// If you enjoyed this story, ..."


In [9]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation,
    remove words containing numbers, remove numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = ''.join([i for i in text if not i.isdigit()])
    return text

In [10]:
X_pred["text"] = X_pred["text"].apply(clean_text)
X_pred.head()

Unnamed: 0,date,text
0,2022-05-31 15:38:40.948508,no there is nothing you can immutably claim at...
1,2022-05-31 15:38:40.948508,apple might need to call for a tutorial on th...
2,2022-05-31 15:38:40.948508,guys i have a question if apple states that th...
3,2022-05-31 15:38:40.948508,apple is also happy to give access to your acc...
4,2022-05-31 15:38:40.948508,end of thread if you enjoyed this story make ...


In [13]:
ls_to_pred = X_pred["text"].tolist()
ls_to_pred

['no there is nothing you can immutably claim at apple you are permanently at the visitor center nothing else johnny ives told you you have your  prostitute you are immutably done thanks',
 ' apple might need to call for a tutorial on that laterbout to lose my damn mind',
 'guys i have a question if apple states that the reason why they are not including any charging bricks on their iphones but for ipads they are including one just because its so expensive so where is saving the environment in that anyone thank you',
 'apple is also happy to give access to your accounts for years i can prove for example that as you are typing into the notes app pages app and many others that the govt can watch you as you do it in real time it is more than unencrypted icloud backups more on this soon…',
 'end of thread  if you enjoyed this story make sure to rt and follow mbitpodcast lets see if we can get this in the hands of ericsyuan 🚀  cc zoom webex cisco microsoft apple emergencecap yahoobusiness b

In [14]:
inputs = roberta_tokenizer(ls_to_pred[0], return_tensors="pt")

with torch.no_grad():
    logits = roberta_model(**inputs).logits

predicted_class_id = logits.argmax().item()
roberta_model.config.id2label[predicted_class_id]

'Neutral'

In [None]:
create_token_type_ids_from_seque

In [18]:
padded_sequences_inputs = roberta_tokenizer(ls_to_pred, padding=True, return_tensors="pt")
padded_sequences_inputs

{'input_ids': tensor([[    0,  2362,    89,  ...,     1,     1,     1],
        [    0, 15162,   429,  ...,     1,     1,     1],
        [    0,  5521,  2459,  ...,     1,     1,     1],
        ...,
        [    0, 12864, 45701,  ...,     1,     1,     1],
        [    0, 25800,   630,  ...,     1,     1,     1],
        [    0,   119,  4348,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [19]:
with torch.no_grad():
    logits = roberta_model(**padded_sequences_inputs).logits

predicted_class_id = logits.argmax().item()
roberta_model.config.id2label[predicted_class_id]

KeyError: 3191