In [1]:
import requests
from transformers import pipeline
from huggingface_hub import configure_http_backend
import os
from huggingface_hub import login
from transformers import AutoTokenizer
import warnings
from dotenv import load_dotenv
import tensorflow as tf

In [2]:
# Disable SSL warnings (optional but not recommended in production)
warnings.filterwarnings("ignore")

def backend_factory() -> requests.Session:
    session = requests.Session()
    session.verify = False
    return session

configure_http_backend(backend_factory=backend_factory)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

tf.get_logger().setLevel('ERROR')

In [3]:
# GET NEW FROM SCRAPPING

In [4]:
new = (
"La amenaza de aranceles de Trump al vino europeo desata la alarma en el sector"
"Guerra comercial"
"EE.UU. amaga con tasas del 200%; Catalunya y Andalucía, entre las grandes perjudicadas"
)

In [5]:
# SENTIMENT ANALYSIS

In [6]:

hf_token = os.getenv('TOKEN_HF')

# configure classification to classify sentiment
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# execute classification
sentiment_analysis_result = classifier(new)

print("sentiment analysis: ",sentiment_analysis_result)

Device set to use cpu


sentiment analysis:  [{'label': 'NEGATIVE', 'score': 0.9005516767501831}]


In [7]:
# TOKENIZATION

In [8]:
# Load phi3 tokenization model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# phrase to tokenize
texto = new

# tokenize the text and get input ids
tokens = tokenizer(texto)['input_ids']
print("token IDs: ",tokens)
print()

frase_original = tokenizer.decode(tokens)

# show original text
print("original sentence: "+frase_original)
print()

vocabulario = tokenizer.get_vocab()

# show vocabulary
print(f"Tokenizer Vocab: {len(vocabulario)}")
print("Some vocabulary examples:")
for palabra, indice in list(vocabulario.items())[:10]:
    print(f"{palabra}: {indice}")

token IDs:  [101, 2474, 2572, 8189, 4143, 2139, 19027, 5897, 4244, 2139, 8398, 2632, 19354, 2080, 2885, 2080, 4078, 6790, 2474, 8598, 2050, 4372, 3449, 4753, 9077, 11335, 2272, 11890, 4818, 4402, 1012, 1057, 2226, 1012, 25933, 3654, 9530, 11937, 20939, 3972, 3263, 1003, 1025, 4937, 2389, 4609, 3148, 1061, 1998, 2389, 14194, 2401, 1010, 4372, 7913, 5869, 9026, 2015, 2566, 9103, 14808, 8447, 2015, 102]

original sentence: [CLS] la amenaza de aranceles de trump al vino europeo desata la alarma en el sectorguerra comercialee. uu. amaga con tasas del 200 % ; catalunya y andalucia, entre las grandes perjudicadas [SEP]

Tokenizer Vocab: 30522
Some vocabulary examples:
compressed: 16620
kara: 13173
undertake: 16617
##ducted: 29510
##ded: 5732
##hc: 16257
joker: 19318
##head: 4974
krishna: 10871
neck: 3300
