#### Applications of pretrained models in the <span style="color:blue">Hugging Face Transformers</span> library for sentiment classification

In [None]:
from datasets import load_dataset
from transformers import pipeline, BertForSequenceClassification, BertTokenizer
import torch
import pandas as pd

**Loading the [yelp_review_full](https://huggingface.co/datasets/yelp_review_full) dataset**

In [None]:
# Load the Yelp review dataset by its hub name.
# NOTE: despite the `_df` suffix, the cells below index this object by split
# name ("train", "test") before treating anything as tabular data.
yelp_df = load_dataset("yelp_review_full")

**Selecting and inspecting data**

In [None]:
# Print the summary representation of the loaded object.
print(yelp_df)

In [None]:
# Show which Python type load_dataset returned.
print(type(yelp_df))

In [None]:
# Summary of the "train" split only.
print(yelp_df["train"])

In [None]:
# Type of a single split, for comparison with the type of the whole object.
print(type(yelp_df["train"]))

In [None]:
# First record of the "train" split.
print(yelp_df["train"][0])

In [None]:
# Column and row counts as reported by the loaded object.
print(yelp_df.num_columns, yelp_df.num_rows)

In [None]:
# Shape as reported by the loaded object.
print(yelp_df.shape)

In [None]:
# Distinct values found in the "label" column.
print(yelp_df.unique("label"))

**Transforming to other formats**

In [None]:
# Switch the output format to pandas. Later cells depend on this: slices of a
# split are used as DataFrames (column assignment, Series.map).
yelp_df.set_format(type="pandas")

In [None]:
# Print the "train" split again now that the output format has been changed.
print(yelp_df["train"])

In [None]:
# Type obtained when taking a full slice of the "test" split under the pandas format.
print(type(yelp_df["test"][:]))

In [None]:
# Build a text-classification pipeline (no model is named, so the library's
# default checkpoint for this task is used).
# max_length alone does not shorten inputs: truncation=True is what makes the
# tokenizer cut reviews down to 512 tokens, so over-long reviews no longer
# reach the model at full length.
classifier = pipeline(
    task="text-classification",
    truncation=True,
    max_length=512,
)

# Classify the first review of the "test" split as a smoke test.
sentiments = classifier(yelp_df["test"]["text"][0])

In [None]:
# Tabulate the classifier output for the single review classified above.
pd.DataFrame(sentiments)

In [None]:
# Take the first 10 rows as a small working sample for the cells below.
# NOTE: the variable is called `train_df`, but the rows come from the "test" split.
train_df = yelp_df["test"][:10]
train_df

In [None]:
# Review texts of the working sample.
print(train_df["text"])

In [None]:
def sentiment_classification(text):
    """Return the label the sentiment pipeline assigns to ``text``.

    Runs the module-level ``classifier`` on ``text`` and keeps only the
    ``"label"`` field of the first result.
    """
    return classifier(text)[0]["label"]


# Label every review in the sample, one pipeline call per row.
train_df["sentiment"] = train_df["text"].map(sentiment_classification)

In [None]:
# Display the sample, now including the "sentiment" column.
train_df

**Tokenization**

In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_name = "bert-base-uncased"
# Load the tokenizer that matches the checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Work on a single review (index label 5) to show what the tokenizer produces.
sample_text = train_df["text"][5]
print(sample_text)

# Token ids produced with the tokenizer's default settings.
print(tokenizer.encode(sample_text))

# Encode again with explicit options, then decode to inspect the result.
example_tokens = tokenizer.encode(
    sample_text,
    add_special_tokens=True,
    truncation=True,
    padding=True,
)
print(tokenizer.decode(example_tokens))

In [None]:
# Wrap the id list in an outer list so the tensor gets a leading batch
# dimension of size 1.
input_ids = torch.tensor([example_tokens])
print(input_ids)

In [None]:
# Load BERT with a 5-way sequence-classification head.
# NOTE(review): the checkpoint is the plain base model, so the classification
# head is presumably freshly initialised and its predictions are not meaningful
# until the model is fine-tuned — confirm against the load-time warnings.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)
# Same tokenizer as loaded in the Tokenization section; reloaded here so this
# cell can be run on its own.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Encode every review in the sample, truncated to at most 512 token ids.
train_tokens = [
    tokenizer.encode(review, truncation=True, max_length=512, add_special_tokens=True)
    for review in train_df["text"]
]
# One tensor per review, each with a leading batch dimension of size 1.
ids = [torch.tensor([encoded]) for encoded in train_tokens]
print(ids)

In [None]:
# Run the model over each encoded review and store the predicted class ids.
# This is inference only, so gradient tracking is disabled: no autograd graph
# is recorded (or kept alive through `outputs`) for the forward passes.
with torch.no_grad():
    outputs = [model(input_ids=review_ids) for review_ids in ids]

# Each input holds a single review, so argmax over dim=1 yields a one-element
# tensor and [0] picks that single prediction as a plain Python int.
predicted_labels = [torch.argmax(output.logits, dim=1) for output in outputs]
token_labels = [predicted_label[0].tolist() for predicted_label in predicted_labels]

# Assign the list directly so values are placed by row position rather than
# aligned on index labels.
train_df["new_label"] = token_labels
train_df