#### Applications of pretrained models in the <span style="color:blue"> Hugging Face Transformers</span> library for sentiment classification. 

In [1]:
from datasets import load_dataset
from transformers import pipeline, BertForSequenceClassification, BertTokenizer
import torch
import pandas as pd

**Loading the [yelp_review_full](https://huggingface.co/datasets/yelp_review_full) dataset**

In [2]:
yelp_df = load_dataset("yelp_review_full")

Found cached dataset yelp_review_full (C:/Users/Dell/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

**Selecting and inspecting data**

In [3]:
print(yelp_df)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [4]:
print(type(yelp_df))

<class 'datasets.dataset_dict.DatasetDict'>


In [5]:
print(yelp_df["train"])

Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})


In [6]:
print(type(yelp_df["train"]))

<class 'datasets.arrow_dataset.Dataset'>


In [7]:
print(yelp_df["train"][0])

{'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}


In [8]:
print(yelp_df.num_columns, yelp_df.num_rows)

{'train': 2, 'test': 2} {'train': 650000, 'test': 50000}


In [9]:
print(yelp_df.shape)

{'train': (650000, 2), 'test': (50000, 2)}


In [10]:
print(yelp_df.unique("label"))

{'train': [4, 1, 3, 0, 2], 'test': [0, 2, 1, 3, 4]}


**Transforming to other formats**

In [11]:
yelp_df.set_format(type="pandas")

In [12]:
print(yelp_df["train"])

Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})


In [13]:
print(type(yelp_df["test"][:]))

<class 'pandas.core.frame.DataFrame'>


In [14]:
classifier = pipeline(task="text-classification", max_length=512)
sentiments = classifier(yelp_df["test"]["text"][0])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
pd.DataFrame(sentiments)

Unnamed: 0,label,score
0,NEGATIVE,0.998094


In [16]:
train_df = yelp_df["test"][:10]
train_df

Unnamed: 0,label,text
0,0,I got 'new' tires from them and within two wee...
1,0,Don't waste your time. We had two different p...
2,0,All I can say is the worst! We were the only 2...
3,0,I have been to this restaurant twice and was d...
4,0,Food was NOT GOOD at all! My husband & I ate h...
5,2,This is a tiny Starbucks and it locations like...
6,1,Typical Starbucks coffee chain. 2 things I don...
7,3,So.Much.Fun! \n\nI WISH I could play a song at...
8,3,"My friend is a piano teacher, so I took it as ..."
9,2,Stopped by on a Mon evening after trying to di...


In [17]:
print(train_df["text"])

0    I got 'new' tires from them and within two wee...
1    Don't waste your time.  We had two different p...
2    All I can say is the worst! We were the only 2...
3    I have been to this restaurant twice and was d...
4    Food was NOT GOOD at all! My husband & I ate h...
5    This is a tiny Starbucks and it locations like...
6    Typical Starbucks coffee chain. 2 things I don...
7    So.Much.Fun! \n\nI WISH I could play a song at...
8    My friend is a piano teacher, so I took it as ...
9    Stopped by on a Mon evening after trying to di...
Name: text, dtype: object


In [18]:
def sentiment_classification(text):
    outputs = classifier(text)
    label = outputs[0]["label"]
    return label


train_df["sentiment"] = train_df["text"].map(sentiment_classification)

In [19]:
train_df

Unnamed: 0,label,text,sentiment
0,0,I got 'new' tires from them and within two wee...,NEGATIVE
1,0,Don't waste your time. We had two different p...,NEGATIVE
2,0,All I can say is the worst! We were the only 2...,NEGATIVE
3,0,I have been to this restaurant twice and was d...,NEGATIVE
4,0,Food was NOT GOOD at all! My husband & I ate h...,NEGATIVE
5,2,This is a tiny Starbucks and it locations like...,POSITIVE
6,1,Typical Starbucks coffee chain. 2 things I don...,NEGATIVE
7,3,So.Much.Fun! \n\nI WISH I could play a song at...,POSITIVE
8,3,"My friend is a piano teacher, so I took it as ...",POSITIVE
9,2,Stopped by on a Mon evening after trying to di...,NEGATIVE


**Tokenization**

In [20]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [21]:
print(train_df["text"][5])
print(tokenizer.encode(train_df["text"][5]))
example_tokens = tokenizer.encode(
    train_df["text"][5], truncation=True, padding=True, add_special_tokens=True
)
print(tokenizer.decode(example_tokens))

This is a tiny Starbucks and it locations like this (although cute) makes you wonder if your really meant to hang out or just grab your coffee and leave. Leaving is always a good idea at this location anyway since you have a nice fountain in the back with benches and it is a central part of the Waterfront Shopping. \n\nStarbuck isn't my favorite coffee chain by any means. Is it just me or do all Starbuck coffees taste a little burnt and bitter? No matter how trendy, cool and upscale their establishments are I can't get around the yicky tasting bitterness of Staryucks regular coffees. Talk about over roasting a bean...Maybe something has changed with their regular coffee but I have not drank it in about a year. I am not one for soy caramel latte foofy stuff. Still I'll give the establishment tres estrellas for the fact that their espresso is acceptable and doesn't taste half as bad as the regular coffee bean.
[101, 2023, 2003, 1037, 4714, 29500, 1998, 2009, 5269, 2066, 2023, 1006, 2348,

In [22]:
input_ids = torch.tensor(example_tokens).unsqueeze(0)
print(input_ids)

tensor([[  101,  2023,  2003,  1037,  4714, 29500,  1998,  2009,  5269,  2066,
          2023,  1006,  2348, 10140,  1007,  3084,  2017,  4687,  2065,  2115,
          2428,  3214,  2000,  6865,  2041,  2030,  2074,  6723,  2115,  4157,
          1998,  2681,  1012,  2975,  2003,  2467,  1037,  2204,  2801,  2012,
          2023,  3295,  4312,  2144,  2017,  2031,  1037,  3835,  9545,  1999,
          1996,  2067,  2007, 19571,  1998,  2009,  2003,  1037,  2430,  2112,
          1997,  1996, 16317,  6023,  1012,  1032,  1050,  1032, 24978,  7559,
         24204,  3475,  1005,  1056,  2026,  5440,  4157,  4677,  2011,  2151,
          2965,  1012,  2003,  2009,  2074,  2033,  2030,  2079,  2035,  2732,
         24204,  4157,  2015,  5510,  1037,  2210, 11060,  1998,  8618,  1029,
          2053,  3043,  2129,  9874,  2100,  1010,  4658,  1998, 28276,  2037,
         17228,  2024,  1045,  2064,  1005,  1056,  2131,  2105,  1996, 12316,
         17413, 18767, 22364,  1997,  2732, 10513, 1

In [23]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [24]:
train_tokens = [
    tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
    for text in train_df["text"]
]
ids = [torch.tensor(tokens).unsqueeze(0) for tokens in train_tokens]
print(ids)

[tensor([[  101,  1045,  2288,  1005,  2047,  1005, 13310,  2013,  2068,  1998,
          2306,  2048,  3134,  2288,  1037,  4257,  1012,  1045,  2165,  2026,
          2482,  2000,  1037,  2334, 15893,  2000,  2156,  2065,  1045,  2071,
          2131,  1996,  4920,  8983,  2098,  1010,  2021,  2027,  2056,  1996,
          3114,  1045,  2018,  1037,  4257,  2001,  2138,  1996,  3025,  8983,
          2018, 10676,  1011,  3524,  1010,  2054,  1029,  1045,  2074,  2288,
          1996, 12824,  1998,  2196,  2734,  2000,  2031,  2009,  8983,  2098,
          1029,  2023,  2001,  4011,  2000,  2022,  1037,  2047, 12824,  1012,
          1032,  9152,  2165,  1996, 12824,  2058,  2000, 13259,  1005,  1055,
          1998,  2027,  2409,  2033,  2008,  2619, 26136, 14890,  2094,  2026,
         12824,  1010,  2059,  2699,  2000,  8983,  2009,  1012,  2061,  2045,
          2024, 24501,  4765,  3993, 12824, 18296,  2545,  1029,  1045,  2424,
          2008,  2200,  9832,  1012,  2044,  9177, 

In [25]:
outputs = [model(input_ids=id) for id in ids]
predicted_labels = [torch.argmax(output.logits, dim=1) for output in outputs]
token_labels = [predicted_label[0].tolist() for predicted_label in predicted_labels]
train_df["new_label"] = pd.Series(token_labels)
train_df

Unnamed: 0,label,text,sentiment,new_label
0,0,I got 'new' tires from them and within two wee...,NEGATIVE,3
1,0,Don't waste your time. We had two different p...,NEGATIVE,3
2,0,All I can say is the worst! We were the only 2...,NEGATIVE,3
3,0,I have been to this restaurant twice and was d...,NEGATIVE,2
4,0,Food was NOT GOOD at all! My husband & I ate h...,NEGATIVE,3
5,2,This is a tiny Starbucks and it locations like...,POSITIVE,2
6,1,Typical Starbucks coffee chain. 2 things I don...,NEGATIVE,1
7,3,So.Much.Fun! \n\nI WISH I could play a song at...,POSITIVE,3
8,3,"My friend is a piano teacher, so I took it as ...",POSITIVE,3
9,2,Stopped by on a Mon evening after trying to di...,NEGATIVE,2
