# Sentiment analysis with transformers

1. Activate GPU and Install Dependencies

In [1]:
import torch
torch.cuda.is_available()

False

Load imdb dataset for training

In [2]:
from datasets import load_dataset
imdb = load_dataset("imdb")

Found cached dataset imdb (/Users/kartiksharma/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

Check one sample

In [3]:
train = imdb['train']
test = imdb['test']
print(train[0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

Insert some data in the database.

In [4]:
import pymongo
db = pymongo.MongoClient("mongodb://testmongodbuser:testmongodbpassword@localhost:27018/admin",) 
db.drop_database('test_db')
db = db['test_db']


Make it a pinnacledb!

In [5]:
import pinnacledb
from pinnacledb.misc.pinnacle import pinnacle

db = pinnacle(db)

INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.


In [6]:
db

<pinnacledb.datalayer.base.database.BaseDatabase at 0x7f8b83d1da60>

In [7]:
from pinnacledb.core.documents import Document as D
from pinnacledb.datalayer.mongodb.query import Collection

In [8]:
train_df = train.to_pandas()
train_df = train_df.head(20)

In [9]:
data = [D({"text":sample[0], "label":sample[1]}) for sample in train_df.to_numpy()]

In [10]:
len(data)

20

In [11]:
db.execute(Collection('documents').insert_many(data))

INFO:root:found 0 uris


(<pymongo.results.InsertManyResult at 0x7f8b83d34580>,
 TaskWorkflow(database=<pinnacledb.datalayer.base.database.BaseDatabase object at 0x7f8b83d1da60>, G=<networkx.classes.digraph.DiGraph object at 0x7f8b83d34b80>))

Create a tokenizer

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Create a preproces function with the tokenizer.

In [13]:
def transform(sample):
    tokenizer_fn = lambda x: tokenizer(x, truncation=True)
    tokenized_data = tokenizer_fn(sample['text'])
    sample.update(**tokenized_data)
    return sample

In [14]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Create Model (DistilBert)

It is a smaller version of BERT

In [15]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.we

Define evaluation metrics

In [16]:
Define the evaluation metrics
import numpy as np
# from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
#     load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = lambda x, y: sum([xx == yy for xx, yy in zip(x, y)]) / len(x)
    
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [17]:
from pinnacledb.models.transformers.wrapper import TransformersTrainerConfiguration, Pipeline

Create training arguments

In [18]:
from transformers import TrainingArguments, Trainer
repo_name = "pinnacledb-sentiment-analysis"
training_args = TransformersTrainerConfiguration(
    identifier=repo_name,
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    use_mps_device=True
)

In [19]:
device = torch.device('mps')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Create Pipeline with the model and tokenizer.

In [20]:
trainer = Pipeline(
    identifier='my-sentiment-analysis',
    tokenizer=transform,
    object=model,
    train_X='text',
    train_y='label',
    device="mps" # Note: Only valid for mac m1 devices!
)

Training the model.

In [21]:
from pinnacledb.core.dataset import Dataset
trainer.fit(
    X='text',
    y='label',
    db=db,
    select=Collection('documents').find(),
    configuration=training_args,
    validation_sets=[
        Dataset(
            identifier='my-eval',
            select=Collection(name='documents').find({'_fold': 'valid'}),
            db=db,
        )
    ],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    data_prefetch = False,
    prefetch_size=5
)                                                                            

INFO:root:Created dataset/my-eval/0
INFO:root:Created model/my-sentiment-analysis/0
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


  load_accuracy = load_metric("accuracy")
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'eval_loss': 0.4243570864200592,
 'eval_accuracy': 1.0,
 'eval_f1': 0.0,
 'eval_runtime': 2.8956,
 'eval_samples_per_second': 0.345,
 'eval_steps_per_second': 0.345,
 'epoch': 1.0}

Do a single prediction on the model.

In [None]:
output = trainer.predict("This movie sucks!")
print(output)