<a href="https://colab.research.google.com/github/njsuriya/ML_clf_algorithms/blob/main/DistilBERT_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import torch
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
import numpy as np
import transformers as pt

In [6]:
df = pd.read_csv('/content/amazon_4class_dataset.csv')
df.head(3)

Unnamed: 0,text,labels
0,Besides the missing app functionality (check c...,Amazon Tap - Alexa-Enabled Portable Bluetooth ...
1,Love the speaker and the assistance of Alexa. ...,Amazon Tap - Alexa-Enabled Portable Bluetooth ...
2,Its very nice and not as big as the echo. Didn...,Amazon Tap - Alexa-Enabled Portable Bluetooth ...


In [7]:
lbe = LabelEncoder()
df['labels'] = lbe.fit_transform(df['labels'])

In [8]:
df.head()

Unnamed: 0,text,labels
0,Besides the missing app functionality (check c...,2
1,Love the speaker and the assistance of Alexa. ...,2
2,Its very nice and not as big as the echo. Didn...,2
3,"I bought these for a couple of reasons.First, ...",1
4,While I've purchased items from Amazon for yea...,1


In [9]:
df['labels'].value_counts()

2    100
1    100
0    100
3     70
Name: labels, dtype: int64

In [10]:
len(lbe.classes_)

4

In [11]:
model_class, tokenizer_class, pretrained_weights = (pt.DistilBertModel, pt.DistilBertTokenizer, 'distilbert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def tokenization(text_row):
  return tokenizer.encode(text_row,truncation=True,max_length=59)

encoded_text = df['text'].apply(lambda text_row: tokenization(text_row))

In [13]:
encoded_text.shape

(370,)

In [14]:
max_len = 0
for i in encoded_text.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in encoded_text.values])


In [15]:
padded.shape

(370, 59)

In [16]:
attention_mask = np.where(padded != 0, 1, 0)

In [17]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
  last_hidden_layer = model(input_ids,attention_mask=attention_mask)

In [18]:
last_hidden_layer[0].shape

torch.Size([370, 59, 768])

In [19]:
feature = last_hidden_layer[0][:,0,:].numpy()

In [20]:
target = df['labels']

In [21]:
x_train,x_test,y_train,y_test = train_test_split(feature,target)

In [None]:
x_test[0]

In [22]:
clf = SVC()
clf.fit(x_train,y_train)

SVC()

In [23]:
clf.score(x_test,y_test)

0.9032258064516129

In [None]:
test_text = "The bass sound in the bluetooth was very good. One of the best product of Amazon."
tokenized_text = tokenizer.encode_plus(test_text,truncation=True,max_length=59,return_tensors='pt',return_attention_mask=True)

In [32]:
tokenized_text

{'input_ids': tensor([[  101,  1996,  3321,  2614,  1999,  1996,  2630, 19392,  2001,  2200,
          2204,  1012,  2028,  1997,  1996,  2190,  4031,  1997,  9733,  1012,
           102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [35]:
tokenized_text["input_ids"]

tensor([[  101,  1996,  3321,  2614,  1999,  1996,  2630, 19392,  2001,  2200,
          2204,  1012,  2028,  1997,  1996,  2190,  4031,  1997,  9733,  1012,
           102]])

In [39]:
with torch.no_grad():
  last_hidden_layer_test = model(tokenized_text["input_ids"],attention_mask=tokenized_text["attention_mask"])

In [40]:
last_hidden_layer_test

BaseModelOutput(last_hidden_state=tensor([[[-0.2622, -0.2979,  0.1229,  ..., -0.1293,  0.3295,  0.0475],
         [-0.1889, -0.5189, -0.1187,  ..., -0.1307,  0.3410, -0.4363],
         [ 0.5585, -0.3978,  0.1238,  ..., -0.1495,  0.1005, -0.5304],
         ...,
         [ 0.2522, -0.4642,  0.1757,  ...,  0.0649,  0.3181, -0.2913],
         [-0.2268, -0.8165, -0.2098,  ...,  0.2685,  0.3278, -0.5750],
         [ 0.6925,  0.0093, -0.4214,  ..., -0.1405, -0.3350, -0.4130]]]), hidden_states=None, attentions=None)

In [47]:
last_hidden_layer_test

BaseModelOutput(last_hidden_state=tensor([[[-0.2622, -0.2979,  0.1229,  ..., -0.1293,  0.3295,  0.0475],
         [-0.1889, -0.5189, -0.1187,  ..., -0.1307,  0.3410, -0.4363],
         [ 0.5585, -0.3978,  0.1238,  ..., -0.1495,  0.1005, -0.5304],
         ...,
         [ 0.2522, -0.4642,  0.1757,  ...,  0.0649,  0.3181, -0.2913],
         [-0.2268, -0.8165, -0.2098,  ...,  0.2685,  0.3278, -0.5750],
         [ 0.6925,  0.0093, -0.4214,  ..., -0.1405, -0.3350, -0.4130]]]), hidden_states=None, attentions=None)

In [46]:
clf.predict([last_hidden_layer_test[0][0][0].numpy()])

array([2])

In [48]:
lbe.inverse_transform(clf.predict([last_hidden_layer_test[0][0][0].numpy()]))

array(['Amazon Tap - Alexa-Enabled Portable Bluetooth Speaker'],
      dtype=object)