In [1]:
!pip install -q transformers

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer,T5TokenizerFast
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [3]:
df = pd.read_csv('/content/sample_data/Sentiment_analysis_data.csv')


In [4]:
df = df.sample(frac=1)
df.tail(10)

Unnamed: 0,Comment,Tag
160,তা বাংলাদেশের যারা চুরির সাথে জড়িত তাদের শাস্ত...,1
8556,তাহলে বর্তমান অর্থমন্ত্রী কি দুর্বল? আসলে শুধু...,0
517,আহারে! লাখ তিনেক টাকার জন্য কত হামাগুড়ি দিচ্ছি...,0
10303,আমাদের সৌভাগ্য শচীনের পর বিরাটকে পেয়েছি। তা না...,1
4631,নির্বাচনে শান্তিপূর্ন মিছিল মিটিং এর সুযোগ থাক...,0
17149,অসাধারণ একটা নাটক নিশো ভাইয়ের কোনো তুলনা হয়না...,1
9911,না তারা পারেনি। ১৫৬ তে অল আউট। আজিজ প্যাটেল ৩ ...,0
13307,রেলের টাকা যায় সরকারী লোকদের পকেটে,0
8470,ডিজিটাল হুন্ডি বন্ধ না হলে এই ধারা অব্যহত থাকবে,1
8520,২০১৪ সালের ফেব্রুয়ারি মাসে আমি মোটামুটি ফাঁকাই...,2


In [5]:
df['Tag'].value_counts()

1    10269
0     9747
2     2951
Name: Tag, dtype: int64

In [6]:
!pip install sentencepiece
!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2022-08-29 13:50:37--  https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 278779 (272K) [text/plain]
Saving to: ‘botchan.txt.1’


2022-08-29 13:50:37 (57.0 MB/s) - ‘botchan.txt.1’ saved [278779/278779]



In [7]:
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglat5",use_fast=False)

In [8]:
df['Comment'].iloc[0]

'উদ্দিপ্ত হবার জন্যে উপলক্ষ খুঁজে নেওয়া মন্দ না। তবে কাজের থেকে যেন কথা বেশি না হিয়ে যায় সেটাও বিবেচ্য। আবেগের অনিয়ন্ত্রিত প্রকাশের ধাক্কা সবাই নিয়ন্ত্রন করতে পারে না। মুশফিক এর আগে ২-৩ দফায় দেখিয়েছে যে এই কারনে তীরে এসে তার তরী ডোবে। আশা  করা যায় যে সে এর থেকে শিখেছে। বাস্তবে ওরকম পরিস্থিতীতে তার সাফল্য আবার না দেখা পর্যন্ত তাকেও এটা মানতে হবে যে তার কাজ এখন অনেক বাকি।'

In [9]:
token = tokenizer.encode_plus(
    df['Comment'].iloc[0],
    max_length = 256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensor='tf'
)

Keyword arguments {'return_tensor': 'tf'} not recognized.


In [10]:
x_input_ids = np.zeros((len(df),256))
x_attention_mask = np.zeros((len(df),256))


In [11]:
def generate_training_data(df,ids,masks,tokenizer):
  for i, text in tqdm(enumerate(df['Comment'])):
    tokenized_text = tokenizer.encode_plus(
        text,
        max_length=256,
        truncation=True,
        padding = 'max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    ids[i, :] = tokenized_text.input_ids
    masks[i,:]= tokenized_text.attention_mask
  return ids,masks

In [12]:
x_input_ids,x_attention_mask=generate_training_data(df,x_input_ids,x_attention_mask,tokenizer)

0it [00:00, ?it/s]

In [13]:
x_input_ids

array([[ 3508.,  1132.,  5352., ...,     0.,     0.,     0.],
       [   63.,   426., 12572., ...,     0.,     0.,     0.],
       [  481.,  1080.,   167., ...,     0.,     0.,     0.],
       ...,
       [ 8617.,   164.,    54., ...,     0.,     0.,     0.],
       [ 3405.,  1534.,  5733., ...,     0.,     0.,     0.],
       [ 2435.,   308.,  1814., ...,     0.,     0.,     0.]])

In [14]:
labels = np.zeros((len(df),3))

In [15]:
labels[np.arange(len(df)),df['Tag'].values]=1

In [16]:
dataset = tf.data.Dataset.from_tensor_slices((x_input_ids,x_attention_mask,labels))

In [17]:
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [18]:
def SentimentDatasetMapFunction(input_ids,attention_mask,labels):
  return{
      'input_ids':input_ids,
      'attention_mask':attention_mask
  }, labels

In [19]:
dataset = dataset.map(SentimentDatasetMapFunction)

In [20]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [21]:
dataset = dataset.shuffle(10000).batch(16,drop_remainder=True)

In [22]:
p=14
train_size = int((len(df)//16)*p)

In [23]:
train_size

20090

In [24]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [25]:
from transformers import TFBertModel

In [26]:
bert_Model = TFBertModel.from_pretrained("csebuetnlp/banglat5",from_pt=True)

You are using a model of type t5 to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/945M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['decoder.block.8.layer.0.SelfAttention.k.weight', 'encoder.block.4.layer.0.SelfAttention.o.weight', 'decoder.block.5.layer.2.DenseReluDense.wi_1.weight', 'encoder.block.2.layer.0.SelfAttention.v.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'encoder.block.1.layer.1.DenseReluDense.wi_0.weight', 'decoder.block.9.layer.0.layer_norm.weight', 'encoder.block.7.layer.0.layer_norm.weight', 'encoder.block.10.layer.0.SelfAttention.v.weight', 'encoder.embed_tokens.weight', 'decoder.block.7.layer.1.EncDecAttention.o.weight', 'decoder.block.3.layer.0.SelfAttention.k.weight', 'encoder.block.5.layer.1.layer_norm.weight', 'decoder.block.4.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.10.layer.2.DenseReluDense.wi_0.weight', 'encoder.block.6.layer.1.DenseReluDense.wo.weight', 'decoder.block.7.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.o.weight', 'encoder

In [27]:
input_ids = tf.keras.layers.Input(shape=(256,),name='input_ids',dtype='int32')
attention_masks = tf.keras.layers.Input(shape=(256,),name='attention_mask',dtype='int32')

bert_embds = bert_Model.bert(input_ids,attention_mask=attention_masks)[1]
intermediate_layer = tf.keras.layers.Dense(512,activation='relu',name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(3,activation='softmax',name='output_layer')(intermediate_layer)

model = tf.keras.Model(inputs=[input_ids,attention_masks],outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  110715648   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [28]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5,decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [29]:
model.compile(optimizer=optim,loss=loss_func,metrics=[acc])

In [30]:
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1
)



In [31]:
model.save('sentiment_model')



In [32]:
loaded_model = tf.keras.models.load_model('sentiment_model')