In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
bert = AutoModel.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import pandas as pd
df = pd.read_csv("/content/train.tsv",sep="\t",encoding='ISO-8859-1',
                 )

In [4]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
df.drop_duplicates(subset=["SentenceId"],inplace=True,keep="first")

In [6]:
df.shape

(8529, 4)

In [7]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
63,64,2,"This quiet , introspective and entertaining in...",4
81,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
116,117,4,A positively thrilling combination of ethnogra...,3
156,157,5,Aggressive self-glorification and a manipulati...,1


In [8]:
df.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [9]:
seq_len = 50
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


In [10]:
def tokenize(text):
  tokens = tokenizer.encode_plus(text, max_length=seq_len,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_attention_mask=True,
                                   return_token_type_ids=False, return_tensors='tf')
  return tokens['input_ids'], tokens['attention_mask']

In [11]:
df = df.iloc[1:1000]

In [12]:
import numpy as np
x_ids = np.zeros((len(df),seq_len))
x_mask = np.zeros((len(df),seq_len))


In [13]:
x_ids[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [14]:
for i, sent in enumerate(df["Phrase"]):
  x_ids[i,:],x_mask[i,:] = tokenize(sent)

In [15]:
arr = df["Sentiment"].values

In [16]:
labels = np.zeros((arr.size,arr.max()+1))
labels[np.arange(arr.size),arr] = 1

In [17]:
labels

array([[0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.]])

In [18]:
with open('twitter-xids.npy', 'wb') as f:
    np.save(f, x_ids)
with open('twitter-xmask.npy', 'wb') as f:
    np.save(f, x_mask)
with open('twitter-labels.npy', 'wb') as f:
    np.save(f, labels)

In [19]:
import tensorflow as tf
batch_size = 16
dataset = tf.data.Dataset.from_tensor_slices((x_ids,x_mask,labels))


In [20]:
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

In [21]:
dataset = dataset.map(map_func)

In [22]:
dataset = dataset.shuffle(1000).batch(batch_size)

In [23]:
len(dataset)

63

In [24]:
len([0 for batch in dataset])
  

63

In [25]:
ds_len = len([0 for batch in dataset])
split = .8
train = dataset.take(round(ds_len*split))
test = dataset.skip(round(ds_len*split))
 

In [26]:
del dataset

In [27]:
from transformers import AutoTokenizer,TFAutoModel
bert = TFAutoModel.from_pretrained("bert-base-cased")
input_ids = tf.keras.layers.Input(shape=(50,),name="input_ids",dtype="int32")
mask = tf.keras.layers.Input(shape=(50,),name="attention_mask",dtype="int32")

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [34]:
embedding = bert(input_ids,attention_mask=mask)[0]
x = tf.keras.layers.LSTM(64)(embedding)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dense(64,activation="relu")(x)
x = tf.keras.layers.Dropout(.1)(x)
y = tf.keras.layers.Dense(5,activation="softmax",name="outputs")(x)

model = tf.keras.Model(inputs =[input_ids,mask],outputs=y)


In [35]:
model.layers

[<keras.engine.input_layer.InputLayer at 0x7efe0ec71d60>,
 <keras.engine.input_layer.InputLayer at 0x7efd962bbd60>,
 <transformers.models.bert.modeling_tf_bert.TFBertModel at 0x7efd962b7070>,
 <keras.layers.rnn.lstm.LSTM at 0x7efd82b24730>,
 <keras.layers.normalization.batch_normalization.BatchNormalization at 0x7efd82b246a0>,
 <keras.layers.core.dense.Dense at 0x7efd832b8430>,
 <keras.layers.regularization.dropout.Dropout at 0x7efd83191a60>,
 <keras.layers.core.dense.Dense at 0x7efd8325a7f0>]

In [36]:
model.layers[2].trainable = False

In [31]:
## Training

## Training

In [37]:
optimizer = tf.keras.optimizers.Adam(0.01)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy("accuracy")
model.compile(optimizer=optimizer,loss=loss,metrics=[acc])

In [39]:
history = model.fit(train,validation_data =test,epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
