# Sentiment Analysis - BERT

This notebook contains the training, evaluation and prediction steps of a sentiment-analysis classifier built on BERT.

### Imports

In [None]:
import pandas as pd
import numpy as np

In [None]:

from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf

### Read and plot target

In [None]:
# Load the dataset from the parent directory and preview its first rows.
df = pd.read_csv(
    '../data.csv',
)
df.head(n=5)

In [None]:
# Count how many rows carry each sentiment label.
df['Sentiment'].value_counts()

In [None]:
# Encode the string sentiment labels as integer class ids
# (negative -> 0, neutral -> 1, positive -> 2); other values are left untouched.
sentiment_to_id = {'positive': 2, 'neutral': 1, 'negative': 0}
df['Sentiment'] = df['Sentiment'].replace(sentiment_to_id)

### Training and experimentation

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, x_test, Y_train, y_test = train_test_split(
    df['Sentence'],
    df['Sentiment'],
    test_size=0.2,
    random_state=13,
)
print(f'Train shapes {X_train.shape}, {Y_train.shape}')
print(f'Test shapes {x_test.shape}, {y_test.shape}')

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Load the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint,
# lower-casing the input text.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
# Maximum number of tokens passed to the tokenizer as `max_length`.
max_len = 128

# Options shared by both splits: padding and truncation enabled,
# max_length capped at max_len, TensorFlow tensors returned.
encode_options = dict(
    padding=True,
    truncation=True,
    max_length=max_len,
    return_tensors='tf',
)

# Tokenize and encode the sentences of each split.
X_train_encoded = tokenizer.batch_encode_plus(X_train.tolist(), **encode_options)
X_test_encoded = tokenizer.batch_encode_plus(x_test.tolist(), **encode_options)



In [None]:
# Inspect one training example end to end: raw text, token ids, decoded
# tokens, attention mask and label.
k = 7
# Use positional (.iloc) lookups for the pandas Series. After train_test_split
# they keep their original, shuffled index labels, so `Y_train[k]` would fetch
# the row *labelled* k (wrong row, or KeyError if that row is in the test split)
# instead of the k-th training row that X_train_encoded[...][k] refers to.
print('Training Comments -->>',X_train.iloc[k])
print('\nInput Ids -->>\n',X_train_encoded['input_ids'][k])
print('\nDecoded Ids -->>\n',tokenizer.decode(X_train_encoded['input_ids'][k]))
print('\nAttention Mask -->>\n',X_train_encoded['attention_mask'][k])
print('\nLabels -->>',Y_train.iloc[k])


In [None]:
# Initialize the pretrained BERT sequence classifier with three output labels
# (one per sentiment class id).
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)


In [None]:
# Configure training: accuracy as the reported metric, sparse categorical
# cross-entropy computed on raw logits (labels are integer class ids), and
# Adam with a learning rate of 2e-5.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=2e-5)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)


In [None]:
# Step 5: Train the model
history = model.fit(
	[X_train_encoded['input_ids'], X_train_encoded['token_type_ids'], X_train_encoded['attention_mask']],
	Y_train,
	batch_size=64,
	epochs=3
)


In [None]:
#Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(
	[X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']],
	y_test
)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')


### Save model

In [None]:
model.save_pretrained('artifacts/model_exp_4_bert')

In [None]:
model_loaded = TFBertForSequenceClassification.from_pretrained('artifacts/model_exp_4_bert')

In [None]:
y_pred = model_loaded.predict([X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']])