In [12]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import TFBertForSequenceClassification, BertTokenizer

# Load preprocessed data
df = pd.read_csv('preprocessed_tweets_dataset.csv')

# Mapping categorical labels to integers
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df['label'] = df['Sentiment'].map(label_map)

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['preprocessed_text'], df['label'], test_size=0.2, random_state=42)

# Convert train_texts and test_texts to lists of strings
train_texts = train_texts.astype(str).tolist()
test_texts = test_texts.astype(str).tolist()

# Initialize BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Ensure train_labels and test_labels are integers
train_labels = train_labels.astype(int).values
test_labels = test_labels.astype(int).values

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(train_texts)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)

# Load pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained(model_name)

# Define model compile and training
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# Train the model
history = model.fit(train_dataset, epochs=3, validation_data=test_dataset)

# Evaluate model performance
predictions = model.predict(test_dataset)
predicted_labels = tf.argmax(predictions.logits, axis=1)
print(classification_report(test_labels, predicted_labels.numpy(), target_names=['Negative', 'Neutral', 'Positive']))

# Save model training and evaluation information in documentation file (README.md)
model_training_info = """
### Model Training and Evaluation

- **Model:** Fine-tuned BERT for Sequence Classification
- **Metrics:** Accuracy, Precision, Recall, F1 Score
"""
with open('README.md', 'a') as readme_file:
    readme_file.write(model_training_info)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


2024-06-29 04:40:01.486975: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at sparse_xent_op.cc:103 : INVALID_ARGUMENT: Received a label value of 2 which is outside the valid range of [0, 2).  Label values: 1 2 1 1 0 2 2 2 2 2 2 1 0 2 2 2
2024-06-29 04:40:01.487116: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: Received a label value of 2 which is outside the valid range of [0, 2).  Label values: 1 2 1 1 0 2 2 2 2 2 2 1 0 2 2 2
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]]


InvalidArgumentError: Graph execution error:

Detected at node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/opt/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 701, in start

  File "/opt/anaconda3/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 195, in start

  File "/opt/anaconda3/lib/python3.11/asyncio/base_events.py", line 607, in run_forever

  File "/opt/anaconda3/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once

  File "/opt/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 523, in process_one

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 429, in do_execute

  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/opt/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell

  File "/opt/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell

  File "/opt/anaconda3/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/opt/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async

  File "/opt/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes

  File "/opt/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/var/folders/zr/9nk9f84s70qf1fpvdqxnh_j80000gn/T/ipykernel_21747/3618678403.py", line 54, in <module>

  File "/opt/anaconda3/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 1229, in fit

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1804, in fit

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1381, in step_function

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1370, in run_step

  File "/opt/anaconda3/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 1705, in train_step

  File "/opt/anaconda3/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 1706, in train_step

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/engine/compile_utils.py", line 269, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/engine/compile_utils.py", line 269, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/engine/compile_utils.py", line 277, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/losses.py", line 143, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/losses.py", line 270, in call

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/losses.py", line 2454, in sparse_categorical_crossentropy

  File "/opt/anaconda3/lib/python3.11/site-packages/tf_keras/src/backend.py", line 5777, in sparse_categorical_crossentropy

Received a label value of 2 which is outside the valid range of [0, 2).  Label values: 1 2 1 1 0 2 2 2 2 2 2 1 0 2 2 2
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_78226]