<a href="https://colab.research.google.com/github/mehmedkadric/bert-fine-tuning/blob/main/bert_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow transformers

In [None]:
# Mount Google Drive at /content/drive so files stored there (the notebook
# later changes into a directory under this mount point) are reachable
# from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
cd /content/drive/My Drive/Upwork/Ivan/Theme Model/data

In [None]:
# --- Load the labelled posts -------------------------------------------------
data = pd.read_csv('training_data.csv')
print(data.shape[0])

# --- Tokenize every post with the pre-trained BERT tokenizer -----------------
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
posts = data['post'].tolist()
encoded_texts = tokenizer.batch_encode_plus(
    posts,
    max_length=256,  # Adjust this based on your data size and computational resources
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='tf',
)

# --- Model inputs -------------------------------------------------------------
input_ids = encoded_texts['input_ids']
attention_mask = encoded_texts['attention_mask']

# --- Multilabel targets: one 0/1 column per theme ----------------------------
labels = data[['is_personal', 'is_healthy_lifestyle', 'is_medical']].values

# --- Sequential 80/20 split (no shuffling): the first 80% of the rows are used
# --- for training, the remaining rows for testing ----------------------------
train_size = int(0.8 * len(data))

train_input_ids = input_ids[:train_size]
test_input_ids = input_ids[train_size:]

train_attention_mask = attention_mask[:train_size]
test_attention_mask = attention_mask[train_size:]

train_labels = labels[:train_size]
test_labels = labels[train_size:]

31982


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Load pre-trained BERT with a 3-unit classification head (one logit per label column)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


def multilabel_accuracy(y_true, y_pred):
    """Fraction of individual labels predicted correctly, per example.

    `y_pred` holds raw logits; a logit above 0 corresponds to a sigmoid
    probability above 0.5, so thresholding the logits at 0 yields the
    predicted 0/1 labels without an explicit sigmoid.
    """
    predicted = tf.cast(y_pred > 0.0, tf.float32)
    matches = tf.equal(tf.cast(y_true, tf.float32), predicted)
    return tf.reduce_mean(tf.cast(matches, tf.float32), axis=-1)


# Compile the model for multilabel training: every label is an independent
# yes/no decision, so the loss is binary cross-entropy on the logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# The string 'accuracy' is resolved by Keras from the target/output shapes; for
# multi-column targets it becomes categorical accuracy (an argmax comparison),
# which is not meaningful for multi-hot labels. Use a per-label accuracy instead.
model.compile(optimizer=optimizer, loss=loss, metrics=[multilabel_accuracy])


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model on the training split; the value returned by `fit`
# is kept in `history`.
train_features = {
    'input_ids': train_input_ids,
    'attention_mask': train_attention_mask,
}
history = model.fit(
    train_features,
    train_labels,
    batch_size=32,
    epochs=20,  # You can increase this number for better results, but it will take more time
)


In [None]:
# Score the fine-tuned model on the held-out test split
test_features = {
    'input_ids': test_input_ids,
    'attention_mask': test_attention_mask,
}
test_loss, test_accuracy = model.evaluate(test_features, test_labels)

print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


In [None]:
# Save the fine-tuned model to the 'final_model_20_epochs' directory
# (relative to the current working directory) for later use.
# NOTE(review): the "Testing part" of this notebook loads the model from
# 'final_model', not from this directory - confirm which one is intended.
model.save_pretrained('final_model_20_epochs')


In [None]:
# Save the BERT tokenizer to the 'tokenizer_20_epochs' directory
# (relative to the current working directory).
# NOTE(review): the "Testing part" of this notebook loads the tokenizer from
# 'tokenizer', not from this directory - confirm which one is intended.
tokenizer.save_pretrained('tokenizer_20_epochs')

In [None]:
# Label names, in the same column order that was used to build `labels`.
# Taking them from this explicit list (instead of a positional slice of
# `data.columns`) keeps the names aligned with the label matrix regardless of
# how the CSV columns are laid out.
LABEL_NAMES = ['is_personal', 'is_healthy_lifestyle', 'is_medical']

# How many test examples to print; printing the whole test split floods the output.
MAX_EXAMPLES_TO_SHOW = 10

# Use the trained model to make predictions on the test data
predictions = model.predict({'input_ids': test_input_ids, 'attention_mask': test_attention_mask})

# Convert the logits to independent per-label probabilities using the sigmoid function
predicted_probabilities = tf.nn.sigmoid(predictions.logits).numpy()


def get_labels_from_probabilities(probabilities, threshold=0.5):
    """Turn one example's label probabilities into a list of 0/1 flags.

    A label is switched on (1) when its probability is at least `threshold`.
    """
    return [1 if p >= threshold else 0 for p in probabilities]


# Get the predicted labels for each test example
predicted_labels = [get_labels_from_probabilities(probs) for probs in predicted_probabilities]

# Check the column names to verify if the 'post' column exists
print(data.columns)

# Display some test examples with their true labels and predicted labels
for i in range(min(MAX_EXAMPLES_TO_SHOW, len(test_labels))):
    true_labels = [name for name, flag in zip(LABEL_NAMES, test_labels[i]) if flag == 1]
    predicted_labels_str = [name for name, flag in zip(LABEL_NAMES, predicted_labels[i]) if flag == 1]
    print(f"Example {i+1}:")
    # The test split starts at row `train_size`, so the matching text is at train_size + i:
    # print("Text:", data['post'].iloc[train_size + i])
    print("True Labels:", true_labels)
    print("Predicted Labels:", predicted_labels_str)
    print()


# Testing part

In [None]:
!pip install tensorflow transformers

In [None]:
# Mount Google Drive at /content/drive so the files stored there (the notebook
# next changes into a directory under this mount point) are reachable
# from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/My Drive/Upwork/Ivan/Theme Model/data

In [None]:
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load the BERT tokenizer from the 'tokenizer' folder
# NOTE(review): the training section saves to 'tokenizer_20_epochs' - confirm the folder.
tokenizer = BertTokenizer.from_pretrained('tokenizer')

# Load the trained BERT model
# NOTE(review): the training section saves to 'final_model_20_epochs' - confirm the folder.
model = TFBertForSequenceClassification.from_pretrained('final_model')

# Load the validation dataset from 'validation_set.csv'
validation_data = pd.read_csv('validation_set.csv')

# Preprocess the text data in the validation set using the loaded tokenizer
# NOTE(review): the training section tokenizes with max_length=256; here posts
# are truncated at 128 tokens - confirm the shorter limit is intentional.
encoded_texts = tokenizer.batch_encode_plus(
    validation_data['post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='tf'
)
input_ids = encoded_texts['input_ids']
attention_mask = encoded_texts['attention_mask']

# Run the model; the output object carries the raw logits (one per label)
val_outputs = model.predict({'input_ids': input_ids, 'attention_mask': attention_mask})

# Convert the logits to predicted probabilities.
# The labels are not mutually exclusive (a post can carry several of them), and
# the training section of this notebook optimises binary cross-entropy on the
# logits and decodes with a sigmoid. Softmax would force the three probabilities
# to sum to 1 and make the labels compete, so a per-label sigmoid is used here.
val_pred_probs = tf.nn.sigmoid(val_outputs.logits).numpy()


In [None]:
# Threshold applied element-wise to the probabilities: a label is predicted
# (1) when its probability is at least this value.
threshold = 0.3
predicted_labels = (val_pred_probs >= threshold).astype(int)

# Maps every "a,b,c" combination of three 0/1 flags to a single class id
# (the combinations are numbered 0-7 in lexicographic order). Built once,
# outside the loop, because it does not depend on the row.
labels_map = {'0,0,0': 0, '0,0,1': 1, '0,1,0': 2, '0,1,1': 3, '1,0,0': 4, '1,0,1': 5, '1,1,0': 6, '1,1,1': 7}

sentences = []

# Tab-separated report lines, starting with the header row
test_data = ["ID\tSCORE\tACTUAL\tPREDICTED\tPOST"]

# Combined-class ids (0-7) for the overall metrics
overall_y_true = []
overall_y_pred = []

# Per-row label data for the per-category metrics:
# true values as "a,b,c" strings, predictions as lists of ints
category_specific_y_true = []
category_specific_y_pred = []

# `position` is the row's position in the frame and is what aligns the row with
# `predicted_labels`; `row_id` is the frame's index label and is only reported.
for position, (row_id, row) in enumerate(validation_data.iterrows()):
    text = row['post']
    sentences.append(text)

    # Normalise the true labels to the canonical "a,b,c" form so that stray
    # whitespace (e.g. "0, 1, 0") cannot cause a failed lookup in labels_map.
    actual = ",".join(str(int(flag)) for flag in row['classes'].split(','))

    predicted_flags = predicted_labels[position].tolist()
    predicted = ",".join(map(str, predicted_flags))

    category_specific_y_true.append(actual)
    category_specific_y_pred.append(predicted_flags)
    overall_y_true.append(labels_map[actual])
    overall_y_pred.append(labels_map[predicted])

    # SCORE is True only when all three labels match exactly
    report_line = f"{row_id}\t{actual == predicted}\t{actual}\t{predicted}\t{text}"
    test_data.append(report_line)
    print(report_line)


# with open("test_data.tsv", "w") as file:
#     for item in test_data:
#         file.write(str(item) + "\n")
# print("File saved.")

In [None]:
def display_confusion_matrix(actual, predicted, category):
    """Plot the 8x8 confusion matrix of the combined label classes.

    Args:
        actual: true combined-class ids (integers 0-7).
        predicted: predicted combined-class ids (integers 0-7).
        category: text used as the plot title.

    Relies on `metrics` (sklearn.metrics) and `plt` (matplotlib.pyplot) being
    imported before the function is called.
    NOTE(review): a later cell defines another function with this same name and
    a different signature, which replaces this one once that cell has run.
    """
    combination_names = ['0,0,0', '0,0,1', '0,1,0',
                         '0,1,1', '1,0,0', '1,0,1',
                         '1,1,0', '1,1,1']
    # Fix the label set explicitly: without it the matrix only covers the
    # classes that actually occur, and its size no longer matches the eight
    # display labels, which makes the plot fail.
    class_ids = list(range(len(combination_names)))
    cm = metrics.confusion_matrix(actual, predicted, labels=class_ids)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=combination_names)
    cm_display.plot()
    plt.title(category)
    plt.show()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn import metrics
from matplotlib import pyplot as plt

# Combined-class ids (one integer per row) collected by the evaluation loop
overall_y_true_flat = overall_y_true
overall_y_pred_flat = overall_y_pred

# Overall scores over the combined classes. `zero_division=0` scores a class
# with no predicted/true samples as 0 without emitting a warning per class.
overall_accuracy = accuracy_score(overall_y_true_flat, overall_y_pred_flat)
overall_f1 = f1_score(overall_y_true_flat, overall_y_pred_flat, average='weighted', zero_division=0)
overall_precision = precision_score(overall_y_true_flat, overall_y_pred_flat, average='weighted', zero_division=0)
overall_recall = recall_score(overall_y_true_flat, overall_y_pred_flat, average='weighted', zero_division=0)
# Calculate overall confusion matrix
overall_cm = confusion_matrix(overall_y_true_flat, overall_y_pred_flat)
# Print overall metrics and confusion matrix
print("Overall Metrics:")
print("Accuracy:", overall_accuracy)
print("F1 Score:", overall_f1)
print("Precision:", overall_precision)
print("Recall:", overall_recall)
print("Overall Confusion Matrix:")
print(overall_cm)
print("Classification Report:")
# The heading above was previously printed without the report itself
print(classification_report(overall_y_true_flat, overall_y_pred_flat, zero_division=0))

display_confusion_matrix(overall_y_true_flat, overall_y_pred_flat, "BERT: Overall Metrics: (is_personal, is_healthy_lifestyle, is_medical)")

In [None]:
def category_metrics(category_index):
    """Print the classification report for one label and return its data.

    Args:
        category_index: position of the label inside each "a,b,c" triple;
            0 is reported as "Personal", 1 as "Healthy Lifestyle" and any
            other value as "Medical".

    Returns:
        A list `[y_true, y_pred, title]` where `y_true` and `y_pred` are lists
        of the strings "0"/"1" and `title` is "BERT: <category name>".

    Reads the module-level lists `category_specific_y_true` ("a,b,c" strings)
    and `category_specific_y_pred` (lists of ints).
    """
    category = {0: "Personal", 1: "Healthy Lifestyle"}.get(category_index, "Medical")
    print("Category: " + category)

    y_true = [x.split(',')[category_index] for x in category_specific_y_true]
    y_pred = [str(x[category_index]) for x in category_specific_y_pred]

    # Passing `labels` keeps the report at two rows even when only one of the
    # classes occurs in the data; without it the two `target_names` would not
    # match the number of classes found and the call would raise.
    print(classification_report(y_true, y_pred, labels=["0", "1"],
                                target_names=["0", "1"], zero_division=0))

    return [y_true, y_pred, "BERT: " + category]


def display_confusion_matrix(cms):
    """Draw one confusion-matrix panel per entry of `cms`, side by side.

    Args:
        cms: sequence of `(y_true, y_pred, title)` triples, where `y_true` and
            `y_pred` hold 0/1 labels (as strings or ints).
    """
    if not cms:
        return

    panel_count = len(cms)
    # squeeze=False keeps `axes` two-dimensional even for a single panel
    fig, axes = plt.subplots(1, panel_count, figsize=(5 * panel_count, 5), squeeze=False)

    for ax, (y_true, y_pred, category) in zip(axes[0], cms):
        # Fix the label order to "0" then "1" so the matrix is always 2x2,
        # even when one of the classes never occurs.
        cm = metrics.confusion_matrix([str(v) for v in y_true],
                                      [str(v) for v in y_pred],
                                      labels=["0", "1"])
        # Row/column 0 is label "0" (False) and row/column 1 is label "1"
        # (True); the tick labels must follow that order.
        plot_confusion_matrix(cm, classes=[False, True], title=category, ax=ax)

    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(cm, classes, title, ax):
    """Render a confusion matrix as an annotated heat map on `ax`.

    Args:
        cm: 2-D array of integer counts (rows: true labels, columns: predicted labels).
        classes: tick labels, in the same order as the rows/columns of `cm`.
        title: title of the panel.
        ax: matplotlib axes to draw on.
    """
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    # Plain ranges are used for the tick positions so the function does not
    # depend on a numpy alias, which this notebook never imports.
    ax.set(xticks=range(cm.shape[1]),
           yticks=range(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Cells darker than half of the maximum get white text for readability
    half_max = cm.max() / 2

    # Loop over data dimensions and create text annotations.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > half_max else "black")

In [None]:
# Report each of the three labels in turn, then plot their confusion
# matrices next to each other.
cms = []
for category_index in (0, 1, 2):
    print("==============================")
    per_category = category_metrics(category_index)
    cms.append(per_category)

display_confusion_matrix(cms)

In [None]:
# Record the TensorFlow version this notebook was executed with
import tensorflow as tf

print(tf.__version__)

2.12.0
