In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle
import re
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

In [None]:
# Input CSV; the later cells read its 'text' and 'subreddit' columns.
# NOTE(review): absolute Colab/Drive path — presumably requires Drive to be mounted first.
file_path = '/content/drive/MyDrive/Colab Notebooks/preprocessed_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Encode the subreddit names as integer class ids for the classifier.
#
# The original names are kept in a separate 'subreddit_name' column so that
# this cell is idempotent: the encoder is always fitted on the names. Fitting
# it again on the already-encoded integers (which happened when the cell was
# re-run) would turn `label_encoder.classes_` into integers, and
# `inverse_transform` would then return ids instead of subreddit names.
if 'subreddit_name' not in df.columns:
    df['subreddit_name'] = df['subreddit']

label_encoder = LabelEncoder()
df['subreddit'] = label_encoder.fit_transform(df['subreddit_name'])

# Number of distinct subreddits; used as the size of the classification head.
num_classes = len(label_encoder.classes_)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# `drop(..., errors='ignore')` makes the cell safe to re-run: `df.pop` raised
# KeyError on the second execution and echoed the whole popped Series.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

0              0
1              1
2              2
3              3
4              4
           ...  
152884    153319
152885    153320
152886    153321
152887    153322
152888    153323
Name: Unnamed: 0, Length: 152889, dtype: int64

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['subreddit'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def convert_data_to_bert_input(texts, labels, max_length=128, chunk_size=1024):
    """Tokenize `texts` and package them with `labels` as TensorFlow tensors.

    Texts are tokenized in chunks with one batched tokenizer call per chunk.
    The previous version called `encode_plus` once per text and concatenated
    one tiny tensor per row, which is needlessly slow on large datasets.
    The produced tensors are the same: every row is padded or truncated to
    `max_length` tokens, with special tokens added.

    Args:
        texts: Iterable of strings (e.g. a pandas Series) to tokenize.
        labels: Array-like of labels, one per text; converted as-is with
            `tf.convert_to_tensor`.
        max_length: Fixed sequence length every text is padded/truncated to.
        chunk_size: Number of texts tokenized per tokenizer call; bounds the
            size of the intermediate Python lists.

    Returns:
        Tuple `(input_ids, attention_masks, labels)`; the first two are the
        row-wise concatenation of the per-chunk tokenizer outputs.

    Raises:
        ValueError: If `chunk_size` is not a positive integer.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # The tokenizer needs a plain list of strings, not a pandas Series.
    texts = list(texts)

    id_chunks = []
    mask_chunks = []
    for start in range(0, len(texts), chunk_size):
        encoded = tokenizer(
            texts[start:start + chunk_size],
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf',
        )
        id_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])

    input_ids = tf.concat(id_chunks, axis=0)
    attention_masks = tf.concat(mask_chunks, axis=0)
    labels = tf.convert_to_tensor(labels)
    return input_ids, attention_masks, labels

In [None]:
# Tokenize both splits once up front so training and evaluation reuse the tensors.
(
    train_input_ids,
    train_attention_masks,
    train_labels,
) = convert_data_to_bert_input(X_train, y_train)

(
    test_input_ids,
    test_attention_masks,
    test_labels,
) = convert_data_to_bert_input(X_test, y_test)

In [None]:
# Pretrained BERT encoder with a classification head sized to the number of
# subreddit classes.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The model outputs raw logits and the labels are integer class ids, hence
# sparse categorical cross-entropy with `from_logits=True`.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)

In [None]:
# Fine-tune for two epochs, with 20% of the training tensors held out for
# validation.
history = model.fit(
    x=[train_input_ids, train_attention_masks],
    y=train_labels,
    batch_size=32,
    epochs=2,
    validation_split=0.2,
)

Epoch 1/2


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/2


In [None]:
# Score the held-out test split: the predicted class is the arg-max logit.
predictions = model.predict([test_input_ids, test_attention_masks])
pred_labels = np.argmax(predictions.logits, axis=1)

for heading, result in (
    ("Accuracy:", accuracy_score(y_test, pred_labels)),
    ("Classification Report:\n", classification_report(y_test, pred_labels)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, pred_labels)),
):
    print(heading, result)


Accuracy: 0.733501210020276
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.81      0.81      5220
           1       0.96      0.98      0.97      5216
           2       0.55      0.49      0.52      5287
           3       0.79      0.78      0.79      4645
           4       0.73      0.52      0.60      5163
           5       0.59      0.83      0.69      5047

    accuracy                           0.73     30578
   macro avg       0.74      0.74      0.73     30578
weighted avg       0.74      0.73      0.73     30578

Confusion Matrix:
 [[4216   24  279  142  399  160]
 [   6 5117   14   22    2   55]
 [ 185   42 2606  434  407 1613]
 [  73   94  396 3624   72  386]
 [ 612   29  953  198 2674  697]
 [  44   47  481  153  130 4192]]


In [None]:
# Persist the fine-tuned model and its tokenizer into separate directories.
# The tokenizer call stays last so the cell still displays the written file paths.
model.save_pretrained(save_directory='bert_reddit_model')
tokenizer.save_pretrained(save_directory='bert_reddit_tokenizer')

('bert_reddit_tokenizer/tokenizer_config.json',
 'bert_reddit_tokenizer/special_tokens_map.json',
 'bert_reddit_tokenizer/vocab.txt',
 'bert_reddit_tokenizer/added_tokens.json')

In [None]:
import string
def preprocess_text(text):
    """Normalize raw text before it is tokenized.

    Applies, in order: removal of `<...>` tags, removal of `http...` URLs,
    removal of ASCII punctuation, removal of newline characters, removal of
    digit runs, collapsing of whitespace runs into a single space, and
    lowercasing.

    Newlines are deleted rather than replaced by a space, so words separated
    only by a newline end up joined. Leading/trailing whitespace is collapsed
    to a single space but not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # (pattern, replacement) pairs; the order matters and must not change.
    substitutions = (
        (r'<.*?>', ''),                                     # tags
        (r'http\S+', ''),                                   # URLs
        ('[%s]' % re.escape(string.punctuation), ''),       # punctuation
        ('\n', ''),                                         # newline characters
        ('[0-9]+', ''),                                     # numbers
        (r'\s+', ' '),                                      # extra spaces
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()


In [None]:
def predict_text_label(text):
    """Predict the subreddit label for a single raw text.

    The text is cleaned with `preprocess_text`, tokenized to a fixed length of
    128 tokens, run through the fine-tuned model, and the arg-max class id is
    mapped back to its original label with `label_encoder`.

    Args:
        text: Raw input string.

    Returns:
        The decoded label for the highest-scoring class.
    """
    cleaned = preprocess_text(text)

    # Same tokenization settings as used for the training data.
    encoding = tokenizer.encode_plus(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    model_inputs = [encoding['input_ids'], encoding['attention_mask']]
    output = model.predict(model_inputs)

    # Single-row batch: take the best class of the first (only) row.
    class_id = np.argmax(output.logits, axis=1)[0]
    return label_encoder.inverse_transform([class_id])[0]

In [None]:
# Smoke test: classify one hand-written example with the fine-tuned model.
sample_text = "I'm feeling like to go somewhere far away and never come back. It's feels everyone is prejudices and biased and the world is not a fair place "
sample_prediction = predict_text_label(sample_text)
print("Predicted label:", sample_prediction)

Predicted label: depression


In [None]:

# `pickle` is already imported in the notebook's first cell, so it is not
# re-imported here.
#
# The fitted label encoder is the only way to map predicted class ids back to
# subreddit names, so it is persisted too. It is written first so that it is
# saved even if pickling the model below fails.
with open('bert_reddit_label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# NOTE(review): pickling a TensorFlow/Transformers model may not be portable
# across library versions. Both pickles are kept because other code may load
# these files. Only unpickle files from trusted sources: `pickle.load` can
# execute arbitrary code.
with open('bert_reddit_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('bert_reddit_tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)