## Text Classification with BERT
#### Below code checks the availability of the GPU

In [16]:
import tensorflow as tf 

print("GPU Available:", tf.config.list_physical_devices('GPU'))

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]



#### This cell deal with loading of data set and taking a peek into the data 

In [17]:
import numpy as np 
import pandas as pd 

def load_data_set() : 
    return pd.read_csv("/Users/pmanthan/Desktop/ML Practice /train.csv")
    
data_set = load_data_set()
data_set_valuecount = data_set['Category'].value_counts()
data_set_valuecount.head()

data_set.head()
data_set["Category"].value_counts()/len(data_set)

Category
Education                    0.030622
Mechanical Engineer          0.028680
Electrical Engineering       0.028680
Consultant                   0.027485
Civil Engineer               0.027186
Sales                        0.027186
Management                   0.026962
Human Resources              0.026888
Digital Media                0.026738
Accountant                   0.026141
Java Developer               0.025991
Building and Construction    0.025767
Operations Manager           0.025767
Architecture                 0.025693
Testing                      0.025693
Business Analyst             0.025394
Aviation                     0.025394
Finance                      0.025319
SQL Developer                0.025245
Public Relations             0.025170
Health and Fitness           0.024796
Arts                         0.024796
Network Security Engineer    0.024647
DotNet Developer             0.024572
Apparel                      0.023900
Banking                      0.023452
Aut


#### This snippet divides the data into training and test data based on stratifies splitting so that the composure of the original data is maintained

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_idx,test_idx in split.split(data_set,data_set["Category"]) : 
    strat_train_data = data_set.loc[train_idx]
    strat_test_data = data_set.loc[test_idx]
    
strat_train_data["Category"].value_counts() / len(data_set)
strat_train_data.head()

Unnamed: 0,Category,Text
12964,SQL Developer,jessica claire 100 montgomery st 10th floor 55...
2256,Public Relations,robert smith public relations specialist perso...
6471,Architecture,jessica claire 100 montgomery st 10th floor 55...
10408,Human Resources,jessica claire montgomery street san francisco...
4632,Food and Beverages,director food beverage robert smith phone 123 ...


#### Separates the target variables from the data 

In [19]:
strat_train_category = strat_train_data["Category"].copy()
strat_train_text = strat_train_data.drop(["Category"],axis=1)

strat_train_text["Text"] = strat_train_text["Text"].str.lower()
strat_train_text["Text"]

12964    jessica claire 100 montgomery st 10th floor 55...
2256     robert smith public relations specialist perso...
6471     jessica claire 100 montgomery st 10th floor 55...
10408    jessica claire montgomery street san francisco...
4632     director food beverage robert smith phone 123 ...
                               ...                        
12495    jessica claire resumesampleexamplecom 555 4321...
374      alejandra arts alejandraartsgmailcom 563123456...
3971     robert smith creative designer phone 123 456 7...
6366     jessica claire resumesampleexamplecom 555 4321...
11463    jessica claire montgomery street san francisco...
Name: Text, Length: 10711, dtype: object

#### In this part of code i used regex for the text cleaning and spacy for the tokenization and stop word removal

In [20]:
import re 
punctuation_pattern = r'[^\w\s$]'
strat_train_text["Text"] = strat_train_text["Text"].str.replace(punctuation_pattern,'',regex=True)

import spacy

nlp = spacy.load('en_core_web_sm')

def tokenize_data(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_stop]

strat_train_text["Tokens"] = strat_train_text["Text"].apply(tokenize_data)

strat_train_text.head()


Unnamed: 0,Text,Tokens
12964,jessica claire 100 montgomery st 10th floor 55...,"[jessica, claire, 100, montgomery, st, 10th, f..."
2256,robert smith public relations specialist perso...,"[robert, smith, public, relations, specialist,..."
6471,jessica claire 100 montgomery st 10th floor 55...,"[jessica, claire, 100, montgomery, st, 10th, f..."
10408,jessica claire montgomery street san francisco...,"[jessica, claire, montgomery, street, san, fra..."
4632,director food beverage robert smith phone 123 ...,"[director, food, beverage, robert, smith, phon..."


#### Lemmatixation of data into its base form using the spacy pipeline

In [21]:
def lemmatize_data (text):
    doc = nlp(text)
    return [token.lemma_ for token in doc ]

strat_train_text["Tokens"] = strat_train_text["Text"].apply(lemmatize_data)


#### Conversion of the text into word embeddings using the bert model and also splitting the data into training and validation set 

In [23]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import numpy as np
import pandas as pd

os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub_cache"

bert_preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
    name="bert_preprocessor"
)

bert_model = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
    name="bert_encoder"
)

def bert_vectorization_batch(texts):
    texts_tensor = tf.convert_to_tensor(texts, dtype=tf.string)
    preprocessed = bert_preprocessor(texts_tensor)
    outputs = bert_model(preprocessed)
    return outputs["pooled_output"]

texts = strat_train_text["Tokens"].astype(str).tolist()
batch_size = 32
all_embeddings = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    try:
        batch_embeddings = bert_vectorization_batch(batch_texts)
        all_embeddings.append(batch_embeddings.numpy())
    except Exception as e:
        print(f"❌ Error in batch {i}-{i+batch_size}: {e}")

all_embeddings_np = np.vstack(all_embeddings)
strat_train_text["BERT_Embedding"] = list(all_embeddings_np)

strat_train_text_val = strat_train_text.iloc[10000:10712]
strat_train_category_val = strat_train_category.iloc[10000:10712]
strat_train_text_ = strat_train_text.iloc[:10000]
strat_train_category_ = strat_train_category.iloc[:10000]


2025-05-28 21:36:36.064563: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-05-28 21:36:36.064676: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-05-28 21:36:36.064975: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
I0000 00:00:1748448396.065256 3413407 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1748448396.065922 3413407 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-05-28 21:36:36.671391: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


#### This code deals with imbalance nature of the data set i found out the class weights so that the model than give more weight for the class with less number 


In [24]:
from sklearn.utils.class_weight import compute_class_weight 

class_weight = compute_class_weight(class_weight='balanced',classes = np.unique(data_set["Category"]),y=data_set["Category"])
class_weight_dict = dict(enumerate(class_weight))
print("Class weights", class_weight_dict)

Class weights {0: np.float64(0.8896345514950166), 1: np.float64(1.0700071925197794), 2: np.float64(1.0627033891578697), 3: np.float64(0.9730377906976744), 4: np.float64(0.905151433207139), 5: np.float64(0.9378677500700476), 6: np.float64(0.9947990192436288), 7: np.float64(0.9158002735978112), 8: np.float64(1.5338526749914079), 9: np.float64(0.9916308695008147), 10: np.float64(6.6249381494309745), 11: np.float64(0.9025278058645096), 12: np.float64(0.9158002735978112), 13: np.float64(0.8554178379759775), 14: np.float64(0.8461198179979778), 15: np.float64(1.0413782375359726), 16: np.float64(1.211564564292824), 17: np.float64(1.2068685776095187), 18: np.float64(1.0774120865856602), 19: np.float64(0.8697544497856308), 20: np.float64(0.9464197356329964), 21: np.float64(1.0590887517797818), 22: np.float64(0.7594441293250142), 23: np.float64(0.8108648255813954), 24: np.float64(0.918501749331138), 25: np.float64(1.922049956933678), 26: np.float64(0.9378677500700476), 27: np.float64(0.8649224806

#### Data type checking due to some dtype error showed by ig 

In [25]:
isinstance(strat_train_category_, pd.DataFrame)
strat_train_category_.head()
strat_train_category_val.info()


<class 'pandas.core.series.Series'>
Index: 711 entries, 2455 to 11463
Series name: Category
Non-Null Count  Dtype 
--------------  ----- 
711 non-null    object
dtypes: object(1)
memory usage: 11.1+ KB


#### Preparing the labels for the training of model encoding them into vectors 

In [26]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(strat_train_category_)
y_val_int = label_encoder.transform(strat_train_category_val)

y_train_cat = to_categorical(y_train_int)
y_val_cat = to_categorical(y_val_int)

X_train = np.stack(strat_train_text_["BERT_Embedding"].values)
X_val = np.stack(strat_train_text_val["BERT_Embedding"].values)

In [27]:
strat_train_text_["BERT_Embedding"].apply(lambda x: np.shape(x))
y_train_cat[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0.])

#### Building of neural network for the model and compiling it and training it with the taining data set with the respective loss and optimizer

In [None]:

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

metrics = [tf.keras.metrics.Accuracy(),
           tf.keras.metrics.Precision(),
           tf.keras.metrics.Recall(),
           tf.keras.metrics.F1Score()]

nn_model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=[512,]),
                                tf.keras.layers.Dense(100,activation="relu"),
                                tf.keras.layers.Dense(100,activation="relu"),
                                tf.keras.layers.Dense(43,activation="softmax")])

nn_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=optimizer, metrics=metrics)
nn_model.fit(X_train,y_train_cat,
             epochs=30,batch_size=40,validation_data=(X_val,y_val_cat),
             class_weight=class_weight_dict)

In [35]:
strat_test_category = strat_test_data["Category"].copy()
strat_test_text = strat_test_data.drop(["Category"],axis=1)

nn_model.evaluate(strat_test_text)  

nn_model.predict(X_test)


