In [11]:
import tensorflow as tf 

# Ask TensorFlow which GPU devices it can see, then report the list so it is
# clear up front whether later cells have an accelerator available.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU Available:", gpu_devices)

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import numpy as np 
import pandas as pd 

def load_data_set(csv_path="/Users/pmanthan/Desktop/ML Practice /train.csv"):
    """Read the data set from a CSV file into a DataFrame.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. The default is the original hard-coded
        location, so existing zero-argument calls keep working; pass another
        path to run the notebook on a different machine.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(csv_path)
    
# Load the full data set once; later cells read it through `data_set`.
data_set = load_data_set()
# Absolute number of rows per category.
data_set_valuecount = data_set['Category'].value_counts()
# NOTE: a notebook only renders the LAST expression of a cell, so this and the
# next .head() call produce no visible output where they stand.
data_set_valuecount.head()

data_set.head()
# Rendered cell output: per-category row count divided by the total row count.
data_set["Category"].value_counts()/len(data_set)

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit
# Stratified 80/20 split keyed on "Category"; random_state fixes the shuffle
# so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so there is exactly one (train, test) pair to take.
train_idx, test_idx = next(split.split(data_set, data_set["Category"]))

# split() returns POSITIONAL row numbers, so select with .iloc. The previous
# .loc lookup only worked because the frame happens to carry a default
# 0..n-1 index.
strat_train_data = data_set.iloc[train_idx]
strat_test_data = data_set.iloc[test_idx]

# Category proportions inside the training part. Divide by the size of the
# training part (not the whole data set) so the shares sum to 1.
strat_train_data["Category"].value_counts() / len(strat_train_data)
strat_train_data.head()

Unnamed: 0,Category,Text
12964,SQL Developer,jessica claire 100 montgomery st 10th floor 55...
2256,Public Relations,robert smith public relations specialist perso...
6471,Architecture,jessica claire 100 montgomery st 10th floor 55...
10408,Human Resources,jessica claire montgomery street san francisco...
4632,Food and Beverages,director food beverage robert smith phone 123 ...


In [14]:
# Labels: an independent copy of the target column.
strat_train_category = strat_train_data["Category"].copy()

# Features: every column except the target (drop returns a new frame).
strat_train_text = strat_train_data.drop(columns=["Category"])

# Case-fold the resume text, then show the column as the cell output.
lowercased_text = strat_train_text["Text"].str.lower()
strat_train_text["Text"] = lowercased_text
strat_train_text["Text"]

12964    jessica claire 100 montgomery st 10th floor 55...
2256     robert smith public relations specialist perso...
6471     jessica claire 100 montgomery st 10th floor 55...
10408    jessica claire montgomery street san francisco...
4632     director food beverage robert smith phone 123 ...
                               ...                        
12495    jessica claire resumesampleexamplecom 555 4321...
374      alejandra arts alejandraartsgmailcom 563123456...
3971     robert smith creative designer phone 123 456 7...
6366     jessica claire resumesampleexamplecom 555 4321...
11463    jessica claire montgomery street san francisco...
Name: Text, Length: 10711, dtype: object

In [15]:
import re 
# Matches any character that is not a word character, not whitespace and not
# "$" (inside a character class "$" is a literal dollar sign, not an anchor).
punctuation_pattern = r'[^\w\s$]'
text_without_punctuation = strat_train_text["Text"].str.replace(
    punctuation_pattern, '', regex=True
)
strat_train_text["Text"] = text_without_punctuation

import spacy

# Load the spaCy 'en_core_web_sm' pipeline once at module level; the helper
# functions that call `nlp(text)` read this global.
nlp = spacy.load('en_core_web_sm')

def tokenize_data(text):
    """Split ``text`` into spaCy tokens and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to tokenize. Anything that is not a string (for example a
        missing value read from the CSV as ``None``/NaN) is treated as empty.

    Returns
    -------
    list of str
        ``token.text`` for every token spaCy does not flag as a stop word;
        an empty list when ``text`` is not a string.
    """
    # Guard first: passing a non-string cell to the pipeline would abort the
    # whole .apply() over the column.
    if not isinstance(text, str):
        return []
    return [token.text for token in nlp(text) if not token.is_stop]

# Tokenize every resume and keep the token lists next to the cleaned text.
token_lists = strat_train_text["Text"].apply(tokenize_data)
strat_train_text["Tokens"] = token_lists

strat_train_text.head()


Unnamed: 0,Text,Tokens
12964,jessica claire 100 montgomery st 10th floor 55...,"[jessica, claire, 100, montgomery, st, 10th, f..."
2256,robert smith public relations specialist perso...,"[robert, smith, public, relations, specialist,..."
6471,jessica claire 100 montgomery st 10th floor 55...,"[jessica, claire, 100, montgomery, st, 10th, f..."
10408,jessica claire montgomery street san francisco...,"[jessica, claire, montgomery, street, san, fra..."
4632,director food beverage robert smith phone 123 ...,"[director, food, beverage, robert, smith, phon..."


In [16]:
def lemmatize_data(text, remove_stop_words=False):
    """Return the lemma of every spaCy token in ``text``.

    Parameters
    ----------
    text : str
        Raw text to lemmatize. Non-string input (e.g. ``None``/NaN from a
        missing cell) is treated as empty.
    remove_stop_words : bool, optional
        When True, tokens spaCy flags as stop words are left out. Defaults to
        False, which keeps every token exactly as before.

    Returns
    -------
    list of str
        ``token.lemma_`` for each retained token; an empty list when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return []
    return [
        token.lemma_
        for token in nlp(text)
        if not (remove_stop_words and token.is_stop)
    ]


# This assignment replaces the "Tokens" column wholesale, starting again from
# the raw "Text". Ask for stop-word removal here so the filtering done when
# the column was first built is not silently lost by the overwrite.
strat_train_text["Tokens"] = strat_train_text["Text"].apply(
    lambda raw_text: lemmatize_data(raw_text, remove_stop_words=True)
)


In [None]:
import tensorflow_hub as hub
import tensorflow_text  
import tensorflow as tf
import numpy as np

# TF-Hub handles for the text preprocessor and the encoder it feeds.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"

# Wrap both handles as Keras layers; the names are kept as-is.
bert_preprocessor = hub.KerasLayer(BERT_PREPROCESS_URL, name="bert_preprocessor")
bert_model = hub.KerasLayer(BERT_ENCODER_URL, name="bert_encoder")

def bert_vectorization_batch(texts):
    """Embed a batch of strings with the module-level BERT layers.

    ``texts`` is converted to a string tensor, run through
    ``bert_preprocessor`` and then ``bert_model``; the ``"pooled_output"``
    entry of the encoder result is returned.
    """
    string_batch = tf.convert_to_tensor(texts, dtype=tf.string)
    encoder_outputs = bert_model(bert_preprocessor(string_batch))
    return encoder_outputs["pooled_output"]

# Re-join each token list into one space-separated string. Casting the lists
# with astype(str) would hand BERT the Python list repr ("['a', 'b', ...]"),
# i.e. text polluted with brackets, quotes and commas.
texts = strat_train_text["Tokens"].apply(" ".join).tolist()

batch_size = 32
all_embeddings = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]

    try:
        batch_embeddings = bert_vectorization_batch(batch_texts)
    except Exception as e:
        # A skipped batch would leave fewer embedding rows than data rows and
        # make the column assignment below fail with an unrelated-looking
        # length error, so stop here with the batch that actually failed.
        raise RuntimeError(f"❌ Error in batch {i}-{i+batch_size}: {e}") from e
    all_embeddings.append(batch_embeddings.numpy())

all_embeddings_np = np.vstack(all_embeddings)

# One embedding per row is required for the positional assignment below.
assert len(all_embeddings_np) == len(strat_train_text), "embedding/row count mismatch"

strat_train_text["BERT_Embedding"] = list(all_embeddings_np)

# Hold out everything after the first `n_train` rows for validation. Using
# explicit positional .iloc slices with an open upper bound avoids the
# hard-coded end index and keeps features and labels aligned.
n_train = 10000
strat_train_text_val = strat_train_text.iloc[n_train:]
strat_train_category_val = strat_train_category.iloc[n_train:]
strat_train_text_, strat_train_category_ = (
    strat_train_text.iloc[:n_train],
    strat_train_category.iloc[:n_train],
)

In [None]:
from sklearn.utils.class_weight import compute_class_weight 

# Balanced class weights computed from the TRAINING labels only. Weighting on
# the full data set would use the held-out rows too, and its class list can
# differ from the one the model is trained on, shifting every index.
train_labels = strat_train_category_
train_classes = np.unique(train_labels)  # sorted unique labels
class_weight = compute_class_weight(
    class_weight='balanced', classes=train_classes, y=train_labels
)

# Keras expects {class_index: weight}; the index is the position of the label
# in the sorted unique training labels.
class_weight_dict = dict(enumerate(class_weight))
print("Class weights", class_weight_dict)

In [None]:
# Quick sanity checks on the label objects. Only .info() produces visible
# output: it prints its summary, whereas the two expressions before it are not
# the last expression of the cell and are therefore not rendered.
isinstance(strat_train_category_, pd.DataFrame)
strat_train_category_.head()
strat_train_category_val.info()


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Map category names to integer ids; fit on the training labels and reuse the
# same mapping for the validation labels.
label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(strat_train_category_)
y_val_int = label_encoder.transform(strat_train_category_val)

# Fix the one-hot width explicitly. Without num_classes, to_categorical sizes
# each array from its own largest id, so train and validation targets can end
# up with different numbers of columns.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train_int, num_classes=num_classes)
y_val_cat = to_categorical(y_val_int, num_classes=num_classes)

# Stack the per-row embedding vectors into 2-D feature matrices.
X_train = np.stack(strat_train_text_["BERT_Embedding"].values)
X_val = np.stack(strat_train_text_val["BERT_Embedding"].values)

In [None]:
# Shape of each stored embedding, row by row (not rendered: it is not the last
# expression of the cell).
strat_train_text_["BERT_Embedding"].apply(lambda x: np.shape(x))
# Rendered cell output: the one-hot target row of the first training sample.
y_train_cat[0]

In [None]:
from focal_loss import CategoricalFocalLoss

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Targets are one-hot and predictions are softmax probabilities, so accuracy
# must compare argmax positions (CategoricalAccuracy). The plain Accuracy
# metric tests element-wise equality of y_true and y_pred, which is
# meaningless for probabilities. The metric keeps the name "accuracy" so the
# training-history keys do not change.
# F1 is macro-averaged to log one scalar instead of one value per class.
metrics = [tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
           tf.keras.metrics.Precision(),
           tf.keras.metrics.Recall(),
           tf.keras.metrics.F1Score(average="macro")]

# Derive layer sizes from the data instead of hard-coding 512 / 43, so the
# network always matches the embedding width and the one-hot target width.
n_features = X_train.shape[1]
n_classes = y_train_cat.shape[1]

nn_model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=[n_features,]),
                                tf.keras.layers.Dense(100, activation="relu"),
                                tf.keras.layers.Dense(100, activation="relu"),
                                tf.keras.layers.Dense(n_classes, activation="softmax")])

# NOTE(review): confirm that the installed `focal_loss` module really exports
# CategoricalFocalLoss and that it accepts one-hot targets.
nn_model.compile(loss=CategoricalFocalLoss(alpha=1.0, gamma=2.0), optimizer=optimizer, metrics=metrics)
nn_model.fit(X_train, y_train_cat,
             epochs=30, batch_size=40, validation_data=(X_val, y_val_cat),
             class_weight=class_weight_dict)

In [None]:
# F1Score is a stateful metric object, not a function of a model: its first
# positional argument is `average`, so passing the model never computes a
# score. Feed it targets and predictions, then read the result.
val_f1 = tf.keras.metrics.F1Score(average="macro")
val_f1.update_state(y_val_cat, nn_model.predict(X_val))
print("Validation macro F1:", float(val_f1.result()))