In [1]:
import tensorflow as tf 

# Ask TensorFlow which physical GPU devices it can see and report the list.
visible_gpus = tf.config.list_physical_devices('GPU')
print("GPU Available:", visible_gpus)

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import numpy as np 
import pandas as pd 

def load_data_set(path="train.csv"):
    """Load the training data from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``"train.csv"`` so existing
        zero-argument calls keep reading the same file as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file, as returned by ``pd.read_csv``.
    """
    return pd.read_csv(path)
    
# Read the training CSV; later cells work from this frame.
data_set = load_data_set()

# Absolute number of rows per label in the "Category" column.
category_column = data_set['Category']
data_set_valuecount = category_column.value_counts()
data_set_valuecount.head()

data_set.head()
# Per-label share of all rows. As the final expression of the cell, this is
# the value the notebook displays.
data_set_valuecount / len(data_set)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified shuffle split (test_size=0.2, fixed seed) keyed on "Category".
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_idx, test_idx = next(split.split(data_set, data_set["Category"]))
# split() yields positional indices, so select rows by position (iloc) rather
# than by label (loc); the two only coincide on a default RangeIndex.
strat_train_data = data_set.iloc[train_idx]
strat_test_data = data_set.iloc[test_idx]

# Class shares within the training split: divide by the training-set size so
# the proportions sum to 1 and can be compared with the full-data distribution.
strat_train_data["Category"].value_counts() / len(strat_train_data)
strat_train_data.head()

In [None]:
# Keep an independent copy of the labels of the training split.
strat_train_category = strat_train_data["Category"].copy()

# Features: everything except the label column, with "Text" lower-cased.
strat_train_text = (
    strat_train_data
    .drop(columns=["Category"])
    .assign(Text=lambda frame: frame["Text"].str.lower())
)
strat_train_text["Text"]

In [None]:
import re 
# Matches any single character that is not a word character, whitespace,
# or a literal "$" (inside a character class "$" is not an anchor).
punctuation_pattern = r'[^\w\s$]'
cleaned_text = strat_train_text["Text"].str.replace(punctuation_pattern, '', regex=True)
strat_train_text["Text"] = cleaned_text

import spacy

# spaCy pipeline shared by the tokenising and lemmatising helpers.
nlp = spacy.load('en_core_web_sm')

def tokenize_data(text):
    """Return the token texts of ``text`` with stop words removed.

    ``text`` is run through the module-level ``nlp`` pipeline; every token
    whose ``is_stop`` flag is false contributes its ``text`` to the result,
    in document order.
    """
    kept_tokens = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept_tokens.append(tok.text)
    return kept_tokens

# Tokenise every cleaned document and store the per-row token lists.
tokens_without_stopwords = strat_train_text["Text"].apply(tokenize_data)
strat_train_text["Tokens"] = tokens_without_stopwords

strat_train_text.head()


In [None]:
def lemmatize_data(text):
    """Return the lemma of every token in ``text``, in document order.

    ``text`` is run through the module-level ``nlp`` pipeline and each
    token's ``lemma_`` is collected; no tokens are filtered out.
    """
    lemmas = []
    for tok in nlp(text):
        lemmas.append(tok.lemma_)
    return lemmas

# NOTE(review): this recomputes "Tokens" from the "Text" column, replacing the
# stop-word-filtered lists stored there by the tokenize_data step, so stop
# words end up back in the column. Confirm that is intended.
lemmatized_text = strat_train_text["Text"].apply(lemmatize_data)
strat_train_text["Tokens"] = lemmatized_text


In [None]:
import tensorflow_hub as hub
import tensorflow_text  
import tensorflow as tf
import numpy as np

# TF-Hub handles for the text preprocessor and the encoder it feeds.
preprocessor_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_handle = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"

# Wrap both hub modules as Keras layers so they can be called on tensors.
bert_preprocessor = hub.KerasLayer(preprocessor_handle, name="bert_preprocessor")
bert_model = hub.KerasLayer(encoder_handle, name="bert_encoder")

def bert_vectorization_batch(texts):
    """Embed a batch of strings and return the encoder's ``pooled_output``.

    ``texts`` is converted to a ``tf.string`` tensor, passed through the
    module-level ``bert_preprocessor`` and then through ``bert_model``; the
    ``"pooled_output"`` entry of the encoder's result is returned.
    """
    string_batch = tf.convert_to_tensor(texts, dtype=tf.string)
    encoder_outputs = bert_model(bert_preprocessor(string_batch))
    return encoder_outputs["pooled_output"]

# "Tokens" holds a list of strings per row. Join each list into one
# space-separated string; casting the list with astype(str) would instead hand
# BERT the Python list repr, brackets, quotes and commas included.
# Rows that already are plain strings are passed through unchanged.
texts = [
    tokens if isinstance(tokens, str) else " ".join(map(str, tokens))
    for tokens in strat_train_text["Tokens"]
]

batch_size = 32
all_embeddings = []

# Embed the documents in fixed-size batches.
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]

    try:
        batch_embeddings = bert_vectorization_batch(batch_texts)
        all_embeddings.append(batch_embeddings.numpy())
    except Exception as e:
        print(f"❌ Error in batch {i}-{i+batch_size}: {e}")
        # Skipping a batch would leave fewer embedding rows than DataFrame
        # rows, and the column assignment below would then fail with an
        # unrelated length-mismatch error. Stop here, at the real cause.
        raise
# Stack the per-batch arrays into one array with a row per document.
all_embeddings_np = np.vstack(all_embeddings)

# Store one embedding vector per row.
strat_train_text["BERT_Embedding"] = list(all_embeddings_np)

