In [1]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score


In [2]:

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
print(device)

cuda


In [3]:
# Function to generate BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

# Load dataset
def load_data(file_path):
    df = pd.read_csv(file_path)
    texts = df['text'].tolist()
    labels = df['category'].tolist()

    # Encode categorical labels
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)

    return texts, labels, label_encoder

In [4]:
# Prepare data
file_path = '/content/output_chunk_4.csv'  # Replace with your dataset file
texts, labels, label_encoder = load_data(file_path)

In [5]:

# Generate embeddings
print("Generating BERT embeddings...")
embeddings = np.array([get_bert_embedding(text) for text in texts])

Generating BERT embeddings...


In [7]:

# Split dataset
# Split dataset into train (80%), val (18%), and test (2%)
X_temp, X_test, y_temp, y_test = train_test_split(embeddings, labels, test_size=0.02, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.184, random_state=42)

In [9]:
# Train XGBoost model
print("Training XGBoost model...")
clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss')
clf.fit(X_train, y_train)


Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.



In [12]:
# Evaluate model
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print(report)

Validation Accuracy: 0.5396458814472671
               precision    recall  f1-score   support

         arts       0.53      0.46      0.49        85
        crime       0.59      0.62      0.60        78
     disaster       0.62      0.67      0.64        75
      economy       0.48      0.49      0.49        79
    education       0.70      0.54      0.61        79
environmental       0.65      0.65      0.65        80
       health       0.60      0.66      0.63        67
humanInterest       0.38      0.32      0.34        76
       labour       0.47      0.47      0.47        76
    lifestyle       0.48      0.60      0.54        73
        other       0.06      0.06      0.06        54
     politics       0.51      0.55      0.53        65
     religion       0.35      0.47      0.40        60
      science       0.55      0.78      0.65        59
       social       0.65      0.57      0.61        72
        sport       0.66      0.81      0.73        59
       unrest       0.61

In [13]:
# Save the model and label encoder
clf.save_model('./xgboost_text_classification.json')
pd.Series(label_encoder.classes_).to_csv('./xgboost_text_classification_label_classes.csv', index=False)

print("XGBoost model training complete and saved.")

XGBoost model training complete and saved.


## LLM based embedding

In [16]:
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load free LLM-based embedding model (Hugging Face Sentence Transformers)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Load dataset
def load_data(file_path):
    df = pd.read_csv(file_path)
    texts = df['text'].tolist()
    labels = df['category'].tolist()

    # Encode categorical labels
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)

    return texts, labels, label_encoder

In [17]:
# Prepare data
file_path = '/content/output_chunk_4.csv'  # Replace with your dataset file
texts, labels, label_encoder = load_data(file_path)

# Generate embeddings
print("Generating embeddings using SentenceTransformer...")
embeddings = np.array(model.encode(texts, show_progress_bar=True))

Generating embeddings using SentenceTransformer...


Batches:   0%|          | 0/225 [00:00<?, ?it/s]

In [18]:
# Split dataset into train (80%), val (18%), and test (2%)
X_temp, X_test, y_temp, y_test = train_test_split(embeddings, labels, test_size=0.02, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.184, random_state=42)

In [19]:
# Train XGBoost model
print("Training XGBoost model...")
clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss')
clf.fit(X_train, y_train)

Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.



In [20]:
# Evaluate model
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print(report)


Validation Accuracy: 0.6204772902232486
               precision    recall  f1-score   support

         arts       0.76      0.60      0.67        85
        crime       0.74      0.65      0.69        78
     disaster       0.65      0.69      0.67        75
      economy       0.68      0.61      0.64        79
    education       0.75      0.62      0.68        79
environmental       0.81      0.82      0.82        80
       health       0.60      0.63      0.61        67
humanInterest       0.54      0.41      0.47        76
       labour       0.50      0.47      0.49        76
    lifestyle       0.51      0.67      0.58        73
        other       0.10      0.17      0.13        54
     politics       0.61      0.66      0.63        65
     religion       0.44      0.45      0.45        60
      science       0.68      0.76      0.72        59
       social       0.74      0.64      0.69        72
        sport       0.71      0.93      0.81        59
       unrest       0.62

In [21]:

# Save the model and label encoder
clf.save_model('./xgboost_text_classification.json')
pd.Series(label_encoder.classes_).to_csv('./xgboost_text_classification_label_classes.csv', index=False)

print("XGBoost model training complete and saved.")

XGBoost model training complete and saved.


## Mpnet embddings

In [22]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [23]:
# Generate embeddings
print("Generating embeddings using SentenceTransformer (all-mpnet-base-v2)...")
embeddings = np.array(model.encode(texts, show_progress_bar=True))

# Split dataset into train (80%), val (18%), and test (2%)
X_temp, X_test, y_temp, y_test = train_test_split(embeddings, labels, test_size=0.02, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.184, random_state=42)




Generating embeddings using SentenceTransformer (all-mpnet-base-v2)...


Batches:   0%|          | 0/225 [00:00<?, ?it/s]

In [24]:
# Train XGBoost model
print("Training XGBoost model...")
clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss')
clf.fit(X_train, y_train)



Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.



In [25]:
# Evaluate model
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print(report)

# Save the model and label encoder
clf.save_model('./xgboost_text_classification.json')
pd.Series(label_encoder.classes_).to_csv('./xgboost_text_classification_label_classes.csv', index=False)

print("XGBoost model training complete and saved.")

Validation Accuracy: 0.6520400307929176
               precision    recall  f1-score   support

         arts       0.72      0.67      0.70        85
        crime       0.72      0.71      0.71        78
     disaster       0.73      0.72      0.72        75
      economy       0.65      0.61      0.63        79
    education       0.79      0.62      0.70        79
environmental       0.86      0.85      0.86        80
       health       0.62      0.70      0.66        67
humanInterest       0.55      0.45      0.49        76
       labour       0.64      0.61      0.62        76
    lifestyle       0.56      0.68      0.61        73
        other       0.05      0.06      0.05        54
     politics       0.62      0.65      0.63        65
     religion       0.51      0.52      0.51        60
      science       0.67      0.80      0.73        59
       social       0.74      0.71      0.72        72
        sport       0.67      0.85      0.75        59
       unrest       0.64

## LLAMA embeddinfgs

In [26]:
from transformers import AutoTokenizer, AutoModel

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Choose LLM model - Switch between Llama and Mistral
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"  # or "mistralai/Mistral-7B-Instruct-v0.1"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, device_map="auto")

# Function to generate embeddings
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-67b9405a-570b19d8375b2a764af05386;17d67591-9ebb-4811-8e71-78c952c22fb1)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
# Generate embeddings
print(f"Generating embeddings using {MODEL_NAME}...")
embeddings = np.array([get_embedding(text) for text in texts])



In [None]:
# Split dataset into train (80%), val (18%), and test (2%)
X_temp, X_test, y_temp, y_test = train_test_split(embeddings, labels, test_size=0.02, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.184, random_state=42)

# Train XGBoost model
print("Training XGBoost model...")
clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss')
clf.fit(X_train, y_train)




In [None]:
# Evaluate model
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print(report)

# Save the model and label encoder
clf.save_model('./xgboost_text_classification.json')
pd.Series(label_encoder.classes_).to_csv('./xgboost_text_classification_label_classes.csv', index=False)

print("XGBoost model training complete and saved.")