In [1]:
import pandas as pd 
import numpy as np
from tqdm.auto import tqdm
import torch 
import torch.nn as nn
import torch.optim as optim 
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer , DistilBertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from transformers import BertModel, BertTokenizer
import torch.nn as nn 
from torch.utils.data  import Dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [2]:
class SubjectDataset(Dataset):
    """Torch dataset that tokenizes ``title + " " + text`` rows of a DataFrame.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D tensors
    of length ``maxlen``) and ``labels`` (a scalar ``torch.long`` tensor read
    from the ``label`` column).

    Args:
        df: DataFrame with ``title``, ``text`` and ``label`` columns.
        tokenizer: Optional callable tokenizer to reuse. When omitted, the
            ``bert-base-uncased`` ``BertTokenizer`` is loaded, as before.
        maxlen: Sequence length every sample is padded/truncated to.
    """

    def __init__(self, df, tokenizer=None, maxlen=256):
        self.df = df
        self.maxlen = maxlen
        # Loading the tokenizer is comparatively slow; allow callers to share
        # one instance between several datasets instead of reloading it.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing cells to ``""``.

        ``str(float("nan"))`` is the literal ``"nan"`` and ``str(None)`` is
        ``"None"``; without this guard a missing title or body would be fed
        to the model as if it were a real word.
        """
        return "" if pd.isna(value) else str(value)

    def __getitem__(self, index):
        title = self._as_text(self.df['title'].iloc[index])
        content = self._as_text(self.df['text'].iloc[index])
        # Join only the parts that are present so a missing field does not
        # leave a stray separator behind.
        sample = " ".join(part for part in (title, content) if part)

        encodings = self.tokenizer(
            sample,
            add_special_tokens=True,
            max_length=self.maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        label = torch.tensor(self.df['label'].iloc[index], dtype=torch.long)
        return {
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten it so the DataLoader can stack samples itself.
            'input_ids': encodings['input_ids'].flatten(),
            'attention_mask': encodings['attention_mask'].flatten(),
            'labels': label,
        }

In [3]:
def get_bert_embeddings(dataset, bert_model, device, batch_size=32):
    """Run ``bert_model`` over ``dataset`` and collect first-token embeddings.

    Args:
        dataset: Map-style dataset whose items are dicts holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        bert_model: Module called with ``input_ids`` / ``attention_mask``
            keyword arguments whose output exposes ``last_hidden_state``.
        device: Torch device the model and each batch are moved to.
        batch_size: Number of samples per forward pass (default 32, the
            value that used to be hard-coded).

    Returns:
        Tuple ``(embeddings, labels)`` of numpy arrays. Rows follow the
        dataset order because the loader is not shuffled.
    """
    embeddings = []  # one first-token vector per sample
    labels = []      # one label per sample, same order as `embeddings`
    bert_model = bert_model.to(device)
    bert_model.eval()

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        for batch in tqdm(loader, desc="Getting BERT embeddings"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            # [:, 0, :] keeps position 0 of every sequence (the [CLS] slot
            # when special tokens are added by the tokenizer).
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.extend(batch_embeddings)
            labels.extend(batch['labels'].cpu().numpy())

    # Convert the per-sample lists into dense arrays.
    return np.array(embeddings), np.array(labels)

In [4]:
def main(data_path="data.csv", model_path="news_classifier_rf.joblib"):
    """Embed the news data with BERT, tune a random forest on it, and save it.

    Steps: read the CSV, make an 80/20 train/test split, turn each split into
    first-token BERT embeddings, grid-search a ``RandomForestClassifier`` with
    5-fold cross-validation on the training embeddings, report test-set
    metrics, and dump the best estimator with joblib.

    Args:
        data_path: CSV file to load (default ``"data.csv"``).
        model_path: Where the fitted forest is written
            (default ``"news_classifier_rf.joblib"``).

    Returns:
        Tuple ``(bert_model, best_rf)``: the BERT encoder used for the
        embeddings and the best estimator found by the grid search.
    """
    df = pd.read_csv(data_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('im using ', device)

    df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)
    # Separate names for the datasets so the DataFrames are not shadowed by
    # objects of a different type.
    train_dataset = SubjectDataset(df_train)
    test_dataset = SubjectDataset(df_test)

    bert_model = BertModel.from_pretrained('bert-base-uncased')

    # Get BERT embeddings
    print("getting train enbaging bert")
    X_train, y_train = get_bert_embeddings(train_dataset, bert_model, device)
    print("getting test enbaging bert")
    X_test, y_test = get_bert_embeddings(test_dataset, bert_model, device)

    # Train Random Forest
    print("Training Random Forest")
    # 'auto' is intentionally absent from max_features: for classifiers it was
    # only an alias of 'sqrt', and recent scikit-learn releases reject it, so
    # every candidate using it would fail or duplicate the 'sqrt' candidates.
    # The grid is still large: 2160 candidates x 5 folds = 10800 fits.
    param_grid = {
        'bootstrap': [True, False],
        'max_depth': [5, 10, 20, 30, 40, 50],
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000],
        'criterion': ['gini', 'entropy'],
    }

    # Initialize base model
    rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=rf_base,
        param_grid=param_grid,
        cv=5,                 # 5-fold cross-validation
        n_jobs=-1,            # Use all CPU cores
        verbose=2,            # Detailed output
        scoring='accuracy',   # Metric to optimize
    )
    print("Starting Grid Search...")
    grid_search.fit(X_train, y_train)

    # Print best parameters and score
    print("\nBest parameters found:")
    print(grid_search.best_params_)
    print(f"\nBest cross-validation score: {grid_search.best_score_:.4f}")

    # Evaluate on test set
    best_rf = grid_search.best_estimator_
    y_pred = best_rf.predict(X_test)
    print("\nTest Set Performance:")
    print(classification_report(y_test, y_pred))

    # Persist the tuned classifier so it can be reloaded without retraining.
    joblib.dump(best_rf, model_path)
    return bert_model, best_rf

In [5]:
# Run the whole pipeline and keep the two objects main() returns (the BERT
# encoder and the fitted random forest) in module-level names for later use.
if __name__ == "__main__":
    bert_model, rf_model = main()

im using  cpu
getting train enbaging bert


Getting BERT embeddings:   0%|          | 0/1123 [00:00<?, ?it/s]

KeyboardInterrupt: 